In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import hypertools as hyp 
from sklearn.linear_model import LogisticRegression # to apply the Logistic regression
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.model_selection import GridSearchCV# for tuning parameter
from sklearn.ensemble import RandomForestClassifier # for random forest classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm # for Support Vector Machine
from sklearn import metrics # for the check the error and accuracy of the model

%matplotlib inline

pd.set_option('display.max_columns', 100)

In [41]:
#df= pd.read_csv('./data/Breast Cancer Wisconsin.csv',  index_col= None, na_values='?')
df = pd.read_csv('./data/breast-cancer.data', index_col=None, na_values='?')
df.head(df.shape[0])

Unnamed: 0,Recorrencia,Range Idade,coisa,Range Cuzin,Range balls,tem ou num tem,cocozin,"""esquerda ou direita""","""baixo_cima""","""temounumtem2""",Unnamed: 10
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,
...,...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no,
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes,
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no,
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no,


# Data Preprocessing

In [None]:
df.shape

In [42]:
print(df.isnull().sum())
if(df.isnull().sum().sum() == df.shape[0]):
    print("No missing values")
else:
    print("Missing values found")

Recorrencia                 0
Range Idade                 0
coisa                       0
Range Cuzin                 0
Range balls                 0
tem ou num tem              8
cocozin                     0
 "esquerda ou direita"      0
 "baixo_cima"               1
 "temounumtem2"             0
Unnamed: 10               286
dtype: int64
Missing values found


In [None]:
df.drop('Unnamed: 32', axis=1 , inplace=True)
df.shape

In [None]:
df.drop('id', axis=1 , inplace=True)
df.shape

In [None]:
df["diagnosis"].value_counts()

# Data Visualization

**Syntax Pair Plot
sns.pairplot(data, hue=None, hue_order=None, palette=None, vars=None, x_vars=None, y_vars=None, kind='scatter', diag_kind='hist', markers=None, size=2.5, aspect=1, dropna=True, plot_kws=None, diag_kws=None, grid_kws=None)

**Syntax Pair Grid
sns.pairplot(data, hue=None, hue_order=None, palette=None, hue_kws=None, vars=None, x_vars=None, y_vars=None, diag_sharey=True, size=2.5, aspect=1, despine=True, dropna=True)

In [None]:
radius = df[['radius_mean','radius_se','radius_worst','diagnosis']]
sns.pairplot(radius, hue='diagnosis',palette="husl", markers=["o", "s"],size=4)

In [None]:
sns.pairplot(radius,kind="reg",size=4)

In [None]:
g = sns.PairGrid(radius,hue='diagnosis', palette="Set1",size=4)
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter, s = 3)

In [None]:
g = sns.PairGrid(radius, hue="diagnosis", palette="Set2",size=4,hue_kws={"marker": ["o", "s"]})
g = g.map(plt.scatter, linewidths=1, edgecolor="w", s=40)
g = g.add_legend()

In [None]:
texture = df[['texture_mean','texture_se','texture_worst','diagnosis']]
sns.pairplot(texture, hue='diagnosis', palette="Blues_d",size=4, kind="reg")

In [None]:
perimeter = df[['perimeter_mean','perimeter_se','perimeter_worst','diagnosis']]
sns.pairplot(perimeter, hue='diagnosis', size = 4, kind="reg")

In [None]:
area = df[['area_mean','area_se','area_worst','diagnosis']]
sns.pairplot(area, hue='diagnosis', size =4)

In [None]:
smoothness = df[['smoothness_mean','smoothness_se','smoothness_worst','diagnosis']]
sns.pairplot(smoothness, hue='diagnosis')

In [None]:
compactness = df[['compactness_mean','compactness_se','compactness_worst','diagnosis']]
sns.pairplot(compactness, hue='diagnosis')

In [None]:
concavity = df[['concavity_mean','concavity_se','concavity_worst','diagnosis']]
sns.pairplot(concavity, hue='diagnosis')

In [None]:
concave_points = df[['concave points_mean','concave points_se','concave points_worst','diagnosis']]
sns.pairplot(concave_points, hue='diagnosis')

In [None]:
symmetry = df[['symmetry_mean','symmetry_se','symmetry_worst','diagnosis']]
sns.pairplot(symmetry, hue='diagnosis')

In [None]:
fractal_dimension = df[['fractal_dimension_mean','fractal_dimension_se','fractal_dimension_worst','diagnosis']]
sns.pairplot(fractal_dimension, hue='diagnosis')

In [None]:
# Move the reponse variable "diagnosis" to the end of the dataframe
end = df['diagnosis']
df.drop(labels=['diagnosis'], axis=1,inplace = True)
df.insert(30, 'diagnosis', end)
df.head()

def categorical_to_numeric_diagnosis(x):
    if x=='M':
        return 1
    if x=='B':
        return 0

df['diagnosis']= df['diagnosis'].apply(categorical_to_numeric_diagnosis)
df["diagnosis"].value_counts()


colors = np.array('b g r c m y k'.split()) #Different colors for plotting

fig,axes = plt.subplots(nrows =15,ncols=2, sharey=True,figsize = (15,50))
plt.tight_layout()
row = 0
iteration = 0
for j in range(0,len(df.columns[:-1])):
    iteration+=1
    if(j%2==0):
        k = 0
    else:
        k = 1
    sns.distplot(df[df.columns[j]],kde=False,hist_kws=dict(edgecolor="w", linewidth=2),color = np.random.choice(colors) ,ax=axes[row][k])
    if(iteration%2==0):
        row+=1
        plt.ylim(0,200)

In [None]:
#The features looks like they are broken into three main categories, value means, standard deivations, and 'worst'. We can check to see if there are any correlations between these subsets of features
features_mean = df.columns[1:9]
features_se = df.columns[9:19]
features_worst = df.columns[1]

In [None]:
# Separate out malignant and benign data for graphing
malignant = df[df['diagnosis'] ==1]
benign = df[df['diagnosis'] ==0]

In [None]:

# Column names to observe in following graphs - mean values only
observe = list(df.columns[1:11]) + ['area_worst'] + ['perimeter_worst']
observables = df.loc[:,observe]

In [None]:
plt.rcParams.update({'font.size': 8})
plot, graphs = plt.subplots(nrows=6, ncols=2, figsize=(12,14))
graphs = graphs.flatten()
for idx, graph in enumerate(graphs):
    graph.figure
    
    binwidth= (max(df[observe[idx]]) - min(df[observe[idx]]))/50
    bins = np.arange(min(df[observe[idx]]), max(df[observe[idx]]) + binwidth, binwidth)
    graph.hist([malignant[observe[idx]],benign[observe[idx]]], bins=bins, alpha=0.6, normed=True, label=['Malignant','Benign'], color=['red','blue'])
    graph.legend(loc='upper right')
    graph.set_title(observe[idx])
plt.tight_layout()

In [None]:
color_wheel = {0: "blue", 1: "red"}
colors = df["diagnosis"].map(lambda x: color_wheel.get(x))
pd.plotting.scatter_matrix(observables, c=colors, alpha = 0.5, figsize = (17, 17), diagonal = 'kde');