In [1]:
import pandas as pd

<h1>PreProcessing</h1>

In [2]:
df=pd.read_csv('mammographic_masses.data.txt')

In [3]:
df = pd.read_csv('mammographic_masses.data.txt', na_values=['?'], names = ['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity'])
df.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [4]:
df.isna().sum()

BI-RADS      2
age          5
shape       31
margin      48
density     76
severity     0
dtype: int64

<h2>Data Cleaning</h2>

In [5]:
col=list(df.columns)
col

['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity']

In [6]:
for i in range(len(col)):
    print(f"Median for {i}:",df[col[i]].median())

Median for 0: 4.0
Median for 1: 57.0
Median for 2: 3.0
Median for 3: 3.0
Median for 4: 3.0
Median for 5: 0.0


<h2>Filling with medians</h2>

In [7]:
for i in range(len(col)):
    df[col[i]].fillna(df[col[i]].median(),inplace=True)

In [8]:
df.isna().sum()

BI-RADS     0
age         0
shape       0
margin      0
density     0
severity    0
dtype: int64

In [9]:
df.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,3.0,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,3.0,1


In [10]:
df.drop('BI-RADS',axis=1,inplace=True)

<h3>Splitting x and y </h3>

In [45]:
x=df.iloc[:,:-1].values
y=df.loc[:,'severity'].values

<h2>Normalization</h2>

In [46]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.79698441,  0.22038395,  1.43676223,  0.22480407],
       [-0.86561042, -1.41505218, -1.18321596,  0.22480407],
       [ 0.17351135,  1.03810202,  1.43676223,  0.22480407],
       ...,
       [ 0.58916006,  1.03810202,  1.43676223,  0.22480407],
       [ 0.72770962,  1.03810202,  1.43676223,  0.22480407],
       [ 0.45061049,  0.22038395,  0.12677314,  0.22480407]])

<h3>Visualizing</h3>

In [47]:
from Detailed_Details import Detailed_Details

In [14]:
df.head()

Unnamed: 0,age,shape,margin,density,severity
0,67.0,3.0,5.0,3.0,1
1,43.0,1.0,1.0,3.0,1
2,58.0,4.0,5.0,3.0,1
3,28.0,1.0,1.0,3.0,0
4,74.0,1.0,5.0,3.0,1


In [15]:
for i in range(1,len(col)-1):
    Detailed_Details(df,col[i],'severity',7)

0,1,2,3,4,5,6
age,Total No. (age),Percentage (age),Total Outcome (0),Percentage Outcome (0),Total Outcome (1),Percentage Outcome (1)
Greater Than Mean,510,53.07,192,37.21 %,318,71.46 %
Less Than Mean,451,46.93,324,62.79 %,127,28.54 %


0,1,2,3,4,5,6
shape,Total No. (shape),Percentage (shape),Total severity (0),Percentage severity (0),Total severity (1),Percentage severity (1)
1.0,224,23.31%,186,83.04%,38,16.96%
2.0,211,21.96%,176,83.41%,35,16.59%
3.0,126,13.11%,69,54.76%,57,45.24%
4.0,400,41.62%,85,21.25%,315,78.75%


0,1,2,3,4,5,6
margin,Total No. (margin),Percentage (margin),Total severity (0),Percentage severity (0),Total severity (1),Percentage severity (1)
1.0,357,37.15%,316,88.52%,41,11.48%
2.0,24,2.5%,9,37.5%,15,62.5%
3.0,164,17.07%,80,48.78%,84,51.22%
4.0,280,29.14%,89,31.79%,191,68.21%
5.0,136,14.15%,22,16.18%,114,83.82%


0,1,2,3,4,5,6
density,Total No. (density),Percentage (density),Total severity (0),Percentage severity (0),Total severity (1),Percentage severity (1)
1.0,16,1.66%,9,56.25%,7,43.75%
2.0,59,6.14%,41,69.49%,18,30.51%
3.0,874,90.95%,459,52.52%,415,47.48%
4.0,12,1.25%,7,58.33%,5,41.67%


In [16]:
df.corr()

Unnamed: 0,age,shape,margin,density,severity
age,1.0,0.360532,0.402995,0.021119,0.431329
shape,0.360532,1.0,0.718893,0.057495,0.552781
margin,0.402995,0.718893,1.0,0.094516,0.557867
density,0.021119,0.057495,0.094516,1.0,0.054681
severity,0.431329,0.552781,0.557867,0.054681,1.0


<h3>Train with GridSearch</h3>

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [49]:
params={
    'knn': {'n_neighbors':[3,5,7,9,11,13,15],
    'metric':['cosine','euclidean','manhattan'],
    'weights':['uniform','distance']},
    
    'svc': {
    'C':[0.1,1,10,100], 
    'gamma':[1,0.1,0.01,0.01], 
    'kernel':['rbf','linear']},
    
    'dtc':{
        'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
    },
    
    'nb':{
        'priors': [None],
    'var_smoothing': [0.00000001, 0.000000001, 0.00000001]
    },
    'rf':{
        'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
    },
    'lr':{
        'solver':[ 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
}

    

In [50]:
#Now Finally Implementing Machine Learning
svc=SVC()
knn=KNeighborsClassifier()
dtc=DecisionTreeClassifier()
nb=GaussianNB()
rf=RandomForestClassifier()
lr=LogisticRegression()

In [51]:
models={'svc':svc,'knn':knn,'dtc':dtc,'nb':nb,'rf':rf,'lr':lr}

In [52]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.2,random_state=42)

In [53]:
model_accuracy={}
score=0.0001
for model in models.keys():
        
    mod = GridSearchCV(
        models[model],
        params[model],
        verbose=0, #Progress bar showing
        cv=20, #cross validation
        n_jobs=-1, #cores to assign
    )
        
    gridsearch_result=mod.fit(x_train,y_train)
    predict=mod.predict(x_test)
    print(f"{model} : ",gridsearch_result.best_estimator_)
    print(f"{model}")
    print(confusion_matrix(predict,y_test))
        
    if(score < float(gridsearch_result.score(x_test,y_test))):
        score=gridsearch_result.score(x_test,y_test)
        gridsearch=gridsearch_result.fit(x_train,y_train)
    if model not in model_accuracy.keys():
        model_accuracy.update({model:gridsearch_result.score(x_test,y_test)})

svc :  SVC(C=100, gamma=0.01)
svc
[[83 15]
 [18 77]]
knn :  KNeighborsClassifier(metric='manhattan', n_neighbors=15)
knn
[[83 16]
 [18 76]]
dtc :  DecisionTreeClassifier(max_depth=2)
dtc
[[73 12]
 [28 80]]
nb :  GaussianNB(var_smoothing=1e-08)
nb
[[77 12]
 [24 80]]
rf :  RandomForestClassifier(max_depth=2)
rf
[[82 14]
 [19 78]]
lr :  LogisticRegression(solver='newton-cg')
lr
[[83 15]
 [18 77]]


In [54]:
model_accuracy

{'svc': 0.8290155440414507,
 'knn': 0.8238341968911918,
 'dtc': 0.7927461139896373,
 'nb': 0.8134715025906736,
 'rf': 0.8290155440414507,
 'lr': 0.8290155440414507}

In [55]:
gridsearch.best_estimator_

<h2>Droping less correlated columns</h2>

In [56]:
df.corr()

Unnamed: 0,age,shape,margin,density,severity
age,1.0,0.360532,0.402995,0.021119,0.431329
shape,0.360532,1.0,0.718893,0.057495,0.552781
margin,0.402995,0.718893,1.0,0.094516,0.557867
density,0.021119,0.057495,0.094516,1.0,0.054681
severity,0.431329,0.552781,0.557867,0.054681,1.0


In [57]:
df_dropped=df.drop('density',axis=1)

In [58]:
df_dropped.head()

Unnamed: 0,age,shape,margin,severity
0,67.0,3.0,5.0,1
1,43.0,1.0,1.0,1
2,58.0,4.0,5.0,1
3,28.0,1.0,1.0,0
4,74.0,1.0,5.0,1


In [41]:
df_dropped.corr()

Unnamed: 0,age,shape,margin,severity
age,1.0,0.360532,0.402995,0.431329
shape,0.360532,1.0,0.718893,0.552781
margin,0.402995,0.718893,1.0,0.557867
severity,0.431329,0.552781,0.557867,1.0


In [59]:
x=df_dropped.iloc[:,:-1].values
y=df_dropped.loc[:,'severity'].values

In [60]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.79698441,  0.22038395,  1.43676223],
       [-0.86561042, -1.41505218, -1.18321596],
       [ 0.17351135,  1.03810202,  1.43676223],
       ...,
       [ 0.58916006,  1.03810202,  1.43676223],
       [ 0.72770962,  1.03810202,  1.43676223],
       [ 0.45061049,  0.22038395,  0.12677314]])

In [61]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.2,random_state=42)

In [62]:
model_accuracy={}
score=0.0001
for model in models.keys():
        
    mod = GridSearchCV(
        models[model],
        params[model],
        verbose=0, #Progress bar showing
        cv=20, #cross validation
        n_jobs=-1, #cores to assign
    )
        
    gridsearch_result=mod.fit(x_train,y_train)
    predict=mod.predict(x_test)
    print(f"{model} : ",gridsearch_result.best_estimator_)
    print(f"{model}")
    print(confusion_matrix(predict,y_test))
        
    if(score < float(gridsearch_result.score(x_test,y_test))):
        score=gridsearch_result.score(x_test,y_test)
        gridsearch=gridsearch_result.fit(x_train,y_train)
    if model not in model_accuracy.keys():
        model_accuracy.update({model:gridsearch_result.score(x_test,y_test)})

svc :  SVC(C=0.1, gamma=1)
svc
[[84 14]
 [17 78]]
knn :  KNeighborsClassifier(metric='manhattan', n_neighbors=15)
knn
[[85 16]
 [16 76]]
dtc :  DecisionTreeClassifier(max_depth=2)
dtc
[[73 12]
 [28 80]]
nb :  GaussianNB(var_smoothing=1e-08)
nb
[[77 11]
 [24 81]]
rf :  RandomForestClassifier(max_depth=2)
rf
[[81 14]
 [20 78]]
lr :  LogisticRegression(solver='liblinear')
lr
[[83 15]
 [18 77]]


In [65]:
model_accuracy

{'svc': 0.8393782383419689,
 'knn': 0.8341968911917098,
 'dtc': 0.7927461139896373,
 'nb': 0.8186528497409327,
 'rf': 0.8238341968911918,
 'lr': 0.8290155440414507}

In [66]:
gridsearch.best_estimator_

<h2>Using Neural Networks</h2>

In [67]:
x=df.iloc[:,:-1].values
y=df.loc[:,'severity'].values

In [68]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.79698441,  0.22038395,  1.43676223,  0.22480407],
       [-0.86561042, -1.41505218, -1.18321596,  0.22480407],
       [ 0.17351135,  1.03810202,  1.43676223,  0.22480407],
       ...,
       [ 0.58916006,  1.03810202,  1.43676223,  0.22480407],
       [ 0.72770962,  1.03810202,  1.43676223,  0.22480407],
       [ 0.45061049,  0.22038395,  0.12677314,  0.22480407]])

In [69]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.2,random_state=42)

In [70]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [78]:
model = Sequential()
    #4 feature inputs going into an 6-unit layer (more does not seem to help - in fact you can go down to 4)
model.add(Dense(6, input_dim=4, kernel_initializer='normal', activation='relu'))
    # "Deep learning" turns out to be unnecessary - this additional hidden layer doesn't help either.
model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    # Output layer with a binary classification (benign or malignant)
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model; adam seemed to work best
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [80]:
model.fit(x_scaled,y,verbose=2,epochs=200)

Epoch 1/200
31/31 - 0s - loss: 0.4593 - accuracy: 0.7992 - 203ms/epoch - 7ms/step
Epoch 2/200
31/31 - 0s - loss: 0.4589 - accuracy: 0.7981 - 190ms/epoch - 6ms/step
Epoch 3/200
31/31 - 0s - loss: 0.4586 - accuracy: 0.8012 - 188ms/epoch - 6ms/step
Epoch 4/200
31/31 - 0s - loss: 0.4583 - accuracy: 0.8002 - 191ms/epoch - 6ms/step
Epoch 5/200
31/31 - 0s - loss: 0.4581 - accuracy: 0.8012 - 190ms/epoch - 6ms/step
Epoch 6/200
31/31 - 0s - loss: 0.4577 - accuracy: 0.8012 - 195ms/epoch - 6ms/step
Epoch 7/200
31/31 - 0s - loss: 0.4576 - accuracy: 0.8012 - 186ms/epoch - 6ms/step
Epoch 8/200
31/31 - 0s - loss: 0.4581 - accuracy: 0.8023 - 187ms/epoch - 6ms/step
Epoch 9/200
31/31 - 0s - loss: 0.4578 - accuracy: 0.8033 - 195ms/epoch - 6ms/step
Epoch 10/200
31/31 - 0s - loss: 0.4571 - accuracy: 0.8033 - 198ms/epoch - 6ms/step
Epoch 11/200
31/31 - 0s - loss: 0.4568 - accuracy: 0.7992 - 182ms/epoch - 6ms/step
Epoch 12/200
31/31 - 0s - loss: 0.4568 - accuracy: 0.8033 - 188ms/epoch - 6ms/step
Epoch 13/200


<keras.callbacks.History at 0x12e0809bac0>

In [81]:
x=df_dropped.iloc[:,:-1].values
y=df_dropped.loc[:,'severity'].values

In [82]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.79698441,  0.22038395,  1.43676223],
       [-0.86561042, -1.41505218, -1.18321596],
       [ 0.17351135,  1.03810202,  1.43676223],
       ...,
       [ 0.58916006,  1.03810202,  1.43676223],
       [ 0.72770962,  1.03810202,  1.43676223],
       [ 0.45061049,  0.22038395,  0.12677314]])

In [84]:
model = Sequential()
    #4 feature inputs going into an 6-unit layer (more does not seem to help - in fact you can go down to 4)
model.add(Dense(6, input_dim=3, kernel_initializer='normal', activation='relu'))
    # "Deep learning" turns out to be unnecessary - this additional hidden layer doesn't help either.
model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    # Output layer with a binary classification (benign or malignant)
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model; adam seemed to work best
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [85]:
model.fit(x_scaled,y,verbose=2,epochs=200)

Epoch 1/200
31/31 - 1s - loss: 0.6925 - accuracy: 0.5317 - 685ms/epoch - 22ms/step
Epoch 2/200
31/31 - 0s - loss: 0.6887 - accuracy: 0.5369 - 195ms/epoch - 6ms/step
Epoch 3/200
31/31 - 0s - loss: 0.6775 - accuracy: 0.5494 - 188ms/epoch - 6ms/step
Epoch 4/200
31/31 - 0s - loss: 0.6510 - accuracy: 0.7482 - 196ms/epoch - 6ms/step
Epoch 5/200
31/31 - 0s - loss: 0.6097 - accuracy: 0.7867 - 206ms/epoch - 7ms/step
Epoch 6/200
31/31 - 0s - loss: 0.5694 - accuracy: 0.7898 - 190ms/epoch - 6ms/step
Epoch 7/200
31/31 - 0s - loss: 0.5416 - accuracy: 0.7908 - 212ms/epoch - 7ms/step
Epoch 8/200
31/31 - 0s - loss: 0.5230 - accuracy: 0.7950 - 185ms/epoch - 6ms/step
Epoch 9/200
31/31 - 0s - loss: 0.5091 - accuracy: 0.7960 - 206ms/epoch - 7ms/step
Epoch 10/200
31/31 - 0s - loss: 0.4990 - accuracy: 0.7908 - 185ms/epoch - 6ms/step
Epoch 11/200
31/31 - 0s - loss: 0.4901 - accuracy: 0.7919 - 190ms/epoch - 6ms/step
Epoch 12/200
31/31 - 0s - loss: 0.4825 - accuracy: 0.7940 - 222ms/epoch - 7ms/step
Epoch 13/200

<keras.callbacks.History at 0x12d43bd7910>