In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from keras import Sequential
from keras.layers import Dense

In [2]:
# read the dataset
data = pd.read_csv('Data_Sets/mammographic_masses.data')
data.head(10)

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0
5,4,70,?,?,3,0
6,5,42,1,?,3,0
7,5,57,1,5,3,1
8,5,60,?,5,1,1
9,5,76,1,4,3,1


In [3]:
# change the column names
data = pd.DataFrame(data)
data.columns = ['BI_RADS','age','shape','margin','density','severity']
data.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


In [4]:
# replace all question marks with NaN
data = data.replace('?',np.nan)
data.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,4,43,1,1.0,,1
1,5,58,4,5.0,3.0,1
2,4,28,1,1.0,3.0,0
3,5,74,1,5.0,,1
4,4,65,1,,3.0,0


In [5]:
# checking the null values in each column
data = data.astype(float)
data.isnull().sum()

BI_RADS      2
age          5
shape       31
margin      48
density     76
severity     0
dtype: int64

In [6]:
# replacing the null values with mean of columns
data = data.apply(lambda x: x.fillna(x.mean()),axis=0)
data = data.astype(int)
data.head(10)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,4,43,1,1,2,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,2,1
4,4,65,1,2,3,0
5,4,70,2,2,3,0
6,5,42,1,2,3,0
7,5,57,1,5,3,1
8,5,60,2,5,1,1
9,5,76,1,4,3,1


In [7]:
data.isnull().sum()

BI_RADS     0
age         0
shape       0
margin      0
density     0
severity    0
dtype: int64

In [8]:
# dropping the BI_RADS column
data.drop('BI_RADS',axis=1,inplace=True)
data.head(10)

Unnamed: 0,age,shape,margin,density,severity
0,43,1,1,2,1
1,58,4,5,3,1
2,28,1,1,3,0
3,74,1,5,2,1
4,65,1,2,3,0
5,70,2,2,3,0
6,42,1,2,3,0
7,57,1,5,3,1
8,60,2,5,1,1
9,76,1,4,3,1


In [9]:
# encoding the categorical variable
shape = pd.get_dummies(data['shape'],drop_first=True)
shape.head(10)

Unnamed: 0,2,3,4
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0
5,1,0,0
6,0,0,0
7,0,0,0
8,1,0,0
9,0,0,0


In [10]:
# encoding the categorical variable
margin = pd.get_dummies(data['margin'],drop_first=True)
margin.head(10)

Unnamed: 0,2,3,4,5
0,0,0,0,0
1,0,0,0,1
2,0,0,0,0
3,0,0,0,1
4,1,0,0,0
5,1,0,0,0
6,1,0,0,0
7,0,0,0,1
8,0,0,0,1
9,0,0,1,0


In [11]:
# encoding the categorical variable
density = pd.get_dummies(data['density'],drop_first=True)
density.head(10)

Unnamed: 0,2,3,4
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0
5,0,1,0
6,0,1,0
7,0,1,0
8,0,0,0
9,0,1,0


In [12]:
# dropping the categorical columns
data.drop(['shape','margin','density'],axis=1,inplace=True)
data.head(10)

Unnamed: 0,age,severity
0,43,1
1,58,1
2,28,0
3,74,1
4,65,0
5,70,0
6,42,0
7,57,1
8,60,1
9,76,1


In [13]:
# concatenating the dummy
data = pd.concat([data,shape,margin,density],axis=1)
data.head(10)

Unnamed: 0,age,severity,2,3,4,2.1,3.1,4.1,5,2.2,3.2,4.2
0,43,1,0,0,0,0,0,0,0,1,0,0
1,58,1,0,0,1,0,0,0,1,0,1,0
2,28,0,0,0,0,0,0,0,0,0,1,0
3,74,1,0,0,0,0,0,0,1,1,0,0
4,65,0,0,0,0,1,0,0,0,0,1,0
5,70,0,1,0,0,1,0,0,0,0,1,0
6,42,0,0,0,0,1,0,0,0,0,1,0
7,57,1,0,0,0,0,0,0,1,0,1,0
8,60,1,1,0,0,0,0,0,1,0,0,0
9,76,1,0,0,0,0,0,1,0,0,1,0


In [14]:
# splitting the independent and dependent variables
x = data.drop('severity',axis=1)
y = data['severity']

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=1/31,random_state=0)

In [16]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [17]:
# applying the logistic regression
model = LogisticRegression()
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

83.87096774193549

In [18]:
# applying the knn
model = KNeighborsClassifier(n_neighbors=5,p=2,metric='minkowski')
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

87.09677419354838

In [19]:
# applying the svm
model = SVC(kernel='linear')
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

87.09677419354838

In [20]:
# applying the decision tree
model = DecisionTreeClassifier(criterion='entropy',random_state=0)
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

77.41935483870968

In [21]:
# applying the random forest
model = RandomForestClassifier(n_estimators=100,criterion="entropy")
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

83.87096774193549

In [22]:
# applying the Naives Bayes
model = GaussianNB()
model.fit(x_train,y_train)
predict = model.predict(x_test)
accuracy_score(y_test,predict)*100

77.41935483870968

In [23]:
# applying neural networks
model = Sequential()
model.add(Dense(6,activation='relu',kernel_initializer='uniform',input_dim=11))
model.add(Dense(6,kernel_initializer='uniform',activation='relu'))
model.add(Dense(1,activation='sigmoid',kernel_initializer='uniform'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,batch_size=1,epochs=100,validation_data=(x_test,y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f11647a5dc0>

In [24]:
y_pred = model.predict(x_test)
y_pred = y_pred > 0.5
accuracy_score(y_pred,y_test)*100

87.09677419354838