In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
try:
    # Legacy module: removed in scikit-learn 0.20. Kept so the notebook still
    # imports on old installs; skipped on anything newer.
    from sklearn import cross_validation
    from sklearn.cross_validation import cross_val_score
except ImportError:
    pass
from sklearn.model_selection import train_test_split
# Supported home of cross_val_score; imported last so it is the binding in use.
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

warnings.filterwarnings('ignore')



In [2]:
# Load the dataset and build the train/test split used by the cells below.
# The CSV is expected to provide at least the 'id', 'class' and 'bare_nuclei'
# columns referenced here.
df = pd.read_csv('breast-cancer-wisconsin.csv')

# '?' placeholders become the numeric sentinel -99999 and the 'id' column is
# dropped. `axis` is passed by keyword: a positional axis is deprecated since
# pandas 1.3 and raises TypeError on pandas 2.x.
df = df.replace('?', -99999).drop(['id'], axis=1)

# Features: every remaining column except the target and 'bare_nuclei'.
# NOTE(review): presumably 'bare_nuclei' is excluded because it is the column
# holding the '?' placeholders -- confirm against the data.
X = np.array(df.drop(['class', 'bare_nuclei'], axis=1))
y = df['class']

# 80/20 split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [3]:
# Spot Check Algorithms
# Each candidate classifier is scored with 5-fold cross-validation on the
# training split, using the estimator's default scorer.

models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    # Seeded (same value as the train/test split) so the CART score does not
    # change between runs.
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]


# evaluate each model in turn
# `results` holds the per-fold score arrays and `names` the matching labels,
# in the same order as `models`.
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=5)
    results.append(cv_results)
    names.append(name)
    # Printed as "<label>: <mean score> (<std across folds>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


LR: 0.957013 (0.018414)
LDA: 0.949838 (0.016804)
KNN: 0.949838 (0.021019)
CART: 0.935551 (0.021621)
NB: 0.956997 (0.015641)
SVM: 0.958735 (0.018688)


In [4]:

# Fit a logistic regression on the training split and score its predictions
# on the held-out test split.
lr = LogisticRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)

print("lr testing")
print(accuracy_score(y_test, predictions))
# confusion_matrix(y_test, predictions) and
# classification_report(y_test, predictions) can be printed the same way when
# a per-class breakdown is wanted.

# Learned parameters of the fitted model.
coeff, inter = lr.coef_, lr.intercept_

for template, value in (('Coefficients: {}', coeff), ('Intercept: {}', inter)):
    print(template.format(value))

lr testing
0.9714285714285714
Coefficients: [[0.31242254 0.10996486 0.44464014 0.18344483 0.04387358 0.31652595
  0.1017104  0.13944887]]
Intercept: [-6.1653044]


In [5]:
#Applying Neural network
# Small fully connected Keras classifier trained on the same features as the
# scikit-learn models, with one-hot encoded targets.
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.preprocessing import scale

# One-hot encode the target: one column per distinct value of 'class'.
# Cast to float32 because pandas >= 2.0 returns bool dummy columns.
y2 = pd.get_dummies(df['class']).astype('float32')

# Same test_size and random_state as the earlier split. The network's split
# gets its own names so the label-valued y_train / y_test used by the
# scikit-learn cells are not overwritten with one-hot frames (which would
# break those cells if they were re-run after this one).
X_train_nn, X_test_nn, y_train_onehot, y_test_onehot = train_test_split(
    X, y2, test_size=.20, random_state=42)

n_features = X.shape[1]
n_classes = y2.shape[1]  # width of the softmax output layer

model = Sequential()
model.add(Dense(n_features, input_dim=n_features, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# `epochs` is the Keras 2+ name of the old `nb_epoch` argument, which newer
# Keras releases reject.
history = model.fit(X_train_nn, y_train_onehot.values, epochs=15, verbose=1, batch_size=5)

# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
[test_loss, test_acc] = model.evaluate(X_test_nn, y_test_onehot.values, batch_size=5)
print("Evaluation result on Test Data : Loss = {}, accuracy = {}".format(test_loss, test_acc))

Using TensorFlow backend.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Evaluation result on Test Data : Loss = 0.23308257226433074, accuracy = 0.9285714328289032
