In [None]:
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# get data
# pure-Python download so the cell also works where `wget` is not installed;
# the download is skipped when a local copy already exists, which keeps Restart & Run All cheap
# (delete cell_samples.csv to force a fresh download)
import os
import urllib.request

DATA_URL = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/cell_samples.csv"
DATA_PATH = "cell_samples.csv"

if not os.path.exists(DATA_PATH):
    urllib.request.urlretrieve(DATA_URL, DATA_PATH)

In [None]:
# read the downloaded CSV into a DataFrame and preview its first five rows
df = pd.read_csv("cell_samples.csv")
df.head(n = 5)

In [None]:
# preview the last five rows as well
df.tail(n = 5)

In [None]:
# summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# scatter of the first 50 rows of each class: clump thickness (x) against uniformity of cell size (y)
malignant_rows = df[df['Class'] == 4][0:50]
benign_rows = df[df['Class'] == 2][0:50]

ax = malignant_rows.plot(kind = 'scatter', x = 'Clump', y = 'UnifSize', color = 'DarkBlue', label = 'malignant')
# passing ax = ax draws the benign points onto the same axes, combining both classes in one chart
benign_rows.plot(kind = 'scatter', x = 'Clump', y = 'UnifSize', color = 'Yellow', label = 'benign', ax = ax)
plt.show()

In [None]:
# dtype of every column
df.dtypes

In [None]:
# drop the ROWS whose BareNuc value is not numeric (the column contains strings), then cast the column to int
bare_nuc_numeric = pd.to_numeric(df['BareNuc'], errors='coerce')
# .copy() makes the filtered frame independent of the original, so the assignment below
# does not raise SettingWithCopyWarning
df = df[bare_nuc_numeric.notnull()].copy()
df['BareNuc'] = df['BareNuc'].astype('int')
df.dtypes

In [None]:
# feature matrix: the nine feature columns as a NumPy array
feature_columns = ['Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc', 'BlandChrom', 'NormNucl', 'Mit']
X = np.asarray(df[feature_columns])
X[:5]

In [None]:
# target vector: the Class column as a NumPy array
y = np.asarray(df['Class'])
y[:5]

In [None]:
# hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4)

# report the shapes of both splits
for split_label, split_X, split_y in (('Train set:', X_train, y_train), ('Test set:', X_test, y_test)):
    print(split_label, split_X.shape, split_y.shape)

In [None]:
# svm model: fit a support vector classifier with an rbf kernel on the training split
from sklearn import svm
clf = svm.SVC(kernel = 'rbf')
clf.fit(X_train, y_train)

In [None]:
# predict the class of every test sample and preview the first five predictions
yhat = clf.predict(X_test)
yhat[:5]

In [None]:
# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [None]:
def plot_confusion_matrix(cm, classes, normalize = False, title = 'Confusion matrix', cmap= plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D numpy array
        The confusion matrix. Rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When True, every row is divided by its sum before printing/plotting.
        A row whose sum is zero is left as all zeros instead of becoming NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
    """

    if normalize:
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        # a row with no samples would give 0/0 = NaN; dividing by 1 keeps that row at zero
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")

    print(cm)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    # write each cell's value on top of the image; use white text on cells darker than the midpoint
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment = 'center', color = 'white' if cm[i, j] > thresh else "black")

    # set the axis labels BEFORE tight_layout so the layout makes room for them
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# compute the confusion matrix, with rows/columns ordered as benign (2) then malignant (4)
class_labels = [2, 4]
class_names = ['Benign(2)', 'Malignant(4)']
cnf_matrix = confusion_matrix(y_test, yhat, labels = class_labels)

# show two decimals when numpy arrays are printed
np.set_printoptions(precision = 2)

# per-class precision / recall / f1
print(classification_report(y_test, yhat))

# plot the non-normalized confusion matrix on a fresh figure
plt.figure()
plot_confusion_matrix(cnf_matrix, classes = class_names, normalize = False, title = 'Confusion matrix')

In [None]:
# weighted-average F1 score of the predictions on the test split
from sklearn.metrics import f1_score
f1_score(y_true = y_test, y_pred = yhat, average = 'weighted')

In [None]:
# jaccard index of the malignant class (label 4)
# jaccard_similarity_score has been removed from scikit-learn; jaccard_score replaces it.
# pos_label must be given explicitly because the classes are labelled 2/4 rather than 0/1.
# NOTE: the removed function returned plain accuracy for this kind of target, so the value
# differs from results produced with old scikit-learn versions.
from sklearn.metrics import jaccard_score
jaccard_score(y_test, yhat, pos_label = 4)

In [None]:
# model with linear kernel
# jaccard_similarity_score has been removed from scikit-learn; jaccard_score replaces it
from sklearn.metrics import jaccard_score

clf2 = svm.SVC(kernel = 'linear')
clf2.fit(X_train, y_train)
yhat2 = clf2.predict(X_test)
# a bare expression in the middle of a cell displays nothing, so print the preview explicitly
print(yhat2[0:5])
print("Avg F1-score: %.4f" % f1_score(y_test, yhat2, average='weighted'))
# pos_label = 4: jaccard index of the malignant class (labels are 2/4, not 0/1)
print("Jaccard score: %.4f" % jaccard_score(y_test, yhat2, pos_label = 4))
