# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Data Preprocessing
Every data, that we wish to fed into a learning algorithm to learns the patterns and trends in it, must be frist cleaned and made ready for the algorithm to understand it.

In [2]:
#loading dataset

columns = ["id","diagnosis","radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean",
           "concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se",
           "perimeter_se","area_se","smoothness_se","compactness_se","concavity_se","concave points_se","symmetry_se",
           "fractal_dimension_se","radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst",
           "compactness_worst","concavity_worst","concave points_worst","symmetry_worst","fractal_dimension_worst",]
df = pd.read_csv("wdbc.csv",names = columns)
print(df.shape)
df.head(10)

(569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [3]:
# droping id columns because it is not needed.
del df['id']
#checking for column with a empty cell i.e containing NA value
df.isna().any()

diagnosis                  False
radius_mean                False
texture_mean               False
perimeter_mean             False
area_mean                  False
smoothness_mean            False
compactness_mean           False
concavity_mean             False
concave points_mean        False
symmetry_mean              False
fractal_dimension_mean     False
radius_se                  False
texture_se                 False
perimeter_se               False
area_se                    False
smoothness_se              False
compactness_se             False
concavity_se               False
concave points_se          False
symmetry_se                False
fractal_dimension_se       False
radius_worst               False
texture_worst              False
perimeter_worst            False
area_worst                 False
smoothness_worst           False
compactness_worst          False
concavity_worst            False
concave points_worst       False
symmetry_worst             False
fractal_di

In [4]:
# checking total malignant(cancer) cells and benign(non-cancer) cells.
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [5]:
# changing categorical data into int numbers so that learning algorithms can understand them.
# the malignant representation "M" will be converted to 1 and benign representation to 0.
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df["diagnosis"].values)
df['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [6]:
# now we will be spliting dataset into training and testing parts.
y = df.iloc[:,0].values
x = df.iloc[:,0:].values
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [7]:
# now we will feature scale the data. It is important so that all the valuse will have same weight for
# our learning algorithm.

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model Building
Now that our data is ready to be fed in learning algorithms, we will proceed toward model building. We will be using many different classification algorithms and will see how each would preform on same data.


In [8]:
#Using Logistic Regression Algorithm to the Training Set

lr = LogisticRegression(random_state = 0)
lr.fit(X_train, Y_train)

#Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm

knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, Y_train)

#Using SVC method of svm class to use Support Vector Machine Algorithm

svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, Y_train)

#Using GaussianNB method of naïve_bayes class to use Naïve Bayes Algorithm

gnb = GaussianNB()
gnb.fit(X_train, Y_train)

#Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm

dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt.fit(X_train, Y_train)

#Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm

rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
# now we will check accuracy of each model by testing them on X_test.
model = (lr,knn,svm,gnb,dt,rf)
models = ('lr','knn','svm','gnb','dt','rf')
for i in range(len((lr,knn,svm,gnb,dt,rf))):
    print("The accuracy of {} is {}".format(models[i],accuracy_score(Y_test, model[i].predict(X_test))))
    print()#Print a new line

The accuracy of lr is 1.0

The accuracy of knn is 0.986013986013986

The accuracy of svm is 1.0

The accuracy of gnb is 1.0

The accuracy of dt is 1.0

The accuracy of rf is 1.0



In [10]:
# now we will use confusion matrix to see how many times the models predicted accurately.
# the count of rightly predicted will be on the main diagnol of the matrix.
# the count of wrongly predicted will be on the other diagnol of the matrix.

for i in range(len(model)):
    cm = confusion_matrix(Y_test, model[i].predict(X_test))
 
    print(cm)
    print('{} Testing Accuracy = "{}!"'.format(models[i],  (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[1][1] + cm[1][0] + cm[0][1])))
    print()# Print a new line

[[90  0]
 [ 0 53]]
lr Testing Accuracy = "1.0!"

[[90  0]
 [ 2 51]]
knn Testing Accuracy = "0.986013986013986!"

[[90  0]
 [ 0 53]]
svm Testing Accuracy = "1.0!"

[[90  0]
 [ 0 53]]
gnb Testing Accuracy = "1.0!"

[[90  0]
 [ 0 53]]
dt Testing Accuracy = "1.0!"

[[90  0]
 [ 0 53]]
rf Testing Accuracy = "1.0!"



In [11]:
for i in range(len(model)):
    pred = model[i].predict(X_test)
    print("{}".format(models[i]))
    unique, counts = np.unique(pred, return_counts=True)
    pred = dict(zip(unique, counts))
    print(pred)

    #Print a space
    print()

    #Print the actual values
    unique, counts = np.unique(Y_test, return_counts=True)
    Y_test_count = dict(zip(unique, counts))
    print(Y_test_count)

lr
{0: 90, 1: 53}

{0: 90, 1: 53}
knn
{0: 92, 1: 51}

{0: 90, 1: 53}
svm
{0: 90, 1: 53}

{0: 90, 1: 53}
gnb
{0: 90, 1: 53}

{0: 90, 1: 53}
dt
{0: 90, 1: 53}

{0: 90, 1: 53}
rf
{0: 90, 1: 53}

{0: 90, 1: 53}
