In [1]:
# Step 1. Importing Packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Step 2. Read the cell-sample records from the CSV file into a DataFrame
# and preview the first five rows.

cell_df = pd.read_csv(filepath_or_buffer="cell_samples.csv")
cell_df.head(n=5)

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Display the DataFrame itself (the rendered table abbreviates the middle rows with "...")
cell_df

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# Checking the data: print each column's non-null count and dtype
cell_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           699 non-null    int64 
 1   Clump        699 non-null    int64 
 2   UnifSize     699 non-null    int64 
 3   UnifShape    699 non-null    int64 
 4   MargAdh      699 non-null    int64 
 5   SingEpiSize  699 non-null    int64 
 6   BareNuc      699 non-null    object
 7   BlandChrom   699 non-null    int64 
 8   NormNucl     699 non-null    int64 
 9   Mit          699 non-null    int64 
 10  Class        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [5]:
# Drop BareNuc because it is the only object-dtype (non-numeric) column.
# Reassign instead of mutating in place, and ignore an already-missing column,
# so that re-running this cell does not raise KeyError once BareNuc is gone.

cell_df = cell_df.drop(columns='BareNuc', errors='ignore')

In [6]:
# Preview the first five rows again now that BareNuc has been removed

cell_df.head(n=5)

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,3,1,1,2
1,1002945,5,4,4,5,7,3,2,1,2
2,1015425,3,1,1,1,2,3,1,1,2
3,1016277,6,8,8,1,3,3,7,1,2
4,1017023,4,1,1,3,2,3,1,1,2


In [7]:
# Data selection: pick the independent variables (predictor columns).
# ID and Class are left out of the feature frame.

feature_columns = [
    'Clump',
    'UnifSize',
    'UnifShape',
    'MargAdh',
    'SingEpiSize',
    'BlandChrom',
    'NormNucl',
    'Mit',
]
feature_df = cell_df[feature_columns]
feature_df.head(n=5)

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit
0,5,1,1,1,2,3,1,1
1,5,4,4,5,7,3,2,1
2,3,1,1,1,2,3,1,1
3,6,8,8,1,3,3,7,1
4,4,1,1,3,2,3,1,1


In [8]:
# Show the dtype of each selected feature column
feature_df.dtypes

Clump          int64
UnifSize       int64
UnifShape      int64
MargAdh        int64
SingEpiSize    int64
BlandChrom     int64
NormNucl       int64
Mit            int64
dtype: object

In [9]:
# Convert the feature frame into a NumPy array x and peek at its first three rows
x = np.asarray(feature_df)
x[:3]

array([[5, 1, 1, 1, 2, 3, 1, 1],
       [5, 4, 4, 5, 7, 3, 2, 1],
       [3, 1, 1, 1, 2, 3, 1, 1]], dtype=int64)

In [10]:
# Cast the Class column to an integer dtype, then expose it as a NumPy
# array y (the dependent variable) and peek at its first three values.

class_as_int = cell_df['Class'].astype('int')
cell_df['Class'] = class_as_int

y = np.asarray(cell_df['Class'])

y[:3]

array([2, 2, 2])

In [11]:
# Train / test split: hold out 20% of the rows for testing, with a fixed
# random_state so the same split is produced on every run.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=50,
)

# Report the shapes of both partitions.
for label, features, targets in (
    ('Train Set:', x_train, y_train),
    ('Test Set:', x_test, y_test),
):
    print(label, features.shape, targets.shape)

Train Set: (559, 8) (559,)
Test Set: (140, 8) (140,)


In [12]:
# Modeling ( SVM with Scikit-learn)
# The closing sentence of the note names the polynomial kernel, matching the model fitted in the next cell.

'''The SVM algorithm offers a choice of kernel functions for performing its processing. Basically, mapping data into a higher 
 dimensional space is called kernelling. The mathematical function used for the transformation is known as the kernel function,
 and can be of different types, such as:

1.Linear
2.Polynomial
3.Radial basis function (RBF)
4.Sigmoid
 Each of these functions has its characteristics, its pros and cons, and its equation, but as there's no easy way of knowing
 which function performs best with any given dataset, we usually choose different functions in turn and compare the results. 
 Let's start with the polynomial kernel and compare it with the RBF and linear kernels afterwards'''

"The SVM algorithm offers a choice of kernel functions for performing its processing. Basically, mapping data into a higher \n dimensional space is called kernelling. The mathematical function used for the transformation is known as the kernel function,\n and can be of different types, such as:\n\n1.Linear\n2.Polynomial\n3.Radial basis function (RBF)\n4.Sigmoid\n Each of these functions has its characteristics, its pros and cons, and its equation, but as there's no easy way of knowing\n which function performs best with any given dataset, we usually choose different functions in turn and compare the results. \n Let's just use the default, RBF (Radial Basis Function) for this lab"

In [13]:
# Fit a support vector classifier with a polynomial kernel on the training split.
# svm is imported with the other packages in the first cell of the notebook.
clf = svm.SVC(kernel ='poly')
clf.fit(x_train, y_train)

SVC(kernel='poly')

In [14]:
# Use the fitted classifier to predict labels for the held-out test rows,
# then peek at the first three predictions.

yhat = clf.predict(x_test)
yhat[:3]

array([2, 2, 2])

In [15]:
# Evaluation: weighted-average F1 of the polynomial-kernel predictions
# against the true test labels.
# f1_score is imported with the other packages in the first cell of the notebook.

f1_score(y_test, yhat, average = 'weighted')

0.9425054112554112

In [16]:
# Practice
# Can you rebuild the model, but this time with a __linear__ kernel? You can use the __kernel='linear'__ option when you
# define the SVM. How does the accuracy change with the new kernel function?

In [17]:
# The kernel function takes the original input data and calculates the inner products between pairs of data points.
# By applying a non-linear mapping, it allows SVMs to find a decision boundary that separates different classes more effectively.

In [21]:
# Second model: same training data, RBF kernel. Compare its weighted F1
# with the score of the polynomial-kernel model above.

clf2 = svm.SVC(kernel='rbf').fit(x_train, y_train)
yhat2 = clf2.predict(x_test)
rbf_f1 = f1_score(y_test, yhat2, average="weighted")
# "%4f" sets a minimum field width of 4, not a precision, so the score
# prints with the default six decimal places.
print("Avg F1-score: %4f" % rbf_f1)

Avg F1-score: 0.971429


In [22]:
# Third model: linear kernel, for comparison with the polynomial and RBF models.
# Predictions are stored in yhat3 so the RBF predictions held in yhat2 are
# not silently overwritten.
clf3 = svm.SVC(kernel = 'linear')
clf3 = clf3.fit(x_train, y_train)
yhat3 = clf3.predict(x_test)
print("Avg F1-score: %4f" % f1_score(y_test, yhat3, average= "weighted"))

Avg F1-score: 0.964387
