In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Data cleaning and removal of null (NaN) values

In [2]:
import random
data = pd.read_csv('cervical-cancer_csv.csv')

# Drop the two "time since diagnosis" columns, which are not used in this analysis.
data = data.drop(columns=['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'])

# thresh=30 keeps only the rows that have at least 30 non-null values.
data = data.dropna(thresh=30)

# Mean of the partners column, kept for inspection (computed before any imputation).
# The rounded constants below were derived by hand from the column means:
# partners 2.55 -> 3, age at first intercourse 17.12 -> 17, pregnancies 2.4 -> 2.
avg_partners = data['Number of sexual partners'].mean()
replace1 = 3
replace2 = 17
replace3 = 2

# Fill the count-like columns with their rounded means.
# A dict passed to DataFrame.fillna fills each column with its own value and returns
# a new frame; the previous `data[col].fillna(..., inplace=True)` form mutates a
# temporary column object and is not guaranteed to update `data` itself.
data = data.fillna({
    'Number of sexual partners': replace1,
    'First sexual intercourse': replace2,
    'Num of pregnancies': replace3,
})

# Every remaining NaN is replaced by one randomly drawn 0 or 1.
# A dedicated, seeded generator makes the draw (and so the cleaned data) identical
# on every run without touching the global `random` state.
# NOTE(review): the same single value is written into every remaining column,
# including non-binary ones such as the "(years)" columns - confirm this is intended.
g = random.Random(0).randint(0, 1)
data = data.fillna(g)
data
# data.to_csv('new_cervical.csv', index=False)


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,34,1.0,17.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.00,0.0,...,0.0,0,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
831,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
832,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0,0,0,0,0,0,0,1,0
833,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0.0,0,0,0,0,0,0,0,0,0


In [3]:
# Target construction.
# The dataset has five dependent variables: Dx, Hinselmann, Schiller, Citology, Biopsy.
# In medical terms Dx means diagnosis; Hinselmann and Schiller are cancer tests;
# Citology and Biopsy are also medical tests for detecting cancer cells.
#
# This is a classification model, so the label is a yes/no value. Two approaches were tried:
#   1) label = the mode (most frequent value) of the five dependent variables per row;
#   2) label = yes if at least two of the tests say yes (turned out to be less accurate).
# data['Biopsy'].unique()

# Row-wise mode over the five dependent variables (axis=1 works across columns).
mode_y = data.loc[:, ['Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy']].mode(axis=1)
# mode_y = pd.DataFrame(mode_y)

# Approach 1: store the row-wise mode as the target column 'y'.
data['y'] = mode_y
data

# Approach 2 (kept for reference, not used):
# prob_y = data[['Dx','Hinselmann', 'Schiller', 'Citology', 'Biopsy']]
# prob_sum = prob_y.sum(axis =1) 
# prob = (prob_sum >= 2).astype(int)
# data['y2'] = prob
# data[['y', 'y2']].to_csv('mode.csv', index=False)


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy,y
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
2,34,1.0,17.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.00,0.0,...,0,1,0,1,0,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.00,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
831,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.00,0.0,...,0,0,0,0,0,0,0,0,0,0
832,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0,0,0,0,0,0,0,1,0,0
833,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,0,0,0,0,0,0,0,0,0,0


### Feature Selection

In [4]:
# Feature matrix: the first 29 columns of `data` (positions 0..28), as a NumPy array.
# NOTE(review): presumably these are all predictors and none of the five test/diagnosis
# columns - confirm against data.columns.
X = data.iloc[:, :29].to_numpy()
# Target vector: the last column of `data` (the 'y' column when it was appended last).
y = data.iloc[:, -1].to_numpy()

In [5]:
# Hold out 25% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Standardise the features.
# The scaler is fitted on the training split only, and the same fitted scaler is then
# applied to both splits, so no statistics from the test set enter the scaling.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

### Training the models and performance metrics

In [7]:
# Model 1: random forest, splitting on the entropy criterion, with a fixed seed
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

In [8]:
# Predict test-set labels with the random forest fitted in the previous cell.
# y_pred is a notebook-level name that the confusion-matrix cells read.
y_pred = classifier.predict(X_test)

In [9]:
# Confusion matrix of the random forest predictions against the true test labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[172   0]
 [ 12   0]]


In [10]:
# k-fold cross-validation of the random forest on the training split.
# cv=10 splits the training data into 10 folds, giving one score per fold.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Printed on separate lines: the per-fold scores, their mean, and their standard deviation
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.96363636 0.96363636 0.94545455 0.94545455 0.94545455 0.94545455
 0.94545455 0.94545455 0.94545455 0.96296296]
0.9508417508417508
0.008230928803015496


In [11]:
# Model 2: support vector machine with a Gaussian (RBF) kernel
from sklearn.svm import SVC
classifier = SVC(
    kernel='rbf',
    random_state=0,
)
classifier.fit(X_train, y_train)


In [12]:
# Confusion matrix for the SVM classifier.
# y_pred must be recomputed from the classifier fitted in the previous cell; without
# this line y_pred still holds the predictions of an earlier model, so the matrix
# would describe that model rather than the SVM.
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[172   0]
 [ 12   0]]


In [13]:
# k-fold cross-validation of the SVM on the training split.
# cv=10 splits the training data into 10 folds, giving one score per fold.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Printed on separate lines: the per-fold scores, their mean, and their standard deviation
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.96363636 0.96363636 0.94545455 0.94545455 0.94545455 0.94545455
 0.94545455 0.94545455 0.94545455 0.96296296]
0.9508417508417508
0.008230928803015496


In [14]:
# Model 3: gradient-boosted trees (XGBoost) with the library's default hyper-parameters
from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(X_train, y_train)

In [15]:
# Confusion matrix for the XGBoost classifier.
# y_pred must be recomputed from the classifier fitted in the previous cell; without
# this line y_pred still holds the predictions of an earlier model, so the matrix
# would describe that model rather than XGBoost.
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[172   0]
 [ 12   0]]


In [16]:
# k-fold cross-validation of the XGBoost classifier on the training split.
# cv=10 splits the training data into 10 folds, giving one score per fold.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Printed on separate lines: the per-fold scores, their mean, and their standard deviation
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.96363636 0.94545455 0.94545455 0.92727273 0.96363636 0.92727273
 0.94545455 0.94545455 0.96363636 0.94444444]
0.9471717171717172
0.012745297481704908


In [17]:
# Model 4: logistic regression with a fixed seed
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(
    random_state=0,
)
classifier.fit(X_train, y_train)

In [18]:
# Confusion matrix for the logistic regression classifier.
# y_pred must be recomputed from the classifier fitted in the previous cell; without
# this line y_pred still holds the predictions of an earlier model, so the matrix
# would describe that model rather than the logistic regression.
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[172   0]
 [ 12   0]]


In [19]:
# k-fold cross-validation of the logistic regression on the training split.
# cv=10 splits the training data into 10 folds, giving one score per fold.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Printed on separate lines: the per-fold scores, their mean, and their standard deviation
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.96363636 0.94545455 0.90909091 0.94545455 0.94545455 0.92727273
 0.94545455 0.94545455 0.94545455 0.96296296]
0.9435690235690236
0.01501487206146428
