In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
# Load the gene-expression dataset and preview its first four rows.
csv_path = "gene_expression.csv"
gene = pd.read_csv(csv_path)
gene.head(4)

Unnamed: 0,Gene One,Gene Two,Cancer Present
0,4.3,3.9,1
1,2.5,6.3,0
2,5.7,3.9,1
3,6.1,6.2,0


In [3]:
# Summary statistics for every column. Both gene columns span 1.0 to 10.0 and
# the quartiles sit close to the mean, so there appear to be no outliers.
gene.describe()

Unnamed: 0,Gene One,Gene Two,Cancer Present
count,3000.0,3000.0,3000.0
mean,5.600133,5.410467,0.5
std,1.828388,1.729081,0.500083
min,1.0,1.0,0.0
25%,4.3,4.0,0.0
50%,5.6,5.4,0.5
75%,6.9,6.7,1.0
max,10.0,10.0,1.0


In [4]:
# Inspect the dtype of each column.
gene.dtypes

Gene One          float64
Gene Two          float64
Cancer Present      int64
dtype: object

In [5]:
# Count missing values per column; every count is zero, so no data is missing.
gene.isna().sum()

Gene One          0
Gene Two          0
Cancer Present    0
dtype: int64

In [6]:
# Largest value found in each column.
gene.max()

Gene One          10.0
Gene Two          10.0
Cancer Present     1.0
dtype: float64

In [8]:
# Separate the independent variables (every column except the last) from the
# dependent variable (the last column).
x = gene.iloc[:, :-1].to_numpy()
y = gene.iloc[:, -1].to_numpy()
x[:3]

array([[4.3, 3.9],
       [2.5, 6.3],
       [5.7, 3.9]])

In [63]:
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
x_test[:5]

array([[7.7, 4.1],
       [5.3, 2.8],
       [8.9, 4.2],
       [3.3, 5.7],
       [4.7, 7.3]])

In [12]:
# (rows, columns) of the held-out test features.
x_test.shape

(750, 2)

In [13]:
# (rows, columns) of the training features.
x_train.shape

(2250, 2)

In [67]:
# Side-by-side table of the test features, the true labels and the KNN predictions.
#
# This cell sits above the KNN section, so on a fresh kernel (Restart & Run All)
# `y_predict` would not exist yet and the cell would fail with a NameError.
# The predictions are therefore produced here, using the same classifier
# setting (k=8) that the KNN section below uses for its final model.
from sklearn.neighbors import KNeighborsClassifier

y_predict = KNeighborsClassifier(n_neighbors=8).fit(x_train, y_train).predict(x_test)

x_test_df = pd.DataFrame(x_test, columns=["Gene One", "Gene Two"])
y_test_df = pd.DataFrame({"y_test": y_test})
y_predict_df = pd.DataFrame({"y_predict": y_predict})

# All three frames share the default 0..n-1 index, so a column-wise concat
# lines the rows up one-to-one.
pd.concat([x_test_df, y_test_df, y_predict_df], axis=1)

Unnamed: 0,Gene One,Gene Two,y_test,y_predict
0,7.7,4.1,1,1
1,5.3,2.8,1,1
2,8.9,4.2,1,1
3,3.3,5.7,0,0
4,4.7,7.3,0,0
...,...,...,...,...
745,8.9,5.3,1,1
746,6.3,4.3,1,1
747,6.9,3.3,1,1
748,5.8,6.1,0,0


#### USING KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Elbow method: fit one KNN per candidate k (scikit-learn's default is k=5) and
# record its error rate on the test split.
# test_error_rates[i] belongs to k = i + 1, because k starts at 1.
test_error_rates = []
for k in range(1, 10):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train, y_train)
    y_pred_test = knn_model.predict(x_test)
    test_error_rates.append(1 - accuracy_score(y_test, y_pred_test))

# Reading the result: the last entry (list position 8) is k=9, not k=8.
# The lowest recorded error is at k=9 (~0.065), followed by k=7 (~0.068) and
# k=8 (~0.071).
# NOTE(review): k is selected on the test split here, so the accuracy later
# reported for the chosen k is optimistic; a validation split or
# cross-validation on the training data would avoid that.
test_error_rates

[0.10399999999999998,
 0.10266666666666668,
 0.07199999999999995,
 0.07866666666666666,
 0.07333333333333336,
 0.07333333333333336,
 0.06799999999999995,
 0.07066666666666666,
 0.06533333333333335]

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Final KNN classifier with k=8 and Minkowski distance with p=2 (Euclidean).
# NOTE(review): the error rates recorded by the elbow search above are lowest
# at k=9 rather than k=8 -- confirm which k is intended.
knn_params = dict(
    n_neighbors=8,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
)
model = KNeighborsClassifier(**knn_params)
model.fit(x_train, y_train)

# Predict the held-out samples and peek at the first three labels.
y_predict = model.predict(x_test)
y_predict[:3]

array([1, 1, 1], dtype=int64)

In [19]:
# Evaluate the KNN predictions: overall accuracy first, then the confusion
# matrix (rows are the true classes, columns the predicted classes).
from sklearn.metrics import accuracy_score, confusion_matrix

knn_accuracy = accuracy_score(y_test, y_predict)
print(f" The accuracy score = {knn_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict)

 The accuracy score = 92.93%


array([[381,  20],
       [ 33, 316]], dtype=int64)

#### USING DECISION TREE

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that scores splits by entropy and uses the 'random' splitter;
# random_state is fixed so repeated runs build the same tree.
model2 = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    random_state=12,
)
model2.fit(x_train, y_train)

# Predict the held-out samples and peek at the first five labels.
y_predict2 = model2.predict(x_test)
y_predict2[:5]

array([1, 1, 1, 0, 0], dtype=int64)

In [29]:
# Evaluate the decision-tree predictions: accuracy, then the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

tree_accuracy = accuracy_score(y_test, y_predict2)
print(f" The accuracy score = {tree_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict2)

 The accuracy score = 88.67%


array([[364,  37],
       [ 48, 301]], dtype=int64)

#### USING RANDOM FOREST

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the gini criterion, seeded for repeatability.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion="gini",
    random_state=0,
)
classifier.fit(x_train, y_train)

# Predict the held-out samples and peek at the first three labels.
y_predict3 = classifier.predict(x_test)
y_predict3[:3]

array([1, 1, 1], dtype=int64)

In [42]:
# Evaluate the random-forest predictions: accuracy, then the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

forest_accuracy = accuracy_score(y_test, y_predict3)
print(f" The accuracy score = {forest_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict3)

 The accuracy score = 91.33%


array([[369,  32],
       [ 33, 316]], dtype=int64)

#### USING LOGISTIC REGRESSION

In [56]:
# Train a logistic-regression classifier (L2 penalty, newton-cg solver).
from sklearn.linear_model import LogisticRegression

# random_state is optional. Reminder: the NumPy counterpart is calling
# np.random.seed(23) before something like np.random.randint(1, 3, 2).
Log_model = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    random_state=0,
)

# Fit on the training split, then predict the held-out samples.
Log_model.fit(x_train, y_train)
y_predict4 = Log_model.predict(x_test)
y_predict4[:3]

array([1, 1, 1], dtype=int64)

In [57]:
# Evaluate the logistic-regression predictions: accuracy, then the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

logistic_accuracy = accuracy_score(y_test, y_predict4)
print(f" The accuracy score = {logistic_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict4)

 The accuracy score = 85.20%


array([[340,  61],
       [ 50, 299]], dtype=int64)

#### USING SUPPORT VECTOR MACHINE

In [58]:
from sklearn.svm import SVC

# Support vector classifier constructed with no arguments, i.e. library defaults.
model_SV = SVC()
model_SV.fit(x_train, y_train)

# Predict the held-out samples and peek at the first four labels.
y_predict5 = model_SV.predict(x_test)
y_predict5[:4]

array([1, 1, 1, 0], dtype=int64)

In [59]:
# Evaluate the SVM predictions: accuracy, then the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

svm_accuracy = accuracy_score(y_test, y_predict5)
print(f" The accuracy score = {svm_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict5)

 The accuracy score = 94.53%


array([[384,  17],
       [ 24, 325]], dtype=int64)

#### USING NAIVE BAYES

In [60]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier constructed with no arguments.
Gu_model = GaussianNB()
Gu_model.fit(x_train, y_train)

# Predict the held-out samples and peek at the first four labels.
y_predict6 = Gu_model.predict(x_test)
y_predict6[:4]

array([1, 1, 1, 0], dtype=int64)

In [61]:
# Evaluate the naive Bayes predictions: accuracy, then the confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

bayes_accuracy = accuracy_score(y_test, y_predict6)
print(f" The accuracy score = {bayes_accuracy * 100:0.2f}%")
confusion_matrix(y_test, y_predict6)

 The accuracy score = 84.80%


array([[338,  63],
       [ 51, 298]], dtype=int64)

In [62]:
# The support vector machine reached the highest test accuracy of the models
# tried above, so it is the one used for new predictions.
# Sample to classify: Gene One = 5.7, Gene Two = 3.9.
new_sample = [[5.7, 3.9]]
model_SV.predict(new_sample)

array([1], dtype=int64)

In [70]:
# Classify a second sample with the SVM: Gene One = 5.8, Gene Two = 6.1.
second_sample = [[5.8, 6.1]]
model_SV.predict(second_sample)

array([0], dtype=int64)