In [68]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [69]:
# load and read any dataset
data = pd.read_csv('D:/Cuproject/Urinary_biomarkers_for_Pancreatic_Cancer/cancer/Debernardi_et_al_2020_data.csv')
data.shape

(590, 14)

### Analyse the Raw data

In [70]:
data.head(5)

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


In [71]:
data_documentation = pd.read_csv('D:/Cuproject/Urinary_biomarkers_for_Pancreatic_Cancer/documentation/Debernardi_et_al_2020_documentation.csv')
data_documentation.head(14)

Unnamed: 0,Column name,Original column name,Details
0,sample_id,Sample ID,Unique string identifying each subject
1,patient_cohort,Patient's Cohort,"Cohort 1, previously used samples; Cohort 2, ..."
2,sample_origin,Sample Origin,"BPTB: Barts Pancreas Tissue Bank, London, UK; ..."
3,age,Age,Age in years
4,sex,Sex,"M = male, F = female"
5,diagnosis,"Diagnosis (1=Control, 2=Benign, 3=PDAC)","1 = control (no pancreatic disease), 2 = benig..."
6,stage,Stage,"For those with pancratic cancer, what stage wa..."
7,benign_sample_diagnosis,Benign Samples Diagnosis,"For those with a benign, non-cancerous diagnos..."
8,plasma_CA19_9,Plasma CA19-9 U/ml,Blood plasma levels of CA 19–9 monoclonal anti...
9,creatinine,Creatinine mg/ml,Urinary biomarker of kidney function


## Preprocessing

### Null Imputation

In [72]:
# to check which column has how many null data
data.isnull().sum()

sample_id                    0
patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
stage                      391
benign_sample_diagnosis    382
plasma_CA19_9              240
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                      284
dtype: int64

In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 14 columns):
sample_id                  590 non-null object
patient_cohort             590 non-null object
sample_origin              590 non-null object
age                        590 non-null int64
sex                        590 non-null object
diagnosis                  590 non-null int64
stage                      199 non-null object
benign_sample_diagnosis    208 non-null object
plasma_CA19_9              350 non-null float64
creatinine                 590 non-null float64
LYVE1                      590 non-null float64
REG1B                      590 non-null float64
TFF1                       590 non-null float64
REG1A                      306 non-null float64
dtypes: float64(6), int64(2), object(6)
memory usage: 64.7+ KB


In [74]:
data.nunique()

sample_id                  590
patient_cohort               2
sample_origin                4
age                         63
sex                          2
diagnosis                    3
stage                        8
benign_sample_diagnosis     52
plasma_CA19_9              266
creatinine                 198
LYVE1                      535
REG1B                      580
TFF1                       560
REG1A                      298
dtype: int64

In [75]:
#since there are 590 non unique, it's better to drop that column 
data = data.drop(columns=['sample_id'])

### Division of Data

In [76]:
#Since our 1st target value is diagnosis and stage has maximum null vlues hence seprating data into two parts
#one data without stage i.e.,diagnosis_data(A) and another data with stage i.e.,stage_data(B) but without diagnosis
final_data=data
diagnosis_data=final_data.drop('stage',axis=1)
stage_data = final_data.drop('diagnosis',axis=1)

In [77]:
stage_data.shape

(590, 12)

### Preprocessing and Null Imputation on diagnosis_data

In [78]:
diagnosis_data.isnull().sum()

patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
benign_sample_diagnosis    382
plasma_CA19_9              240
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                      284
dtype: int64

In [79]:
#Analysing benign_sample_diagnosis
diagnosis_data[diagnosis_data.benign_sample_diagnosis.isnull()==False].head(7)

Unnamed: 0,patient_cohort,sample_origin,age,sex,diagnosis,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
183,Cohort2,BPTB,32,F,2,Abdominal Pain,12.0,1.16493,5.417692,21.135,445.725,
184,Cohort2,BPTB,59,F,2,Abdominal Pain,,0.41847,0.526248,24.111,4.09,
185,Cohort2,BPTB,65,F,2,Abdominal Pain,,0.36192,2.343091,12.957,0.02823,
186,Cohort2,BPTB,39,F,2,Abdominal Pain,,0.89349,0.736352,21.55,105.943,
187,Cohort2,BPTB,65,F,2,Abdominal Pain,13.0,0.37323,0.228904,13.977,7.367,
188,Cohort2,BPTB,59,M,2,Abdominal Pain,49.0,0.74646,4.9695,59.362,489.465,
189,Cohort1,BPTB,52,M,2,Biliary Stricture (Secondary to Stent),31.62,0.375492,3.71148,215.57942,656.7924,1305.153


In [80]:
#There are many null values hence droping it
# diagnosis_data = diagnosis_data.drop(columns=['benign_sample_diagnosis'])
diagnosis_data = diagnosis_data.drop(columns=['benign_sample_diagnosis','plasma_CA19_9','REG1A'])

In [81]:
diagnosis_data.shape

(590, 9)

### Spliting data into train test

In [82]:
# Converting the main_train into X and y so that we can pass it onto train_test_split function
# ind --> contains the dataframe without the target i.e diagnosis which are the independent varibles
ind_diagnosis_data = diagnosis_data.drop('diagnosis',axis=1)
# dep --> contains only the target value 
dep_diagnosis_data = diagnosis_data['diagnosis']

## Feature Engeeneering

### Encoding

In [83]:
#Encoding diagnosis in such a way that people with pancreatic cancer is 1 else 0
L=len(dep_diagnosis_data)
for i in range(L):
    if dep_diagnosis_data[i]==3:
        dep_diagnosis_data[i]=1
    else:
        dep_diagnosis_data[i]=0
        

In [84]:
dep_diagnosis_data.tail()

585    1
586    1
587    1
588    1
589    1
Name: diagnosis, dtype: int64

### Label Encoding

In [85]:
sex = LabelEncoder()
sex.fit(ind_diagnosis_data['sex'])
ind_diagnosis_data['sex'] = sex.transform(ind_diagnosis_data['sex'])

In [86]:
ind_diagnosis_data.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,creatinine,LYVE1,REG1B,TFF1
0,Cohort1,BPTB,33,0,1.83222,0.893219,52.94884,654.282174
1,Cohort1,BPTB,81,0,0.97266,2.037585,94.46703,209.48825
2,Cohort2,BPTB,51,1,0.78039,0.145589,102.366,461.141
3,Cohort2,BPTB,61,1,0.70122,0.002805,60.579,142.95
4,Cohort2,BPTB,62,1,0.21489,0.00086,65.54,41.088


In [87]:
# We are keeping sample origin so that model get's data about diversity
# patient_cohort = LabelEncoder()
# patient_cohort.fit(ind_diagnosis_data['patient_cohort'])
# ind_diagnosis_data['patient_cohort'] = patient_cohort.transform(ind_diagnosis_data['patient_cohort'])

### Ordinal Encoding

In [88]:
#We are keeping sample origin so that model get's data about diversity
patient_cohort = OrdinalEncoder()
# patient_cohort.fit(ind_diagnosis_data['patient_cohort'])

ind_diagnosis_data["patient_cohort"] = patient_cohort.fit_transform(ind_diagnosis_data[["patient_cohort"]])
# ind_diagnosis_data['patient_cohort'] = patient_cohort.transform(ind_diagnosis_data['patient_cohort'])

In [89]:
sample_origin = OrdinalEncoder()
ind_diagnosis_data["sample_origin"] = sample_origin.fit_transform(ind_diagnosis_data[["sample_origin"]])

In [90]:
ind_diagnosis_data.tail()

Unnamed: 0,patient_cohort,sample_origin,age,sex,creatinine,LYVE1,REG1B,TFF1
585,1.0,0.0,68,1,0.52026,7.058209,156.241,525.178
586,1.0,0.0,71,0,0.85956,8.341207,16.915,245.947
587,1.0,0.0,63,1,1.36851,7.674707,289.701,537.286
588,1.0,0.0,75,0,1.33458,8.206777,205.93,722.523
589,0.0,0.0,74,1,1.50423,8.200958,411.938275,2021.321078


In [91]:
ind_diagnosis_data.nunique()

patient_cohort      2
sample_origin       4
age                63
sex                 2
creatinine        198
LYVE1             535
REG1B             580
TFF1              560
dtype: int64

In [92]:
ind_diagnosis_data.sample_origin.unique()

array([0., 2., 1., 3.])

In [93]:
sample_origin = OrdinalEncoder()
ind_diagnosis_data["sample_origin"] = sample_origin.fit_transform(ind_diagnosis_data[["sample_origin"]])

In [94]:
#Filling all null values with mean
for i in ind_diagnosis_data[2:]:
    if i!='sex':
        ind_diagnosis_data[i] = ind_diagnosis_data[i].fillna(ind_diagnosis_data[i].mean())
ind_diagnosis_data.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,creatinine,LYVE1,REG1B,TFF1
0,0.0,0.0,33,0,1.83222,0.893219,52.94884,654.282174
1,0.0,0.0,81,0,0.97266,2.037585,94.46703,209.48825
2,1.0,0.0,51,1,0.78039,0.145589,102.366,461.141
3,1.0,0.0,61,1,0.70122,0.002805,60.579,142.95
4,1.0,0.0,62,1,0.21489,0.00086,65.54,41.088


## Modeling

### SVM

In [95]:
from sklearn import svm
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm

clf = svm.SVC()

X_train, X_test, y_train, y_test = train_test_split(ind_diagnosis_data,dep_diagnosis_data,test_size=0.2)#between 80% and 20%

SVM = clf.fit(X_train, y_train)

score = SVM.score(X_test, y_test)
print('Test set score: {}'.format(score))

Test set score: 0.6610169491525424


In [96]:
predicte = clf.predict(X_test)

In [97]:
print("R2 score =", round(sm.r2_score(y_test, predicte), 2))
print("Accuracy= ", clf.score(X_test,y_test))

R2 score = -0.51
Accuracy=  0.6610169491525424


In [98]:
#clearly there is good accuracy
X_train.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,creatinine,LYVE1,REG1B,TFF1
11,1.0,0.0,71,1,1.05183,0.860337,243.082,608.284
295,0.0,0.0,46,0,0.368706,7.029876,0.6146,124.428225
561,1.0,0.0,75,1,0.73515,2.72438,183.879,1383.49
587,1.0,0.0,63,1,1.36851,7.674707,289.701,537.286
356,1.0,0.0,52,0,0.3393,0.001357,0.293,73.736


In [99]:
X_test.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,creatinine,LYVE1,REG1B,TFF1
407,0.0,1.0,69,0,0.5655,7.849105,267.85682,1063.949
514,0.0,2.0,77,0,0.21489,1.712249,20.536467,232.7158
32,1.0,0.0,44,0,1.17624,4.239308,7.62,436.361
540,0.0,1.0,62,1,0.75777,9.325072,246.6702,994.9241
116,0.0,0.0,51,1,0.84825,3.629087,2.917572,213.542133


In [100]:
dep_diagnosis_data.tail()

585    1
586    1
587    1
588    1
589    1
Name: diagnosis, dtype: int64

In [101]:
#prediction using svm
# clf.predict()
#passing x dataframe for prediction of y
diagnosis_predict_arr = clf.predict(X_test)

In [102]:
diagnosis_predict_arr

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [103]:
diagnosis_predict=  pd.DataFrame(zip(diagnosis_predict_arr,y_test),columns=['Predicted_diagnosis','Actuals'])

In [104]:
diagnosis_predict.head()

Unnamed: 0,Predicted_diagnosis,Actuals
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0


### Random Forest

In [105]:
from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=5000)
clf2.fit(X_train, y_train)

# Predicting the Validation Set Results
predicted3 = clf2.predict(X_test)

In [106]:
print("R2 score =", round(sm.r2_score(y_test, predicted3), 2))
print("Accuracy= ", clf2.score(X_test,y_test))

R2 score = 0.4
Accuracy=  0.3967771257435896


In [107]:
#prediction using random Forest
# clf.predict()
#passing x dataframe for prediction of y
diagnosis_predict_arr1 = clf2.predict(X_test)

In [108]:
diagnosis_predict_arr1

array([9.204e-01, 8.996e-01, 1.118e-01, 9.720e-01, 1.094e-01, 6.212e-01,
       1.400e-03, 4.930e-01, 4.818e-01, 7.020e-01, 5.100e-02, 9.284e-01,
       7.892e-01, 6.000e-04, 9.000e-03, 4.600e-01, 9.258e-01, 6.000e-03,
       2.912e-01, 6.200e-03, 8.176e-01, 7.400e-03, 9.760e-01, 2.400e-03,
       8.954e-01, 1.200e-03, 8.148e-01, 3.164e-01, 3.738e-01, 6.392e-01,
       6.200e-03, 2.110e-01, 7.716e-01, 3.524e-01, 5.846e-01, 4.052e-01,
       8.970e-01, 4.396e-01, 2.800e-03, 2.412e-01, 1.644e-01, 6.792e-01,
       2.000e-04, 0.000e+00, 8.020e-01, 7.400e-03, 9.400e-03, 3.880e-01,
       1.000e-03, 9.600e-01, 0.000e+00, 1.030e-01, 3.560e-02, 2.338e-01,
       4.000e-04, 1.520e-02, 1.450e-01, 2.000e-04, 3.242e-01, 4.056e-01,
       6.360e-01, 9.474e-01, 3.224e-01, 8.294e-01, 1.640e-02, 7.830e-01,
       0.000e+00, 6.164e-01, 3.760e-01, 1.040e-02, 9.934e-01, 3.646e-01,
       9.906e-01, 9.580e-02, 9.140e-02, 4.200e-03, 6.380e-02, 3.476e-01,
       4.340e-02, 8.592e-01, 1.804e-01, 4.000e-04, 

In [109]:
diagnosis_predict1=  pd.DataFrame(zip(diagnosis_predict_arr,y_test),columns=['Predicted_diagnosis','Actuals'])

In [110]:
diagnosis_predict1.tail()

Unnamed: 0,Predicted_diagnosis,Actuals
113,0,1
114,0,0
115,0,1
116,0,0
117,0,1


### XGBost Model

In [111]:
import xgboost as xgb
m2 = xgb.XGBRegressor()
m2.fit(X_train, y_train)

# Predicting the Validation Set Results
predicted2 = m2.predict(X_test)

In [112]:
print("R2 score =", round(sm.r2_score(y_test, predicted2), 2))
print("Accuracy= ", m2.score(X_test,y_test))

R2 score = 0.28
Accuracy=  0.2831137955984273


In [113]:
#prediction using xgboost
# clf.predict()
#passing x dataframe for prediction of y
diagnosis_predict_arr2 = m2.predict(X_test)

In [114]:
diagnosis_predict_arr2

array([ 9.17361677e-01,  9.91234839e-01, -8.12310353e-02,  1.11487305e+00,
       -1.19609579e-01,  5.73994398e-01,  3.61783803e-03,  6.30472004e-01,
        2.21369326e-01,  8.13714385e-01,  1.73800867e-02,  9.38611329e-01,
        7.60510385e-01, -6.72730654e-02,  2.49098539e-02,  6.83651805e-01,
        9.93966579e-01, -6.06727041e-02,  2.07651421e-01,  4.06320952e-02,
        1.13827050e+00, -6.65173866e-03,  1.02042711e+00, -3.64440642e-02,
        7.58900225e-01, -1.47325229e-02,  1.01293933e+00,  5.76128960e-01,
        2.95803219e-01,  7.08064675e-01, -4.10915818e-03,  4.50654536e-01,
        9.89740312e-01,  4.02446985e-01,  4.37508047e-01,  3.26011539e-01,
        1.13256311e+00,  5.14639497e-01, -4.39941371e-03, -1.80263549e-01,
        7.25687575e-03,  9.37608778e-01, -2.17715874e-02, -8.90978333e-03,
        9.42387283e-01, -3.83477062e-02,  5.90511002e-02,  2.78525621e-01,
       -2.61946768e-02,  9.66216743e-01,  3.20913666e-03, -2.40097046e-01,
        1.09366409e-01,  

In [115]:
diagnosis_predict2=  pd.DataFrame(zip(diagnosis_predict_arr,y_test),columns=['Predicted_diagnosis','Actuals'])

In [116]:
diagnosis_predict2.head()

Unnamed: 0,Predicted_diagnosis,Actuals
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0


### LinearRegression Model

In [117]:
# Fitting Simple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
m1 = LinearRegression()
m1.fit(X_train, y_train)

# Predicting the Test Set Results
predicted1 = m1.predict(X_test)
predicted1_for_test = m1.predict(X_test)

In [118]:
print("R2 score =", round(sm.r2_score(y_test, predicted1), 2))
print("Accuracy= ", m1.score(X_test,y_test))

R2 score = 0.39
Accuracy=  0.39482435555938283


In [119]:
#prediction using Leniar Regression
# clf.predict()
#passing x dataframe for prediction of y
diagnosis_predict_arr4 = m1.predict(X_test)

In [120]:
diagnosis_predict_arr4

array([ 0.83537411,  0.56047114,  0.12341482,  0.83384897,  0.29513734,
        0.31766829,  0.01542406,  0.30367836,  0.33417203,  0.61701089,
        0.2441484 ,  0.67289266,  0.57517064,  0.12309332,  0.13354389,
        0.16044191,  0.79300614,  0.0357853 ,  0.01195596,  0.12732103,
        0.71854614,  0.02624171,  0.90771444,  0.16907297,  1.20012703,
        0.03777961,  1.49381988,  0.05236513,  0.43550416,  0.51534366,
       -0.09464227, -0.15446372,  0.58719581,  0.3607321 ,  0.17170822,
        0.36287053,  0.48557629,  0.43295932,  0.19848016,  0.62986781,
        0.40183098,  0.45269434, -0.04705221,  0.09532692,  0.47143255,
       -0.07456031,  0.06980272,  0.51991995,  0.26684569,  1.14349202,
       -0.0132325 ,  0.21925871, -0.16008857, -0.14353055,  0.09712376,
        0.07940449,  0.35791352, -0.04613008,  0.47915739,  0.43919171,
        0.62619846,  1.18653997,  0.14143447,  0.60677234,  0.14646436,
        0.73268007,  0.08836858,  0.46883187,  0.39809071,  0.05

In [121]:
diagnosis_predict4=  pd.DataFrame(zip(diagnosis_predict_arr,y_test),columns=['Predicted_diagnosis','Actuals'])

In [122]:
diagnosis_predict4.head()

Unnamed: 0,Predicted_diagnosis,Actuals
0,0,1
1,0,1
2,0,0
3,0,1
4,0,0


### Prepocessing the data to pridict stage using trained model

In [123]:
#if stage.isnull is returning False,hence no null value else there is null value
stage_train_data = stage_data[stage_data.stage.isnull()==False]
stage_test_data = stage_data[stage_data.stage.isnull()]

###  Label Encoding

In [124]:
stage_label_enc = LabelEncoder()
stage_label_enc.fit(stage_train_data['stage'])
stage_train_data['stage'] = stage_label_enc.transform(stage_train_data['stage'])

### Spliting

In [125]:
# Converting the main_train into X and y so that we can pass it onto train_test_split function
# ind --> contains the dataframe without the target i.e diagnosis which are the independent varibles
ind_stage_data = stage_train_data.drop('stage',axis=1)
# dep --> contains only the target value 
dep_stage_data = stage_train_data['stage']

In [126]:
ind_stage_data.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
391,Cohort1,ESP,81,F,,,0.5655,12.01715,431.42253,874.0997,
392,Cohort1,LIV,57,M,,10.0,1.73043,2.628425,40.620818,529.984,155.27
393,Cohort1,BPTB,55,M,,11.0,0.47502,2.830541,33.40615,323.17577,210.185
394,Cohort2,BPTB,58,M,,11.0,0.44109,0.632433,188.253,138.63,
395,Cohort1,LIV,73,M,,3236.0,0.70122,12.24582,196.92183,1529.183124,432.917


In [127]:
#There are many null values hence droping it
ind_stage_data = ind_stage_data.drop(columns=['benign_sample_diagnosis'])

In [128]:
dep_stage_data.head()

391    0
392    1
393    1
394    1
395    2
Name: stage, dtype: int32

In [62]:
dep_stage_data.shape

(199,)

### Encoding

In [129]:
sample_origin = OrdinalEncoder()
ind_stage_data["sample_origin"] = sample_origin.fit_transform(ind_stage_data[["sample_origin"]])

In [130]:
sex = LabelEncoder()
sex.fit(ind_stage_data['sex'])
ind_stage_data['sex'] = sex.transform(ind_stage_data['sex'])

In [132]:
patient_cohort = OrdinalEncoder()
# patient_cohort.fit(ind_diagnosis_data['patient_cohort'])

ind_stage_data["patient_cohort"] = patient_cohort.fit_transform(ind_stage_data[["patient_cohort"]])

In [133]:
#Filling all null values with mean
for i in ind_stage_data[2:]:
    if i!='sex':
        ind_stage_data[i] = ind_stage_data[i].fillna(ind_stage_data[i].mean())
ind_stage_data.head()

Unnamed: 0,patient_cohort,sample_origin,age,sex,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
391,0.0,1.0,81,0,1476.154733,0.5655,12.01715,431.42253,874.0997,1138.323721
392,0.0,2.0,57,1,10.0,1.73043,2.628425,40.620818,529.984,155.27
393,0.0,0.0,55,1,11.0,0.47502,2.830541,33.40615,323.17577,210.185
394,1.0,0.0,58,1,11.0,0.44109,0.632433,188.253,138.63,1138.323721
395,0.0,2.0,73,1,3236.0,0.70122,12.24582,196.92183,1529.183124,432.917


### Traing Previously trained model on diagnoosis

### SVM Model

In [134]:
#Previously used SVM model
X_train, X_test, y_train, y_test = train_test_split(ind_stage_data,dep_stage_data,test_size=0.2)#between 80% and 20%
SVM = clf.fit(X_train, y_train)

score = SVM.score(X_test, y_test)
print('Test set score: {}'.format(score))

Test set score: 0.25


In [135]:
predicte = clf.predict(X_test)

In [136]:
print("R2 score =", round(sm.r2_score(y_test, predicte), 2))
print("Accuracy= ", clf.score(X_test,y_test))

R2 score = -0.59
Accuracy=  0.25


### Random Forest

In [145]:
clf2.fit(X_train, y_train)

# Predicting the Validation Set Results
stage_predicted3 = clf2.predict(X_test)

In [146]:
print("R2 score =", round(sm.r2_score(y_test, stage_predicted3), 2))
print("Accuracy= ", clf2.score(X_test,y_test))

R2 score = -0.11
Accuracy=  -0.11428487499999984


### XGBost Model

In [147]:
m2.fit(X_train, y_train)

# Predicting the Validation Set Results
predicted2 = m2.predict(X_test)

In [148]:
print("R2 score =", round(sm.r2_score(y_test, predicted2), 2))
print("Accuracy= ", m2.score(X_test,y_test))

R2 score = -0.41
Accuracy=  -0.4141040736965855


### LinearRegression Model

In [149]:
m1.fit(X_train, y_train)

# Predicting the Test Set Results
predicted1 = m1.predict(X_test)
predicted1_for_test = m1.predict(X_test)

In [150]:
print("R2 score =", round(sm.r2_score(y_test, predicted1), 2))
print("Accuracy= ", m1.score(X_test,y_test))

R2 score = -0.23
Accuracy=  -0.22628202368808273


## END

In [45]:
#prediction using random Forest
# clf.predict()
#passing x dataframe for prediction of y
diagnosis_predict_arr = clf2.predict(ind_diagnosis_data)

In [137]:
# diagnosis_predict_arr

In [47]:
# diagnosis_predict=  pd.DataFrame(zip(diagnosis_predict_arr),columns=['Predicted_diagnosis'])

In [48]:
# diagnosis_predict.shape

(590, 1)

In [52]:
# diagnosis_predict

Unnamed: 0,Predicted_diagnosis
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
585,0.9
586,0.4
587,1.0
588,0.9


#### Adding this Predicted_diagnosis in ind_stage_data

### Below here is just for Reference Don't run the code from here

In [None]:
# #if clear_date.isnull is returning False,hence no null value else there is null value
# final_data=data
# final_data = final_data[final_data.diagnosis.isnull()==False]
# final_test = final_data[final_data.diagnosis.isnull()]

In [47]:
# #if stage.isnull is returning False,hence no null value else there is null value
# final_train = data[data.stage.isnull()==False]
# final_test = data[data.stage.isnull()]

In [48]:
# final_train.isnull().sum()

sample_id                    0
patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
stage                        0
benign_sample_diagnosis    199
plasma_CA19_9               49
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                       59
dtype: int64

In [49]:
# final_test.isnull().sum()

sample_id                    0
patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
stage                      391
benign_sample_diagnosis    183
plasma_CA19_9              191
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                      225
dtype: int64

In [50]:
# final_train.shape

(199, 14)

In [51]:
# #Droping rows where plasma_CA19_9 and REG1A is null
# final_test = final_test[(final_test.plasma_CA19_9.isnull()==False)]

In [52]:
# final_test.isnull().sum()

sample_id                    0
patient_cohort               0
sample_origin                0
age                          0
sex                          0
diagnosis                    0
stage                      200
benign_sample_diagnosis     92
plasma_CA19_9                0
creatinine                   0
LYVE1                        0
REG1B                        0
TFF1                         0
REG1A                      125
dtype: int64

In [53]:
# final_test.shape

(200, 14)

### Spliting final_train into Train/Test

In [25]:
# # Converting the main_train into X and y so that we can pass it onto train_test_split function
# # ind --> contains the dataframe without the target i.e delay which are the independent varibles
# final_train=data
# ind = final_train.drop('diagnosis',axis=1)
# # dep --> contains only the target value 
# dep = final_train['diagnosis']