# Supervised learning predicting living status

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
lung_cancer = pd.read_csv('lung_cancer_all_dummified.csv')

In [3]:
lung_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2564 entries, 0 to 2563
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Study ID                   2564 non-null   int64  
 1   Cancer Type Detailed       2564 non-null   int64  
 2   Sex                        2564 non-null   int64  
 3   Age                        2564 non-null   int64  
 4   Smoking Status             2564 non-null   int64  
 5   Mutation Count             2564 non-null   int64  
 6   Fraction Genome Altered    2564 non-null   float64
 7   Overall Survival Status    2564 non-null   int64  
 8   Overall Survival (Months)  2564 non-null   float64
dtypes: float64(2), int64(7)
memory usage: 180.4 KB


In [4]:
lung_cancer.head()

Unnamed: 0,Study ID,Cancer Type Detailed,Sex,Age,Smoking Status,Mutation Count,Fraction Genome Altered,Overall Survival Status,Overall Survival (Months)
0,1,1,0,70,0,0,0.4565,0,0.0
1,1,1,0,81,0,0,0.0,0,23.98
2,1,1,0,67,0,289,0.2221,0,50.03
3,1,1,1,79,0,0,0.2362,1,3.98
4,1,1,0,68,0,1272,0.0854,0,19.94


### Supervised learning on living status

- Logistic regression
- SVM
- Decision Tree
- Random Forest

##### preprocess the data

In [5]:
lung_cancer_new = lung_cancer.drop(columns = ['Study ID'])

In [6]:
X = lung_cancer_new.iloc[:, 0:6]
y = lung_cancer_new['Overall Survival Status']

In [7]:
#Logistic Regression
# Set up function parameters for different cross validation strategies
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True) 
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)

In [8]:
#Step 1: Split the data into training and testing set
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

# randomly assign some data to the test-set and the rest to the training-set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state = 42) 


In [9]:
#StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

knn = KNeighborsClassifier()
k_range = list(range(1, 31))

param_grid = dict(n_neighbors=k_range)
kfold = KFold(n_splits = 10, shuffle = True, random_state = 5)
knn_grid = GridSearchCV(knn, param_grid, cv=kfold).fit(X_train_scaled, y_train)

print("Test set Score: {:.2f}".format(knn_grid.score(X_test_scaled, y_test)))
print("Best Parameter: {}".format(knn_grid.best_params_))

Test set Score: 0.90
Best Parameter: {'n_neighbors': 1}


In [11]:
from sklearn.metrics import confusion_matrix
predict_knn_grid = knn_grid.predict(X_test_scaled) #model_1 test data predictions go here
knn_model = confusion_matrix(y_test, predict_knn_grid) #model_1 confusion matrix on test data goes here
train_acc_knn = knn_grid.score(X_train_scaled,y_train)
test_acc_knn = knn_grid.score(X_test_scaled,y_test)
print("Training accuracy: ",train_acc_knn)
print("Testing  accuracy: ",test_acc_knn)
print(knn_model)


from sklearn.metrics import f1_score,recall_score,precision_score
import warnings
warnings.filterwarnings('ignore')

f1_knn = f1_score(y_test, predict_knn_grid)#model_1 f1 score on test data goes here
precision_knn = recall_score(y_test, predict_knn_grid)#model_1 precision score on test data goes here
recall_knn = precision_score(y_test, predict_knn_grid)#model_1 recall score on test data goes here

print("precision: ",precision_knn)
print("recall: ",recall_knn)
print("f1 score: ",f1_knn)



Training accuracy:  1.0
Testing  accuracy:  0.897489539748954
[[399  68]
 [ 30 459]]
precision:  0.9386503067484663
recall:  0.8709677419354839
f1 score:  0.9035433070866141


#### Logistic regression

In [12]:
# GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logreg_pipe = make_pipeline(StandardScaler(), LogisticRegression())
#print(logreg_pipe.steps) 
# Name of step = 'logisticregression' + __ + C

logreg_param_grid = {'logisticregression__C': np.linspace(1, 100, 100)}
logreg_grid = GridSearchCV(logreg_pipe, logreg_param_grid).fit(X_train, y_train)

print("Test set Score: {:.2f}".format(logreg_grid.score(X_test, y_test)))
print("Best Parameter: {}".format(logreg_grid.best_params_))

Test set Score: 0.58
Best Parameter: {'logisticregression__C': 1.0}


In [13]:
from sklearn.metrics import confusion_matrix
predict_logreg_grid = logreg_grid.predict(X_test) #model_1 test data predictions go here
logreg_model = confusion_matrix(y_test, predict_logreg_grid) #model_1 confusion matrix on test data goes here
train_acc_logreg = logreg_grid.score(X_train,y_train)
test_acc_logreg = logreg_grid.score(X_test,y_test)
print("Training accuracy: ",train_acc_logreg)
print("Testing  accuracy: ",test_acc_logreg)
print(logreg_model)


from sklearn.metrics import f1_score,recall_score,precision_score
import warnings
warnings.filterwarnings('ignore')

f1_logreg = f1_score(y_test, predict_logreg_grid)#model_1 f1 score on test data goes here
precision_logreg = recall_score(y_test, predict_logreg_grid)#model_1 precision score on test data goes here
recall_logreg = precision_score(y_test, predict_logreg_grid)#model_1 recall score on test data goes here

print("precision: ",precision_logreg)
print("recall: ",recall_logreg)
print("f1 score: ",f1_logreg)


Training accuracy:  0.6164574616457462
Testing  accuracy:  0.5805439330543933
[[245 222]
 [179 310]]
precision:  0.6339468302658486
recall:  0.5827067669172933
f1 score:  0.6072477962781586


#### Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

kfold = KFold(n_splits = 10, shuffle = True, random_state = 5)
max_depth = {'max_depth': np.arange(1, 20, 1), 'criterion': ['gini', 'entropy']}
tree = DecisionTreeClassifier()
grid_dec_tree = GridSearchCV(tree, param_grid = max_depth, cv = kfold)
grid_dec_tree.fit(X_train_scaled, y_train)
cv_dec_tree = cross_val_score(grid_dec_tree, X_train_scaled, y_train)

print("Decision Tree Model KFold cross validation average score is: {:.3f}".format(np.mean(cv_dec_tree)))
print("Best mean cross-validation score of decision tree: {:.3f}".format(grid_dec_tree.best_score_))
print("Best parameter of decision tree: {}".format(grid_dec_tree.best_params_))

Decision Tree Model KFold cross validation average score is: 0.877
Best mean cross-validation score of decision tree: 0.890
Best parameter of decision tree: {'criterion': 'gini', 'max_depth': 19}


In [15]:
from sklearn.metrics import confusion_matrix
predict_grid_dec_tree = grid_dec_tree.predict(X_test) #model_1 test data predictions go here
grid_dec_tree_model = confusion_matrix(y_test, predict_grid_dec_tree) #model_1 confusion matrix on test data goes here
train_acc_grid_dec_tree = grid_dec_tree.score(X_train_scaled,y_train)
test_acc_grid_dec_tree = grid_dec_tree.score(X_test_scaled,y_test)
print("Training accuracy: ",train_acc_grid_dec_tree)
print("Testing  accuracy: ",test_acc_grid_dec_tree)
print(grid_dec_tree_model)


from sklearn.metrics import f1_score,recall_score,precision_score
import warnings
warnings.filterwarnings('ignore')

f1_tree = f1_score(y_test, predict_grid_dec_tree)#model_1 f1 score on test data goes here
precision_tree = recall_score(y_test, predict_grid_dec_tree)#model_1 precision score on test data goes here
recall_tree = precision_score(y_test, predict_grid_dec_tree)#model_1 recall score on test data goes here

print("precision: ",precision_tree)
print("recall: ",recall_tree)
print("f1 score: ",f1_tree)

Training accuracy:  0.99302649930265
Testing  accuracy:  0.895397489539749
[[404  63]
 [423  66]]
precision:  0.13496932515337423
recall:  0.5116279069767442
f1 score:  0.21359223300970875


#### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

kfold = KFold(n_splits = 10, shuffle = True, random_state = 5)
max_depth = {'max_depth': np.arange(1, 20, 1)}
forest = RandomForestClassifier()
grid_forest = GridSearchCV(forest, param_grid = max_depth, cv = kfold)
grid_forest.fit(X_train_scaled, y_train)
cv_forest = cross_val_score(grid_forest, X_test_scaled, y_test)

print("Random Forest Model KFold cross validation average score is: {:.3f}".format(np.mean(cv_forest)))
print("Best mean cross-validation score of Random Forest: {:.3f}".format(grid_forest.best_score_))
print("Best parameter of Random Forest: {}".format(grid_forest.best_params_))

Random Forest Model KFold cross validation average score is: 0.734
Best mean cross-validation score of Random Forest: 0.913
Best parameter of Random Forest: {'max_depth': 19}


In [17]:
from sklearn.metrics import confusion_matrix
predict_cv_forest = grid_forest.predict(X_test_scaled) #model_1 test data predictions go here
grid_cv_forest = confusion_matrix(y_test, predict_cv_forest) #model_1 confusion matrix on test data goes here
train_acc_cv_forest = grid_forest.score(X_train_scaled,y_train)
test_acc_cv_forest = grid_forest.score(X_test_scaled,y_test)
print("Training accuracy: ",train_acc_cv_forest)
print("Testing  accuracy: ",test_acc_cv_forest)
print(grid_cv_forest)


from sklearn.metrics import f1_score,recall_score,precision_score
import warnings
warnings.filterwarnings('ignore')

f1_forest = f1_score(y_test, predict_cv_forest)#model_1 f1 score on test data goes here
precision_forest = recall_score(y_test, predict_cv_forest)#model_1 precision score on test data goes here
recall_forest = precision_score(y_test, predict_cv_forest)#model_1 recall score on test data goes here

print("precision: ",precision_forest)
print("recall: ",recall_forest)
print("f1 score: ",f1_forest)



Training accuracy:  1.0
Testing  accuracy:  0.9037656903765691
[[402  65]
 [ 27 462]]
precision:  0.9447852760736196
recall:  0.8766603415559773
f1 score:  0.9094488188976377


**Conclusion:**
- Random Forest has the highest score using the original model.
- The best parameter for random forest is when the max depth equals to 19.
- I just eliminated 'Study ID' and kept 'Cancer Type Detailed' to independent variables. I think Cancer Type may affect survival status. So I keep this variable.

#### Model Evaluation

In [21]:
from bokeh.io import output_notebook, show 
from bokeh.plotting import figure
from bokeh.layouts import gridplot
output_notebook()

In [24]:
def draw_one_roc_curve(fpr,tpr,thresholds,auc,model):
    #Set up the ColumnDataSource object
    from bokeh.models import LabelSet, ColumnDataSource,HoverTool
    import pandas as pd
    df_d = pd.DataFrame([fpr,tpr,thresholds]).transpose()
    df_d.columns = ["fpr","tpr","threshold"]
    source = ColumnDataSource(df_d)    
    
    
    # Create custom HoverTool -- we'll make one for each curve
    hover_ROC = HoverTool(names=['ROC'], tooltips=[("TPR", "@tpr"), 
                                                   ("FPR", "@fpr"), 
                                                   ("Thresh", "@threshold"),
                                                  ])

    # Create the tools
    p_tools_ROC = [hover_ROC, 'crosshair', 'zoom_in', 'zoom_out', 'save', 'reset', 'tap', 'box_zoom']

    p1 = figure(title="ROC Curve for "+model, tools=p_tools_ROC,x_range=(0,1),y_range=(0,1))

    p1.xaxis.axis_label = 'False Positive Rate' 
    p1.yaxis.axis_label = 'True Positive Rate'

    # plot curve and datapts
    p1.line('fpr', 'tpr', line_width=1, color="blue", source=source)
    p1.circle('fpr', 'tpr', size=3, color="orange", legend_label='auc='+auc, source=source, name='ROC')

    # Plot chance (tpr = fpr 45 degrees line)
    p1.line([0, 1], [0, 1], line_dash='dashed', line_width=0.5, color='black', name='Chance')

    # Keep the legend at the bottom 
    p1.legend.location = "bottom_right"
    
    #Return the figure
    return p1
    
    
def draw_roc_curves():
  
    #Get the predicted probabilities for each model
    #Note that we can't just use predictions because they will be 0,1 values
    from sklearn.model_selection import cross_val_predict
    predic_prob_model_1 = cross_val_predict(knn_grid,X_test_scaled,y_test,cv=5)
    predic_prob_model_2 = cross_val_predict(grid_forest,X_test_scaled,y_test,cv=5)

    #Get the AUC for each model
    from sklearn.metrics import roc_curve, roc_auc_score
    auc_m1 = roc_auc_score(y_test,predic_prob_model_1)
    auc_m2 = roc_auc_score(y_test,predic_prob_model_2)

    
    #Format auc to two decimal places
    auc_m1 = "%1.2f"%auc_m1
    auc_m2 = "%1.2f"%auc_m2
  
    #Using the predicted probabilities, get the roc curves
    #fpr = false positive rate
    #tpr = true positive rate
    #thresholds = threshold choices
    #The ROC curve reports the fpr and tpr for each chosen threshold
    fpr_m1,tpr_m1,thresholds_m1 = roc_curve(y_test,predic_prob_model_1)
    fpr_m2,tpr_m2,thresholds_m2 = roc_curve(y_test,predic_prob_model_2)

    #Draw the various ROC Curves
    p1=draw_one_roc_curve(fpr_m1,tpr_m1,thresholds_m1,auc_m1,"KNN Model")
    p2=draw_one_roc_curve(fpr_m2,tpr_m2,thresholds_m2,auc_m2,"Random Forest Model")
    

    #Set up the grid for all the curves
    grid = gridplot([[p1,p2]],sizing_mode="scale_both",merge_tools=True)

    #Show the curves
    show(grid)

#Call the function
draw_roc_curves()

In [25]:
def draw_one_PR_curve(precision,recall,thresholds,f1_score ,model):
    from bokeh.models import LabelSet, ColumnDataSource, Label
    import pandas as pd

    df_d = pd.DataFrame([recall,precision,thresholds]).transpose()
    df_d.columns = ["recall","precision","threshold"]

    source = ColumnDataSource(df_d)

    p_tools = ['crosshair', 'zoom_in', 'zoom_out', 'save', 'reset', 'tap', 'box_zoom']

    #Figure
    p = figure(title="PR Curve for "+model, tools=p_tools)
    p.xaxis.axis_label = 'threshold' 
    p.yaxis.axis_label = 'precision/recall'
    
    #Add lines for precision and recall
    p.line('threshold', 'precision', line_width=1, color="blue", source=source,legend_label="precision")
    p.line('threshold', 'recall', line_width=1, color="red", source=source,legend_label="recall")
    
    f1_label = Label(x=1.0, y=.70, x_units='screen', y_units='screen', text='F1 Score='+f1_score, render_mode='css',
      border_line_color='black', border_line_alpha=0.0,
      background_fill_color='white', background_fill_alpha=1.0)
    
    p.add_layout(f1_label)
   
    # legend location
    p.legend.location = "bottom_left"
    return p

def draw_pr_curves():
    #Get the predicted probabilities for each model
    #Note that we can't just use predictions because they will be 0,1 values
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import precision_recall_curve

    predic_prob_model_1 = cross_val_predict(knn_grid,X_test_scaled,y_test,cv=5)
    predic_prob_model_2 = cross_val_predict(grid_forest,X_test_scaled,y_test,cv=5)
    

    #Get precisions and recalls
    precision_1,recall_1,thresholds_1 = precision_recall_curve(y_test,predic_prob_model_1)
    precision_2,recall_2,thresholds_2 = precision_recall_curve(y_test,predic_prob_model_2)
    
    #draw the curves
    p1 = draw_one_PR_curve(precision_1,recall_1,thresholds_1,str("%1.2f"%f1_knn),"KNN Model")
    p2 = draw_one_PR_curve(precision_2,recall_2,thresholds_2,str("%1.2f"%f1_forest),"Random Forest Model")
 
    #Set up the grid for all the curves
    grid = gridplot([[p1,p2]],sizing_mode="scale_both",merge_tools=True)

    #Show the curves
    show(grid)

draw_pr_curves()