# Supervised learning predicting  cancer type detailed

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
lung_cancer = pd.read_csv('lung_cancer_all_dummified.csv')

In [3]:
lung_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2564 entries, 0 to 2563
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Study ID                   2564 non-null   int64  
 1   Cancer Type Detailed       2564 non-null   int64  
 2   Sex                        2564 non-null   int64  
 3   Age                        2564 non-null   int64  
 4   Smoking Status             2564 non-null   int64  
 5   Mutation Count             2564 non-null   int64  
 6   Fraction Genome Altered    2564 non-null   float64
 7   Overall Survival Status    2564 non-null   int64  
 8   Overall Survival (Months)  2564 non-null   float64
dtypes: float64(2), int64(7)
memory usage: 180.4 KB


In [4]:
lung_cancer.head()

Unnamed: 0,Study ID,Cancer Type Detailed,Sex,Age,Smoking Status,Mutation Count,Fraction Genome Altered,Overall Survival Status,Overall Survival (Months)
0,1,1,0,70,0,0,0.4565,0,0.0
1,1,1,0,81,0,0,0.0,0,23.98
2,1,1,0,67,0,289,0.2221,0,50.03
3,1,1,1,79,0,0,0.2362,1,3.98
4,1,1,0,68,0,1272,0.0854,0,19.94


- Logistic regression
- SVM
- Decision Tree
- Random Forest

#### Logistic regression

In [5]:
X = lung_cancer.iloc[:, 2:]
y = lung_cancer['Cancer Type Detailed']

In [7]:
#Logistic Regression
# Set up function parameters for different cross validation strategies
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True) 
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)

In [8]:
#Step 1: Split the data into training and testing set
from sklearn.model_selection import train_test_split

# randomly assign some data to the test-set and the rest to the training-set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 


In [9]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train, y_train)

print("Training set score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.2f}".format(logreg.score(X_test, y_test))) 

# Kfold Cross Validation
print("Mean Cross Validation, KFold: {:.2f}".format(np.mean(cross_val_score(logreg, X_train, y_train, cv=kfold))))

# Wine type prediction from test set (I'll use this later)
logreg_predicted_vals = logreg.predict(X_test)


Training set score: 0.72
Test set score: 0.70
Mean Cross Validation, KFold: 0.71


In [10]:
# GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logreg_pipe = make_pipeline(StandardScaler(), LogisticRegression())
#print(logreg_pipe.steps) 
# Name of step = 'logisticregression' + __ + C

logreg_param_grid = {'logisticregression__C': np.linspace(1, 100, 100)}
logreg_grid = GridSearchCV(logreg_pipe, logreg_param_grid).fit(X_train, y_train)

print("Test set Score: {:.2f}".format(logreg_grid.score(X_test, y_test)))
print("Best Parameter: {}".format(logreg_grid.best_params_))

Test set Score: 0.71
Best Parameter: {'logisticregression__C': 1.0}


#### SVM

In [11]:
from sklearn import svm

svc = svm.SVC()
svc.fit(X_train, y_train)
print("Training set score: {:.2f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.2f}".format(svc.score(X_test, y_test)))

# Kfold cross validation
print("Mean Cross-Validation, Kfold: {:.2f}".format(np.mean(cross_val_score(svc, X_train, y_train, scoring='r2', cv=kfold))))
svc_unscaled = np.mean(cross_val_score(svc, X_train, y_train, scoring='r2', cv=kfold))

Training set score: 0.66
Test set score: 0.64
Mean Cross-Validation, Kfold: -0.39


In [12]:
# GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

svc_pipe = make_pipeline(StandardScaler(), svm.SVC())

svc_param_grid = {'C': np.arange(0, 20, 1),
              'gamma': np.arange(0.01, 1, 0.05),
                 }

svc_grid = GridSearchCV(svc_pipe, svc_param_grid, cv=kfold).fit(X_train, y_train)

print("Test set Score: {:.2f}".format(svc_grid.score(X_test, y_test)))
#print("Best Cross-Validation Score: {:.2f}".format(svc_grid.best_score_))
print("Best Parameter: {}".format(svc_grid.best_params_))

ValueError: Invalid parameter C for estimator Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())]). Check the list of available parameters with `estimator.get_params().keys()`.

#### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

tree_class = DecisionTreeClassifier(random_state=0)
tree_class.fit(X_train, y_train)
tree_class.score(X_test, y_test)

0.8954758190327613

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

kfold = KFold(n_splits = 10, shuffle = True, random_state = 5)
max_depth = {'max_depth': np.arange(1, 20, 1)}
tree = DecisionTreeClassifier()
grid_dec_tree = GridSearchCV(tree, param_grid = max_depth, cv = kfold)
grid_dec_tree.fit(X_train, y_train)
cv_dec_tree = cross_val_score(grid_dec_tree, X_train, y_train)

print("Decision Tree Model KFold cross validation average score is: {:.3f}".format(np.mean(cv_dec_tree)))
print("Best mean cross-validation score of decision tree: {:.3f}".format(grid_dec_tree.best_score_))
print("Best parameter of decision tree: {}".format(grid_dec_tree.best_params_))

Decision Tree Model KFold cross validation average score is: 0.862
Best mean cross-validation score of decision tree: 0.881
Best parameter of decision tree: {'max_depth': 17}


#### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

forest_classifier = RandomForestClassifier(random_state=0)
forest_classifier.fit(X_train, y_train)
forest_classifier.score(X_test, y_test)

0.906396255850234

In [16]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

kfold = KFold(n_splits = 10, shuffle = True, random_state = 5)
max_depth = {'max_depth': np.arange(1, 20, 1)}
forest = RandomForestClassifier()
grid_forest = GridSearchCV(forest, param_grid = max_depth, cv = kfold)
grid_forest.fit(X_train, y_train)
cv_forest = cross_val_score(grid_forest, X_test, y_test)

print("Random Forest Model KFold cross validation average score is: {:.3f}".format(np.mean(cv_forest)))
print("Best mean cross-validation score of Random Forest: {:.3f}".format(grid_forest.best_score_))
print("Best parameter of Random Forest: {}".format(grid_forest.best_params_))

Random Forest Model KFold cross validation average score is: 0.789
Best mean cross-validation score of Random Forest: 0.901
Best parameter of Random Forest: {'max_depth': 16}


**Conclusion:**
- Random Forest has the highest score using the original model, but its cross validation score is not high, which means it might be unstable, so I might need to compare decision tree and random forest.
- The best parameter for random forest is when the max depth equals to 16.