# Driver Classification with Mutational Signature Exposure
- UCEC group 1 samples (356 samples: 311 with no mutations, 45 with driver mutations) are used for classifier training and cross-validation.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn

In [2]:
# Import required libraries for performance metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [3]:
# Import required libraries for machine learning classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model #LinearRegression()
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [4]:
# Scorers passed to cross_validate. Each key becomes a 'test_<key>' entry
# in the cross-validation results.

scoring = {
    metric_name: make_scorer(metric_function)
    for metric_name, metric_function in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
        ('MCC', matthews_corrcoef),
    ]
}

In [14]:
# Construct the classifiers compared in this notebook.

# Fixed seed for the estimators that use randomness while fitting, so the
# cross-validation scores are the same on every run.
RANDOM_STATE = 42

log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
svc_lin_model = LinearSVC(dual=False)
svc_model = SVC()  # library-default gamma, i.e. gamma='auto' is not used
knn_model = KNeighborsClassifier()
dtr_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rfc_model = RandomForestClassifier(random_state=RANDOM_STATE)
gnb_model = GaussianNB()

In [5]:
# Define the models evaluation function

def models_cross_validation(X, y, folds, models=None):
    """Cross-validate several classifiers and tabulate their mean test scores.

    Parameters
    ----------
    X : feature dataset.
    y : target dataset.
    folds : number of folds for cross-validation (passed to ``cross_validate``
        as ``cv``).
    models : optional mapping of column label -> estimator. When omitted, the
        eight module-level classifiers (``log_model``, ``lda_model``, ...)
        are evaluated, in the same order and under the same labels as before.

    Returns
    -------
    pandas.DataFrame
        One row per metric ('Accuracy', 'Precision', 'Recall', 'F1 Score',
        'MCC') and one column per model, holding the mean score over the
        folds. The extra 'Best Score' column holds the *label of the model*
        with the highest mean for that metric (ties go to the first column).

    Raises
    ------
    ValueError
        If ``models`` is given but empty.
    """
    if models is None:
        # Looked up at call time, so the classifiers only need to exist
        # when the function is called, not when it is defined.
        models = {
            'Logistic Regression': log_model,
            'Linear Discriminant': lda_model,
            'Support Vector Classifier (LinearSVC)': svc_lin_model,
            'Support Vector Classifier': svc_model,
            'KNeighbors Classifier': knn_model,
            'Decision Tree': dtr_model,
            'Random Forest': rfc_model,
            'Gaussian Naive Bayes': gnb_model,
        }
    if not models:
        raise ValueError("models must contain at least one estimator")

    # Row label in the output table -> key in the cross_validate result.
    metric_keys = {
        'Accuracy': 'test_accuracy',
        'Precision': 'test_precision',
        'Recall': 'test_recall',
        'F1 Score': 'test_f1_score',
        'MCC': 'test_MCC',
    }

    # Mean of every metric over the folds, one list per model.
    mean_scores = {}
    for label, estimator in models.items():
        cv_results = cross_validate(estimator, X, y, cv=folds, scoring=scoring)
        mean_scores[label] = [cv_results[key].mean() for key in metric_keys.values()]

    models_scores_table = pd.DataFrame(mean_scores, index=list(metric_keys))

    # Computed before the column is added, so only the numeric model
    # columns take part in the comparison.
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

    return models_scores_table

In [6]:
def calculate_zeros(file):
    """Return, per motif, the fraction of rows whose base counts are zero.

    Parameters
    ----------
    file : pandas.DataFrame with the columns 'motif',
        'bases_mutated_in_motif' and 'bases_not_mutated_in_motif'.

    Returns
    -------
    pandas.DataFrame
        Columns 'motif', 'bases_mutated_in_motif' and
        'bases_not_mutated_in_motif'. Each of the two value columns holds
        (number of rows equal to 0) / (number of non-null rows) for that
        motif. A motif with no zero rows gets 0.0; a motif whose values are
        all null gets NaN.
    """
    motif = file['motif']

    def _zero_fraction(column):
        # Count zeros over *all* rows of each motif. Filtering to the zero
        # rows first would drop motifs that have no zeros, and the aligned
        # division would then give NaN for them instead of 0.
        zero_counts = file[column].eq(0).groupby(motif).sum()
        valid_counts = file[column].groupby(motif).count()
        return (zero_counts / valid_counts).rename(column).reset_index()

    bases_mutated_motif = _zero_fraction('bases_mutated_in_motif')
    bases_not_mutated_motif = _zero_fraction('bases_not_mutated_in_motif')
    merged = bases_mutated_motif.merge(bases_not_mutated_motif, how='left', on='motif')

    return merged

In [8]:
# Load the training table. The cells below read its 'COSMICv3*' columns as
# features and 'ExpPOLEDriverLabel' as the target.
# Earlier input file, kept for reference:
# data = pd.read_csv('../ucec_classification.csv')
data = pd.read_csv("../scripts/classification_g1_sig.csv")

In [9]:
# Feature columns: every column whose name starts with 'COSMICv3'
sig_cols = [name for name in data.columns if name.startswith('COSMICv3')]

## Selected feature data

In [10]:
# Check which label values are present before encoding them as 0/1
data['ExpPOLEDriverLabel'].unique()

array(['No mutation', 'Driver'], dtype=object)

In [18]:
# Count the distinct tumor sample barcodes in the training table
data['Tumor_Sample_Barcode'].unique().size

356

In [11]:
# Encode the label as a binary target: 1 = driver, 0 = no mutation.
# The result is assigned back rather than calling replace(..., inplace=True)
# on data['ExpPOLEDriverLabel']: that chained in-place form acts on a
# temporary Series and does not update `data` under pandas Copy-on-Write.
# Re-running this cell is safe, since already-encoded values are left as is.
# astype(int) fails loudly if an unexpected label (or NaN) is present.
data['ExpPOLEDriverLabel'] = (
    data['ExpPOLEDriverLabel']
    .replace({'Driver': 1, 'No mutation': 0})
    .astype(int)
)

In [12]:
# Features: the 'COSMICv3*' columns, with missing values set to 0
sig_cols = [name for name in data.columns if name.startswith('COSMICv3')]
X = data.loc[:, sig_cols].fillna(0)

# Target: the binary driver label
y = data.loc[:, 'ExpPOLEDriverLabel']

In [16]:
# Score every classifier with 5-fold cross-validation on the signature features
models_cross_validation(X, y, 5)

Unnamed: 0,Logistic Regression,Linear Discriminant,Support Vector Classifier (LinearSVC),Support Vector Classifier,KNeighbors Classifier,Decision Tree,Random Forest,Gaussian Naive Bayes,Best Score
Accuracy,0.955086,0.988732,0.983099,0.988732,0.985915,0.985955,0.985994,0.966354,Linear Discriminant
Precision,1.0,1.0,0.975,1.0,0.977778,0.957778,1.0,0.977778,Logistic Regression
Recall,0.644444,0.911111,0.888889,0.911111,0.911111,0.933333,0.888889,0.755556,Decision Tree
F1 Score,0.782857,0.952941,0.929412,0.952941,0.942484,0.943722,0.938235,0.838492,Linear Discriminant
MCC,0.782498,0.948237,0.921461,0.948237,0.93573,0.936895,0.934313,0.837194,Linear Discriminant


## Predict driver status on non-driver samples with the best-performing classifier

In [19]:
## The nonlinear Support Vector Classifier and the Linear Discriminant perform the same in the
## table above, so SVC is chosen (for now) to predict the non-driver samples.

svc = cross_validate(svc_model, X, y, cv=5, scoring=scoring)

In [21]:
# Load the non-driver samples whose driver status will be predicted
data_nondriver = pd.read_csv("../scripts/classification_g1_nondriver_sig.csv")

In [22]:
# Same feature columns as the training set, with missing values set to 0
X_test = data_nondriver.loc[:, sig_cols].fillna(0)

# Labels as stored in the non-driver file (not re-encoded here)
y_test = data_nondriver.loc[:, 'ExpPOLEDriverLabel']

In [23]:
# Fit the SVC on all training samples before predicting the non-driver samples
svc_model.fit(X, y)

SVC()

In [24]:
# Predict driver status (1 = driver, 0 = no mutation) for the non-driver samples
y_pred = svc_model.predict(X_test)

In [25]:
# Display the predicted labels
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
# Barcodes of the non-driver samples that the classifier predicts as drivers
data_nondriver.loc[y_pred == 1, 'Tumor_Sample_Barcode'].to_numpy()

array(['TCGA-AX-A0J1-01A-11W-A062-09', 'TCGA-AX-A1CE-01A-11D-A135-09',
       'TCGA-AX-A2HD-01A-21D-A17D-09', 'TCGA-B5-A1MR-01A-31D-A14G-09'],
      dtype=object)

In [27]:
# Barcodes of the non-driver samples that the classifier predicts as no mutation
data_nondriver.loc[y_pred == 0, 'Tumor_Sample_Barcode'].to_numpy()

array(['TCGA-A5-A0GB-01A-11W-A062-09', 'TCGA-A5-A0VP-01A-21D-A10B-09',
       'TCGA-A5-A2K7-01A-11D-A17W-09', 'TCGA-A5-A7WJ-01A-12D-A34Q-09',
       'TCGA-A5-AB3J-01A-11D-A403-09', 'TCGA-AJ-A3BH-01A-11D-A19Y-09',
       'TCGA-AP-A054-01A-11W-A062-09', 'TCGA-AP-A1DK-01A-11D-A135-09',
       'TCGA-AP-A1DM-01A-21D-A135-09', 'TCGA-AX-A1C9-01A-11D-A135-09',
       'TCGA-AX-A1CF-01A-11D-A135-09', 'TCGA-AX-A2HA-01A-12D-A18P-09',
       'TCGA-B5-A11H-01A-11D-A122-09', 'TCGA-B5-A11Y-01A-21D-A10M-09',
       'TCGA-B5-A1MX-01A-11D-A142-09', 'TCGA-B5-A5OC-01A-21D-A27P-09',
       'TCGA-BG-A0LX-01A-11W-A062-09', 'TCGA-BG-A18A-01A-21D-A12J-09',
       'TCGA-BG-A221-01A-21D-A159-09', 'TCGA-BG-A222-01A-11D-A159-09',
       'TCGA-BG-A2L7-01A-11D-A18P-09', 'TCGA-BS-A0UM-01A-11W-A10C-09',
       'TCGA-D1-A167-01A-11D-A12J-09', 'TCGA-D1-A16N-01A-11D-A12J-09',
       'TCGA-D1-A17F-01A-11D-A12J-09', 'TCGA-DF-A2KN-01A-11D-A17W-09',
       'TCGA-E6-A2P9-01A-11D-A19Y-09', 'TCGA-EY-A1GK-01A-11D-A13L-09',
      