### Model training
1. Split data into 70-30 for training/ validation set
2. Conduct feature engineering (e.g., adding new columns/ one hot encoding) on training and testing set **after** splitting of data to prevent data leakage
3. Using training set, do cross fold validation to select the best model (e.g., adaboost vs xgboost vs logistic regression) and best parameters
4. Using the best model obtained from step 2, train the best model using the full training dataset and validate using the validation dataset - tune the model using the validation dataset
5. Use the tuned model to predict on the unseen dataset that will be given and produce the dataset in the required output format

Note:
- When dealing with only drach sites, comment out the codes that talk about `Relative Position`
- When dealing with drach + non-drach sites, 2 approaches can be considered:
    - Consider `Relative Position` 
    - Do not consider relative positions - remember to drop `Position` variable

### Evaluation criteria:
Model will be evaluated on ROC AUC and PR AUC of validation dataset

**Make sure that ROC AUC is greater than 0.5 and PR AUC is above 0.04**

In [1]:
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_curve, auc
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

# Version 1 - Contains both DRACH and non-DRACH sites
# df = pd.read_csv('./Data/full_df trial.csv')

# Version 2 - Contains only DRACH sites
df = pd.read_csv('./Data/full_df mastercopy.csv')
labels = pd.read_csv('./Data/data.info')
df = df.merge(labels, how='inner', left_on = ['Transcript Name', 'Position'], right_on = ['transcript_id', 'transcript_position'])
df['Label'] = df['label']
df = df.drop(['label', 'transcript_id', 'transcript_position', 'gene_id'], axis=1)

df

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Label
0,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.0,0.007970,0
1,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.0,0.005885,0
2,ENST00000000233,ENSG00000004059,316,AAACA,3.642703,98.947027,0.007416,3.780,99.5,0.006310,0
3,ENST00000000233,ENSG00000004059,332,GAACA,2.899200,97.836500,0.008632,2.635,97.5,0.007320,0
4,ENST00000000233,ENSG00000004059,368,GGACA,5.870303,121.954545,0.011479,5.660,122.0,0.010500,0
...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,ENSG00000167747,1348,GGACA,4.929726,116.342466,0.007300,4.580,116.0,0.005310,1
121834,ENST00000641834,ENSG00000167747,1429,TGACA,9.105797,114.927536,0.010305,9.140,116.0,0.009600,0
121835,ENST00000641834,ENSG00000167747,1531,GGACA,4.759688,113.562500,0.006877,4.440,114.0,0.005725,1
121836,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.0,0.006810,0


In [2]:
# Obtain all unique gene names in the dataframe
gene_names = []
for i in df['Gene Name']:
    if i not in gene_names:
        gene_names.append(i)

# Split dataset into training and validation data (70/30 ratio) based on the Gene name
random.seed(4262)
training_genes = random.sample(gene_names, int(0.7 * len(gene_names)) ) # sample 70% of genes to be used in training
validation_genes = list(set(gene_names) - set(training_genes)) # remainder of genes that are not sampled will be used in validation
training_data = df[df['Gene Name'].isin(training_genes)]
validation_data = df[df['Gene Name'].isin(validation_genes)]

training_y = training_data['Label'].reset_index(drop=True)
training_X = training_data.drop(['Label'], axis=1).reset_index(drop=True)
validation_y = validation_data['Label'].reset_index(drop=True)
validation_X = validation_data.drop(['Label'], axis=1).reset_index(drop=True)

# One hot encoding for train and test set
training_X_dummies = pd.get_dummies(training_X['Bases'], drop_first=True)
training_X = pd.concat([training_X, training_X_dummies],axis=1)
validation_X_dummies = pd.get_dummies(validation_X['Bases'], drop_first=True)
validation_X = pd.concat([validation_X, validation_X_dummies],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,...,GAACT,GGACA,GGACC,GGACT,TAACA,TAACC,TAACT,TGACA,TGACC,TGACT
0,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,...,0,0,0,0,0,0,0,0,0,0
2,ENST00000000233,ENSG00000004059,316,AAACA,3.642703,98.947027,0.007416,3.780,99.50,0.006310,...,0,0,0,0,0,0,0,0,0,0
3,ENST00000000233,ENSG00000004059,332,GAACA,2.899200,97.836500,0.008632,2.635,97.50,0.007320,...,0,0,0,0,0,0,0,0,0,0
4,ENST00000000233,ENSG00000004059,368,GGACA,5.870303,121.954545,0.011479,5.660,122.00,0.010500,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85761,ENST00000641784,ENSG00000284707,3122,TGACC,6.736167,118.050000,0.007801,6.365,117.00,0.007165,...,0,0,0,0,0,0,0,0,1,0
85762,ENST00000641784,ENSG00000284707,3142,TGACC,8.877377,119.721311,0.007614,8.880,120.00,0.006100,...,0,0,0,0,0,0,0,0,1,0
85763,ENST00000641784,ENSG00000284707,3224,GGACT,4.507167,124.466667,0.008247,4.420,125.00,0.006310,...,0,0,0,1,0,0,0,0,0,0
85764,ENST00000641784,ENSG00000284707,3243,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580,...,0,0,0,0,0,0,0,0,0,0


In [3]:
validation_X

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,...,GAACT,GGACA,GGACC,GGACT,TAACA,TAACC,TAACT,TGACA,TGACC,TGACT
0,ENST00000000412,ENSG00000003056,355,AAACT,2.608600,106.584000,0.007782,2.635,108.0,0.006535,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000412,ENSG00000003056,367,GGACC,6.045319,122.489362,0.007403,5.760,123.0,0.006930,...,0,0,1,0,0,0,0,0,0,0
2,ENST00000000412,ENSG00000003056,496,GGACT,5.986667,125.666667,0.009377,6.050,126.0,0.008300,...,0,0,0,1,0,0,0,0,0,0
3,ENST00000000412,ENSG00000003056,501,GGACT,5.950893,123.821429,0.010334,5.260,124.0,0.009180,...,0,0,0,1,0,0,0,0,0,0
4,ENST00000000412,ENSG00000003056,547,AGACA,5.005962,123.750000,0.009155,4.630,124.0,0.008235,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36067,ENST00000641834,ENSG00000167747,1348,GGACA,4.929726,116.342466,0.007300,4.580,116.0,0.005310,...,0,1,0,0,0,0,0,0,0,0
36068,ENST00000641834,ENSG00000167747,1429,TGACA,9.105797,114.927536,0.010305,9.140,116.0,0.009600,...,0,0,0,0,0,0,0,1,0,0
36069,ENST00000641834,ENSG00000167747,1531,GGACA,4.759688,113.562500,0.006877,4.440,114.0,0.005725,...,0,1,0,0,0,0,0,0,0,0
36070,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.0,0.006810,...,0,0,0,0,0,0,0,0,1,0


### Feature engineering functions

In [4]:
# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a positive sites - as such, I will use a greater weight when those bases are present
def indicate_top_bases(bases):
    if bases in ['GGACT', 'GAACT']:
        return 2
    elif bases in ['GGACA', 'GGACC', 'AGACT', 'TGACT']:
        return 1
    return 0

# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a negative sites - as such, I will use a smaller weight when those bases are present
def indicate_bottom_bases(bases):
    if bases in ['AAACT', 'GAACA', 'TGACA', 'AGACA', 'TGACC', 'GGACA']:
        return -1
    return 0

# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c=0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
    return a,t,c

In [5]:
training_X['Top Bases Indicator'] = training_X['Bases'].apply(indicate_top_bases)
training_X['Bottom Bases Indicator'] = training_X['Bases'].apply(indicate_bottom_bases)
training_X['Combined Bases Indicator'] = training_X['Top Bases Indicator'] + training_X['Bottom Bases Indicator']
training_X['Count_A'], training_X['Count_T'], training_X['Count_C'] = zip(*training_X['Bases'].apply(count_bases))
training_X = training_X.drop(['Bases'],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACC,...,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Combined Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000233,ENSG00000004059,244,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0,...,0,0,0,0,0,0,0,2,0,2
1,ENST00000000233,ENSG00000004059,261,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0,...,0,0,0,0,0,-1,-1,3,1,1
2,ENST00000000233,ENSG00000004059,316,3.642703,98.947027,0.007416,3.780,99.50,0.006310,0,...,0,0,0,0,0,0,0,4,0,1
3,ENST00000000233,ENSG00000004059,332,2.899200,97.836500,0.008632,2.635,97.50,0.007320,0,...,0,0,0,0,0,-1,-1,3,0,1
4,ENST00000000233,ENSG00000004059,368,5.870303,121.954545,0.011479,5.660,122.00,0.010500,0,...,0,0,0,0,1,-1,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85761,ENST00000641784,ENSG00000284707,3122,6.736167,118.050000,0.007801,6.365,117.00,0.007165,0,...,0,0,1,0,0,-1,-1,1,1,2
85762,ENST00000641784,ENSG00000284707,3142,8.877377,119.721311,0.007614,8.880,120.00,0.006100,0,...,0,0,1,0,0,-1,-1,1,1,2
85763,ENST00000641784,ENSG00000284707,3224,4.507167,124.466667,0.008247,4.420,125.00,0.006310,0,...,0,0,0,0,2,0,2,1,1,1
85764,ENST00000641784,ENSG00000284707,3243,2.998125,97.346875,0.007648,2.675,97.60,0.006580,0,...,0,0,0,0,0,-1,-1,3,0,1


In [6]:
validation_X['Top Bases Indicator'] = validation_X['Bases'].apply(indicate_top_bases)
validation_X['Bottom Bases Indicator'] = validation_X['Bases'].apply(indicate_bottom_bases)
validation_X['Combined Bases Indicator'] = validation_X['Top Bases Indicator'] + validation_X['Bottom Bases Indicator']
validation_X['Count_A'], validation_X['Count_T'], validation_X['Count_C'] = zip(*validation_X['Bases'].apply(count_bases)) 
validation_X = validation_X.drop(['Bases'],axis=1)

validation_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACC,...,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Combined Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000412,ENSG00000003056,355,2.608600,106.584000,0.007782,2.635,108.0,0.006535,0,...,0,0,0,0,0,-1,-1,3,1,1
1,ENST00000000412,ENSG00000003056,367,6.045319,122.489362,0.007403,5.760,123.0,0.006930,0,...,0,0,0,0,1,0,1,1,0,2
2,ENST00000000412,ENSG00000003056,496,5.986667,125.666667,0.009377,6.050,126.0,0.008300,0,...,0,0,0,0,2,0,2,1,1,1
3,ENST00000000412,ENSG00000003056,501,5.950893,123.821429,0.010334,5.260,124.0,0.009180,0,...,0,0,0,0,2,0,2,1,1,1
4,ENST00000000412,ENSG00000003056,547,5.005962,123.750000,0.009155,4.630,124.0,0.008235,0,...,0,0,0,0,0,-1,-1,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36067,ENST00000641834,ENSG00000167747,1348,4.929726,116.342466,0.007300,4.580,116.0,0.005310,0,...,0,0,0,0,1,-1,0,2,0,1
36068,ENST00000641834,ENSG00000167747,1429,9.105797,114.927536,0.010305,9.140,116.0,0.009600,0,...,0,1,0,0,0,-1,-1,2,1,1
36069,ENST00000641834,ENSG00000167747,1531,4.759688,113.562500,0.006877,4.440,114.0,0.005725,0,...,0,0,0,0,1,-1,0,2,0,1
36070,ENST00000641834,ENSG00000167747,1537,6.552982,123.263158,0.007419,5.790,124.0,0.006810,0,...,0,0,1,0,0,-1,-1,1,1,2


### Feature selection using Pearson Correlation

In [7]:
df_cor = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
cor_matrix = df_cor.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)] # Drop features if correlation is > 0.75
df_cor = df_cor.drop(df_cor[to_drop], axis=1)
df_cor

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,Position,Mean SD,Mean_Mean,Mean Dwelling Time,AAACC,AAACT,AGACA,AGACC,AGACT,GAACA,...,TAACC,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,244,7.382162,125.913513,0.009373,0,0,0,1,0,0,...,0,0,0,0,0,0,0,2,0,2
1,261,3.226535,107.889535,0.006813,0,1,0,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
2,316,3.642703,98.947027,0.007416,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,1
3,332,2.899200,97.836500,0.008632,0,0,0,0,0,1,...,0,0,0,0,0,0,-1,3,0,1
4,368,5.870303,121.954545,0.011479,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85761,3122,6.736167,118.050000,0.007801,0,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85762,3142,8.877377,119.721311,0.007614,0,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85763,3224,4.507167,124.466667,0.008247,0,0,0,0,0,0,...,0,0,0,0,0,2,0,1,1,1
85764,3243,2.998125,97.346875,0.007648,0,0,0,0,0,1,...,0,0,0,0,0,0,-1,3,0,1


In [8]:
# Drop Position variable if not dealing with relative positions 
training_X = training_X.drop(training_X[to_drop], axis=1).drop(['Position'],axis=1)
validation_X = validation_X.drop(validation_X[to_drop], axis=1).drop(['Position'],axis=1)
training_X

Unnamed: 0,Transcript Name,Gene Name,Mean SD,Mean_Mean,Mean Dwelling Time,AAACC,AAACT,AGACA,AGACC,AGACT,...,TAACC,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000233,ENSG00000004059,7.382162,125.913513,0.009373,0,0,0,1,0,...,0,0,0,0,0,0,0,2,0,2
1,ENST00000000233,ENSG00000004059,3.226535,107.889535,0.006813,0,1,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
2,ENST00000000233,ENSG00000004059,3.642703,98.947027,0.007416,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,1
3,ENST00000000233,ENSG00000004059,2.899200,97.836500,0.008632,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
4,ENST00000000233,ENSG00000004059,5.870303,121.954545,0.011479,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85761,ENST00000641784,ENSG00000284707,6.736167,118.050000,0.007801,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85762,ENST00000641784,ENSG00000284707,8.877377,119.721311,0.007614,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85763,ENST00000641784,ENSG00000284707,4.507167,124.466667,0.008247,0,0,0,0,0,...,0,0,0,0,0,2,0,1,1,1
85764,ENST00000641784,ENSG00000284707,2.998125,97.346875,0.007648,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1


In [9]:
training_X.columns

Index(['Transcript Name', 'Gene Name', 'Mean SD', 'Mean_Mean',
       'Mean Dwelling Time', 'AAACC', 'AAACT', 'AGACA', 'AGACC', 'AGACT',
       'GAACA', 'GAACC', 'GAACT', 'GGACA', 'GGACC', 'GGACT', 'TAACA', 'TAACC',
       'TAACT', 'TGACA', 'TGACC', 'TGACT', 'Top Bases Indicator',
       'Bottom Bases Indicator', 'Count_A', 'Count_T', 'Count_C'],
      dtype='object')

### K-fold validation 
Will be performing k cross-fold validation where k=10 using training dataset

In [10]:
# Break genes of training dataset up into n chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [11]:
# # Obtain all unique genes in training dataset
# unique_genes_training = []
# for i in training_X['Gene Name']:
#     if i not in unique_genes_training:
#         unique_genes_training.append(i)
# unique_genes_training2 = unique_genes_training.copy()        
# unique_genes_training

In [12]:
# random.seed(4262)
# random.shuffle(unique_genes_training)
# cross_val_data = chunks(unique_genes_training, round(len(unique_genes_training)/10)) # Will be using k=10 folds
# cross_val_data = list(cross_val_data)

# # Check to see what the cross_val_data looks like
# for i in cross_val_data:
#     print(i)
#     print(len(i))
#     print('')

In [13]:
training_X

Unnamed: 0,Transcript Name,Gene Name,Mean SD,Mean_Mean,Mean Dwelling Time,AAACC,AAACT,AGACA,AGACC,AGACT,...,TAACC,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000233,ENSG00000004059,7.382162,125.913513,0.009373,0,0,0,1,0,...,0,0,0,0,0,0,0,2,0,2
1,ENST00000000233,ENSG00000004059,3.226535,107.889535,0.006813,0,1,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
2,ENST00000000233,ENSG00000004059,3.642703,98.947027,0.007416,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,1
3,ENST00000000233,ENSG00000004059,2.899200,97.836500,0.008632,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
4,ENST00000000233,ENSG00000004059,5.870303,121.954545,0.011479,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85761,ENST00000641784,ENSG00000284707,6.736167,118.050000,0.007801,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85762,ENST00000641784,ENSG00000284707,8.877377,119.721311,0.007614,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2
85763,ENST00000641784,ENSG00000284707,4.507167,124.466667,0.008247,0,0,0,0,0,...,0,0,0,0,0,2,0,1,1,1
85764,ENST00000641784,ENSG00000284707,2.998125,97.346875,0.007648,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1


In [14]:
# roc_auc_scores = []
# pr_auc_scores = []
# for i in range(len(cross_val_data)):
#     print(cross_val_data[i]) # The set that is selected will be used for validation
#     # Train test split on training set
#     training_cross_val = training_data[~training_data['Gene Name'].isin(cross_val_data[i])]
#     validation_cross_val = training_data[training_data['Gene Name'].isin(cross_val_data[i])]
# #     print(validation_cross_val)
    
#     training_cross_val_y = training_cross_val['Label'].reset_index(drop=True)
#     training_cross_val_X = training_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
#     validation_cross_val_y = validation_cross_val['Label'].reset_index(drop=True)
#     validation_cross_val_X = validation_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
    
#     # Feature engineering
#     training_cross_val_X['Top Bases Indicator'] = training_cross_val_X['Bases'].apply(indicate_top_bases)
#     training_cross_val_X['Bottom Bases Indicator'] = training_cross_val_X['Bases'].apply(indicate_bottom_bases)
#     training_cross_val_X['Combined Bases Indicator'] = training_cross_val_X['Top Bases Indicator'] + training_cross_val_X['Bottom Bases Indicator']
#     training_cross_val_X['Count_A'], training_cross_val_X['Count_T'], training_cross_val_X['Count_C'] = zip(*training_cross_val_X['Bases'].apply(count_bases)) 
#     training_cross_val_dummies = pd.get_dummies(training_cross_val_X['Bases'], drop_first=True)
#     training_cross_val_X = pd.concat([training_cross_val_X, training_cross_val_dummies],axis=1).drop(['Bases'],axis=1)

#     validation_cross_val_X['Top Bases Indicator'] = validation_cross_val_X['Bases'].apply(indicate_top_bases)
#     validation_cross_val_X['Bottom Bases Indicator'] = validation_cross_val_X['Bases'].apply(indicate_bottom_bases)
#     validation_cross_val_X['Combined Bases Indicator'] = validation_cross_val_X['Top Bases Indicator'] + validation_cross_val_X['Bottom Bases Indicator']
#     validation_cross_val_X['Count_A'], validation_cross_val_X['Count_T'], validation_cross_val_X['Count_C'] = zip(*validation_cross_val_X['Bases'].apply(count_bases)) 
#     validation_cross_val_dummies = pd.get_dummies(validation_cross_val_X['Bases'], drop_first=True)
#     validation_cross_val_X = pd.concat([validation_cross_val_X, validation_cross_val_dummies],axis=1).drop(['Bases'],axis=1)

#     # Drop Position variable when not dealing with relative positions
#     training_cross_val_X = training_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1).drop(['Position'],axis=1)
#     validation_cross_val_X = validation_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1).drop(['Position'],axis=1)
#     print(validation_cross_val_X)
    
#     # First apply SMOTE to bring minority class distribution to 10% of majority class then use RandomUnderSampler to bring majority class down
#     # to 50 percent more than minority class before fitting model
#     over = SMOTE(sampling_strategy=0.1)
#     rus = RandomUnderSampler(random_state=4262, sampling_strategy = 0.5)
#     training_cross_val_X, training_cross_val_y = over.fit_resample(training_cross_val_X, training_cross_val_y)
#     training_cross_val_X, training_cross_val_y = rus.fit_resample(training_cross_val_X, training_cross_val_y)
    
#     # Fitting ML model
#     random.seed(4262)
#     model = GradientBoostingClassifier(random_state=4262, max_depth=5)
#     model.fit(training_cross_val_X, training_cross_val_y)
#     y_pred = model.predict_proba(validation_cross_val_X)
#     roc_auc_scores.append(roc_auc_score(validation_cross_val_y, y_pred[:,1], average=None))

#     # Data to plot precision - recall curve
#     precision, recall, thresholds = precision_recall_curve(validation_cross_val_y, y_pred[:,1])
#     # Use AUC function to calculate the area under the curve of precision recall curve
#     auc_precision_recall = auc(recall, precision)
#     pr_auc_scores.append(auc_precision_recall)
    
# print('Mean roc auc score is', np.mean(roc_auc_scores))
# print('Mean pr auc score is', np.mean(auc_precision_recall))

### Manually Combine SMOTE and Random Undersampling
https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/

*Note that smote and random undersampling will be applied to training dataset only*

In [15]:
training_X = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
validation_X = validation_X.drop(['Transcript Name', 'Gene Name'], axis=1)

In [16]:
validation_X

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,AAACC,AAACT,AGACA,AGACC,AGACT,GAACA,GAACC,...,TAACC,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,2.608600,106.584000,0.007782,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
1,6.045319,122.489362,0.007403,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,2
2,5.986667,125.666667,0.009377,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,1,1,1
3,5.950893,123.821429,0.010334,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,1,1,1
4,5.005962,123.750000,0.009155,0,0,1,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36067,4.929726,116.342466,0.007300,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
36068,9.105797,114.927536,0.010305,0,0,0,0,0,0,0,...,0,0,1,0,0,0,-1,2,1,1
36069,4.759688,113.562500,0.006877,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
36070,6.552982,123.263158,0.007419,0,0,0,0,0,0,0,...,0,0,0,1,0,0,-1,1,1,2


In [17]:
validation_y

0        0
1        0
2        0
3        0
4        0
        ..
36067    1
36068    0
36069    1
36070    0
36071    0
Name: Label, Length: 36072, dtype: int64

In [18]:
# First apply SMOTE to bring minority class distribution to 10% of majority class then use RandomUnderSampler to bring majority class down
# to 50 percent more than minority class before fitting model
over = SMOTE(sampling_strategy=0.1)
rus = RandomUnderSampler(random_state=4262, sampling_strategy = 0.5)
training_X, training_y = over.fit_resample(training_X, training_y)
training_X, training_y = rus.fit_resample(training_X, training_y)
training_X

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,AAACC,AAACT,AGACA,AGACC,AGACT,GAACA,GAACC,...,TAACC,TAACT,TGACA,TGACC,TGACT,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,6.836585,118.317073,0.010537,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
1,3.764595,103.418919,0.007741,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
2,3.034091,100.018182,0.006734,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,1
3,7.098623,123.579710,0.006487,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,2,1
4,4.036018,94.738957,0.007145,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24622,5.690057,120.217398,0.011651,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,2
24623,3.550719,94.278611,0.006676,0,0,0,0,0,1,0,...,0,0,0,0,0,0,-1,3,0,1
24624,6.542863,115.177511,0.007178,0,0,0,0,0,0,0,...,0,0,1,0,0,0,-1,2,1,1
24625,5.138790,114.222913,0.007493,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,2


### Model training for solely DRACH sites

In [19]:
random.seed(4262)
clf = GradientBoostingClassifier(random_state=4262, max_depth=5)  # Change model here - e.g., AdaBoostClassifier, RandomForestClassifier
clf.fit(training_X, training_y)
y_pred = clf.predict_proba(validation_X)
print('ROC_AUC score is', roc_auc_score(validation_y, y_pred[:,1], average=None))
# ROC_AUC score is 0.8697662438965525

ROC_AUC score is 0.8699131113439433


In [20]:
# Data to plot precision - recall curve
precision, recall, thresholds = precision_recall_curve(validation_y, y_pred[:,1])
# Use AUC function to calculate the area under the curve of precision recall curve
auc_precision_recall = auc(recall, precision)
print('PR AUC score is', auc_precision_recall)
# PR AUC score is 0.42662665822205936

PR AUC score is 0.42590716045151106


In [29]:
# import pickle
# pickle.dump(clf, open('gradientboostingclassifier.sav', 'wb'))

### Model training for Non-DRACH + DRACH sites

In [22]:
# random.seed(4262)
# clf = GradientBoostingClassifier(random_state=4262, max_depth=5)  # Change model here - e.g., AdaBoostClassifier, RandomForestClassifier
# clf.fit(training_X, training_y)
# y_pred = clf.predict_proba(validation_X)

In [23]:
# y_pred = pd.DataFrame(y_pred[:,1], columns = ['Predicted Score'])
# validation_y = validation_y.to_frame()
# y_pred

In [24]:
# # Make sure that only DRACH sites are present before checking scores
# y_pred_and_validation_y = pd.concat([y_pred, validation_y],axis=1)
# indicator = pd.DataFrame([0,1,0] * int(len(y_pred_and_validation_y) / 3), columns = ['Indicator'])
# y_pred_and_validation_y = pd.concat([y_pred_and_validation_y, indicator], axis=1)
# y_pred_and_validation_y = y_pred_and_validation_y[y_pred_and_validation_y['Indicator'] == 1]
# y_pred_and_validation_y = y_pred_and_validation_y.drop(['Indicator'], axis=1)
# y_pred_and_validation_y

In [25]:
# # Convert DataFrame to Series before checking ROC AUC and PR AUC Score
# validation_y = y_pred_and_validation_y['Label'].squeeze()
# y_pred = y_pred_and_validation_y['Predicted Score'].squeeze()

In [26]:
# print('ROC_AUC score is', roc_auc_score(validation_y, y_pred, average=None))
# # ROC_AUC score is 0.8665415772607338 (One hot encoding)

In [27]:
# # Data to plot precision - recall curve
# precision, recall, thresholds = precision_recall_curve(validation_y, y_pred)
# # Use AUC function to calculate the area under the curve of precision recall curve
# auc_precision_recall = auc(recall, precision)
# print('PR AUC score is', auc_precision_recall)
# # PR AUC score is 0.4156796450782604 (One hot encoding)

In [28]:
# import pickle
# pickle.dump(clf, open('gradientboostingclassifier.sav', 'wb'))