### Model training
1. Split data into 70-30 for training/ validation set
2. Conduct feature engineering (e.g., adding new columns/ one hot encoding) on training and testing set **after** splitting of data to prevent data leakage
3. Using training set, do cross fold validation to select the best model (e.g., adaboost vs xgboost vs logistic regression) and best parameters
4. Using the best model obtained from step 2, train the best model using the full training dataset and validate using the validation dataset - tune the model using the validation dataset
5. Use the tuned model to predict on the unseen dataset that will be given and produce the dataset in the required output format

Note:
- When dealing with only drach sites, comment out the codes that talk about `Relative Position`
- When dealing with drach + non-drach sites, 2 approaches can be considered:
    - Consider `Relative Position` 
    - Do not consider relative positions - remember to drop `Position` variable

### Evaluation criteria:
Model will be evaluated on ROC AUC and PR AUC of validation dataset

**Make sure that ROC AUC is greater than 0.5 and PR AUC is above 0.04**

In [1]:
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_curve, auc
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

# Version 1 - Contains both DRACH and non-DRACH sites
df = pd.read_csv('./Data/full_df trial.csv')

# Version 2 - Contains only DRACH sites
# df = pd.read_csv('./Data/full_df mastercopy.csv')
# labels = pd.read_csv('./Data/data.info')
# df = df.merge(labels, how='inner', left_on = ['Transcript Name', 'Position'], right_on = ['transcript_id', 'transcript_position'])
# df['Label'] = df['label']
# df = df.drop(['label', 'transcript_id', 'transcript_position', 'gene_id'], axis=1)

df

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Label
0,ENST00000000233,ENSG00000004059,243,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,0
1,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0
2,ENST00000000233,ENSG00000004059,245,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,0
3,ENST00000000233,ENSG00000004059,260,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,0
4,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0
...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.00,0.006810,0
365510,ENST00000641834,ENSG00000167747,1538,GACCA,2.540877,82.289474,0.006472,2.330,82.00,0.006310,0
365511,ENST00000641834,ENSG00000167747,1692,TTGAC,4.090577,105.807692,0.008788,3.160,107.00,0.007090,0
365512,ENST00000641834,ENSG00000167747,1693,TGACA,8.702885,113.134615,0.006907,8.675,113.00,0.006705,0


In [2]:
# Obtain all unique gene names in the dataframe
gene_names = []
for i in df['Gene Name']:
    if i not in gene_names:
        gene_names.append(i)

# Split dataset into training and validation data (70/30 ratio) based on the Gene name
random.seed(4262)
training_genes = random.sample(gene_names, int(0.7 * len(gene_names)) ) # sample 70% of genes to be used in training
validation_genes = list(set(gene_names) - set(training_genes)) # remainder of genes that are not sampled will be used in validation
training_data = df[df['Gene Name'].isin(training_genes)]
validation_data = df[df['Gene Name'].isin(validation_genes)]

training_y = training_data['Label'].reset_index(drop=True)
training_X = training_data.drop(['Label'], axis=1).reset_index(drop=True)
validation_y = validation_data['Label'].reset_index(drop=True)
validation_X = validation_data.drop(['Label'], axis=1).reset_index(drop=True)

# One hot encoding for train and test set
training_X_dummies = pd.get_dummies(training_X['Bases'], drop_first=True)
training_X = pd.concat([training_X, training_X_dummies],axis=1)
validation_X_dummies = pd.get_dummies(validation_X['Bases'], drop_first=True)
validation_X = pd.concat([validation_X, validation_X_dummies],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,ENST00000000233,ENSG00000004059,243,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,...,0,0,0,0,0,0,0,0,0,0
2,ENST00000000233,ENSG00000004059,245,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,...,0,0,0,0,0,0,0,0,0,0
3,ENST00000000233,ENSG00000004059,260,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,...,0,0,0,0,0,0,0,0,0,0
4,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,3243,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580,...,0,0,0,0,0,0,0,0,0,0
257294,ENST00000641784,ENSG00000284707,3244,AACAA,2.203750,88.439063,0.005190,2.130,88.70,0.004660,...,0,0,0,0,0,0,0,0,0,0
257295,ENST00000641784,ENSG00000284707,3265,CTAAC,1.874516,94.209677,0.005972,1.760,94.80,0.005065,...,0,0,0,0,0,0,0,0,0,0
257296,ENST00000641784,ENSG00000284707,3266,TAACT,2.194032,99.730645,0.006831,2.170,99.75,0.005785,...,0,1,0,0,0,0,0,0,0,0


In [3]:
training_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257298 entries, 0 to 257297
Data columns (total 75 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Transcript Name       257298 non-null  object 
 1   Gene Name             257298 non-null  object 
 2   Position              257298 non-null  int64  
 3   Bases                 257298 non-null  object 
 4   Mean SD               257298 non-null  float64
 5   Mean_Mean             257298 non-null  float64
 6   Mean Dwelling Time    257298 non-null  float64
 7   Median SD             257298 non-null  float64
 8   Median_Mean           257298 non-null  float64
 9   Median Dwelling Time  257298 non-null  float64
 10  AAACA                 257298 non-null  uint8  
 11  AAACC                 257298 non-null  uint8  
 12  AAACT                 257298 non-null  uint8  
 13  AACAA                 257298 non-null  uint8  
 14  AACAC                 257298 non-null  uint8  
 15  

In [4]:
validation_X

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,ENST00000000412,ENSG00000003056,354,GAAAC,2.977180,108.360000,0.007340,2.495,109.00,0.005475,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000412,ENSG00000003056,355,AAACT,2.608600,106.584000,0.007782,2.635,108.00,0.006535,...,0,0,0,0,0,0,0,0,0,0
2,ENST00000000412,ENSG00000003056,356,AACTA,1.888520,94.174000,0.007045,1.835,94.15,0.006745,...,0,0,0,0,0,0,0,0,0,0
3,ENST00000000412,ENSG00000003056,366,GGGAC,3.961489,118.638298,0.008988,3.670,119.00,0.007970,...,0,0,0,0,0,0,0,0,0,0
4,ENST00000000412,ENSG00000003056,367,GGACC,6.045319,122.489362,0.007403,5.760,123.00,0.006930,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108211,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.00,0.006810,...,0,0,0,0,0,1,0,0,0,0
108212,ENST00000641834,ENSG00000167747,1538,GACCA,2.540877,82.289474,0.006472,2.330,82.00,0.006310,...,0,0,0,0,0,0,0,0,0,0
108213,ENST00000641834,ENSG00000167747,1692,TTGAC,4.090577,105.807692,0.008788,3.160,107.00,0.007090,...,0,0,0,0,0,0,0,0,0,1
108214,ENST00000641834,ENSG00000167747,1693,TGACA,8.702885,113.134615,0.006907,8.675,113.00,0.006705,...,0,0,0,0,1,0,0,0,0,0


In [5]:
validation_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108216 entries, 0 to 108215
Data columns (total 75 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Transcript Name       108216 non-null  object 
 1   Gene Name             108216 non-null  object 
 2   Position              108216 non-null  int64  
 3   Bases                 108216 non-null  object 
 4   Mean SD               108216 non-null  float64
 5   Mean_Mean             108216 non-null  float64
 6   Mean Dwelling Time    108216 non-null  float64
 7   Median SD             108216 non-null  float64
 8   Median_Mean           108216 non-null  float64
 9   Median Dwelling Time  108216 non-null  float64
 10  AAACA                 108216 non-null  uint8  
 11  AAACC                 108216 non-null  uint8  
 12  AAACT                 108216 non-null  uint8  
 13  AACAA                 108216 non-null  uint8  
 14  AACAC                 108216 non-null  uint8  
 15  

### Feature engineering functions

In [6]:
# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a positive sites - as such, I will use a greater weight when those bases are present
def indicate_top_bases(bases):
    if bases in ['GGACT', 'GAACT']:
        return 2
    elif bases in ['GGACA', 'GGACC', 'AGACT', 'TGACT']:
        return 1
    return 0

# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a negative sites - as such, I will use a smaller weight when those bases are present
def indicate_bottom_bases(bases):
    if bases in ['AAACT', 'GAACA', 'TGACA', 'AGACA', 'TGACC', 'GGACA']:
        return -1
    return 0

# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c=0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
    return a,t,c

In [7]:
# Feature engineering for train set 
# Uncomment following 2 lines of codes when dealing with solely drach sites 
# relative_position_train = pd.DataFrame([1,2,3] * int(len(training_X)/3), columns=['Relative Position'])
# training_X = pd.concat([training_X, relative_position_train], axis=1).drop(['Position'], axis=1)

training_X['Top Bases Indicator'] = training_X['Bases'].apply(indicate_top_bases)
training_X['Bottom Bases Indicator'] = training_X['Bases'].apply(indicate_bottom_bases)
training_X['Combined Bases Indicator'] = training_X['Top Bases Indicator'] + training_X['Bottom Bases Indicator']
training_X['Count_A'], training_X['Count_T'], training_X['Count_C'] = zip(*training_X['Bases'].apply(count_bases))
training_X = training_X.drop(['Bases'],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACA,...,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Combined Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000233,ENSG00000004059,243,4.223784,123.702703,0.008264,3.730,125.00,0.006970,0,...,0,0,0,0,0,0,0,3,0,1
1,ENST00000000233,ENSG00000004059,244,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0,...,0,0,0,0,0,0,0,2,0,2
2,ENST00000000233,ENSG00000004059,245,4.386989,80.570270,0.007345,3.440,80.50,0.005980,0,...,0,0,0,0,0,0,0,2,0,2
3,ENST00000000233,ENSG00000004059,260,3.216424,109.681395,0.006609,2.880,110.00,0.005640,0,...,0,0,0,0,0,0,0,3,0,2
4,ENST00000000233,ENSG00000004059,261,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0,...,0,0,0,0,0,-1,-1,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,3243,2.998125,97.346875,0.007648,2.675,97.60,0.006580,0,...,0,0,0,0,0,-1,-1,3,0,1
257294,ENST00000641784,ENSG00000284707,3244,2.203750,88.439063,0.005190,2.130,88.70,0.004660,0,...,0,0,0,0,0,0,0,4,0,1
257295,ENST00000641784,ENSG00000284707,3265,1.874516,94.209677,0.005972,1.760,94.80,0.005065,0,...,0,0,0,0,0,0,0,2,1,2
257296,ENST00000641784,ENSG00000284707,3266,2.194032,99.730645,0.006831,2.170,99.75,0.005785,0,...,0,0,0,0,0,0,0,2,2,1


In [8]:
validation_X['Top Bases Indicator'] = validation_X['Bases'].apply(indicate_top_bases)
validation_X['Bottom Bases Indicator'] = validation_X['Bases'].apply(indicate_bottom_bases)
validation_X['Combined Bases Indicator'] = validation_X['Top Bases Indicator'] + validation_X['Bottom Bases Indicator']
validation_X['Count_A'], validation_X['Count_T'], validation_X['Count_C'] = zip(*validation_X['Bases'].apply(count_bases)) 
validation_X = validation_X.drop(['Bases'],axis=1)

validation_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACA,...,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Combined Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000412,ENSG00000003056,354,2.977180,108.360000,0.007340,2.495,109.00,0.005475,0,...,0,0,0,0,0,0,0,3,0,1
1,ENST00000000412,ENSG00000003056,355,2.608600,106.584000,0.007782,2.635,108.00,0.006535,0,...,0,0,0,0,0,-1,-1,3,1,1
2,ENST00000000412,ENSG00000003056,356,1.888520,94.174000,0.007045,1.835,94.15,0.006745,0,...,0,0,0,0,0,0,0,3,1,1
3,ENST00000000412,ENSG00000003056,366,3.961489,118.638298,0.008988,3.670,119.00,0.007970,0,...,0,0,0,0,0,0,0,1,0,1
4,ENST00000000412,ENSG00000003056,367,6.045319,122.489362,0.007403,5.760,123.00,0.006930,0,...,0,0,0,0,1,0,1,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108211,ENST00000641834,ENSG00000167747,1537,6.552982,123.263158,0.007419,5.790,124.00,0.006810,0,...,0,0,0,0,0,-1,-1,1,1,2
108212,ENST00000641834,ENSG00000167747,1538,2.540877,82.289474,0.006472,2.330,82.00,0.006310,0,...,0,0,0,0,0,0,0,2,0,2
108213,ENST00000641834,ENSG00000167747,1692,4.090577,105.807692,0.008788,3.160,107.00,0.007090,0,...,0,0,0,1,0,0,0,1,2,1
108214,ENST00000641834,ENSG00000167747,1693,8.702885,113.134615,0.006907,8.675,113.00,0.006705,0,...,0,0,0,0,0,-1,-1,2,1,1


### Feature selection using Pearson Correlation

In [9]:
df_cor = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
cor_matrix = df_cor.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)] # Drop features if correlation is > 0.75
df_cor = df_cor.drop(df_cor[to_drop], axis=1)
df_cor

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,Position,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,AACAG,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,243,4.223784,123.702703,0.008264,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,1
1,244,7.382162,125.913513,0.009373,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,2
2,245,4.386989,80.570270,0.007345,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,2
3,260,3.216424,109.681395,0.006609,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,2
4,261,3.226535,107.889535,0.006813,0,0,1,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,3243,2.998125,97.346875,0.007648,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
257294,3244,2.203750,88.439063,0.005190,0,0,0,1,0,0,...,0,0,0,0,0,0,0,4,0,1
257295,3265,1.874516,94.209677,0.005972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,2
257296,3266,2.194032,99.730645,0.006831,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,2,1


In [10]:
# Drop Position variable if not dealing with relative positions 
training_X = training_X.drop(training_X[to_drop], axis=1).drop(['Position'],axis=1)
validation_X = validation_X.drop(validation_X[to_drop], axis=1).drop(['Position'],axis=1)
training_X

Unnamed: 0,Transcript Name,Gene Name,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,ENST00000000233,ENSG00000004059,4.223784,123.702703,0.008264,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,1
1,ENST00000000233,ENSG00000004059,7.382162,125.913513,0.009373,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,2
2,ENST00000000233,ENSG00000004059,4.386989,80.570270,0.007345,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,2
3,ENST00000000233,ENSG00000004059,3.216424,109.681395,0.006609,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,2
4,ENST00000000233,ENSG00000004059,3.226535,107.889535,0.006813,0,0,1,0,0,...,0,0,0,0,0,0,-1,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,2.998125,97.346875,0.007648,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
257294,ENST00000641784,ENSG00000284707,2.203750,88.439063,0.005190,0,0,0,1,0,...,0,0,0,0,0,0,0,4,0,1
257295,ENST00000641784,ENSG00000284707,1.874516,94.209677,0.005972,0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,2
257296,ENST00000641784,ENSG00000284707,2.194032,99.730645,0.006831,0,0,0,0,0,...,0,0,0,0,0,0,0,2,2,1


In [11]:
training_X.columns

Index(['Transcript Name', 'Gene Name', 'Mean SD', 'Mean_Mean',
       'Mean Dwelling Time', 'AAACA', 'AAACC', 'AAACT', 'AACAA', 'AACAC',
       'AACAG', 'AACAT', 'AACCA', 'AACCC', 'AACCG', 'AACCT', 'AACTA', 'AACTC',
       'AACTG', 'AACTT', 'AAGAC', 'AGAAC', 'AGACA', 'AGACC', 'AGACT', 'AGGAC',
       'ATAAC', 'ATGAC', 'CAAAC', 'CAGAC', 'CGAAC', 'CGGAC', 'CTAAC', 'CTGAC',
       'GAAAC', 'GAACA', 'GAACC', 'GAACT', 'GACAA', 'GACAC', 'GACAG', 'GACAT',
       'GACCA', 'GACCC', 'GACCG', 'GACCT', 'GACTA', 'GACTC', 'GACTG', 'GACTT',
       'GAGAC', 'GGAAC', 'GGACA', 'GGACC', 'GGACT', 'GGGAC', 'GTAAC', 'GTGAC',
       'TAAAC', 'TAACA', 'TAACC', 'TAACT', 'TAGAC', 'TGAAC', 'TGACA', 'TGACC',
       'TGACT', 'TGGAC', 'TTAAC', 'TTGAC', 'Top Bases Indicator',
       'Bottom Bases Indicator', 'Count_A', 'Count_T', 'Count_C'],
      dtype='object')

### K-fold validation 
Will be performing k cross-fold validation where k=10 using training dataset

In [12]:
# Break genes of training dataset up into n chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [13]:
# # Obtain all unique genes in training dataset
# unique_genes_training = []
# for i in training_X['Gene Name']:
#     if i not in unique_genes_training:
#         unique_genes_training.append(i)
# unique_genes_training2 = unique_genes_training.copy()        
# unique_genes_training

In [14]:
# random.seed(4262)
# random.shuffle(unique_genes_training)
# cross_val_data = chunks(unique_genes_training, round(len(unique_genes_training)/10)) # Will be using k=10 folds
# cross_val_data = list(cross_val_data)

# # Check to see what the cross_val_data looks like
# for i in cross_val_data:
#     print(i)
#     print(len(i))
#     print('')

In [15]:
# training_X

In [16]:
# roc_auc_scores = []
# pr_auc_scores = []
# for i in range(len(cross_val_data)):
#     print(cross_val_data[i]) # The set that is selected will be used for validation
#     # Train test split on training set
#     training_cross_val = training_data[~training_data['Gene Name'].isin(cross_val_data[i])]
#     validation_cross_val = training_data[training_data['Gene Name'].isin(cross_val_data[i])]
# #     print(validation_cross_val)
    
#     training_cross_val_y = training_cross_val['Label'].reset_index(drop=True)
#     training_cross_val_X = training_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
#     validation_cross_val_y = validation_cross_val['Label'].reset_index(drop=True)
#     validation_cross_val_X = validation_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
    
#     # Feature engineering
#     training_cross_val_X['Top Bases Indicator'] = training_cross_val_X['Bases'].apply(indicate_top_bases)
#     training_cross_val_X['Bottom Bases Indicator'] = training_cross_val_X['Bases'].apply(indicate_bottom_bases)
#     training_cross_val_X['Combined Bases Indicator'] = training_cross_val_X['Top Bases Indicator'] + training_cross_val_X['Bottom Bases Indicator']
#     training_cross_val_X['Count_A'], training_cross_val_X['Count_T'], training_cross_val_X['Count_C'] = zip(*training_cross_val_X['Bases'].apply(count_bases)) 
#     training_cross_val_dummies = pd.get_dummies(training_cross_val_X['Bases'], drop_first=True)
#     training_cross_val_X = pd.concat([training_cross_val_X, training_cross_val_dummies],axis=1).drop(['Bases'],axis=1)

#     validation_cross_val_X['Top Bases Indicator'] = validation_cross_val_X['Bases'].apply(indicate_top_bases)
#     validation_cross_val_X['Bottom Bases Indicator'] = validation_cross_val_X['Bases'].apply(indicate_bottom_bases)
#     validation_cross_val_X['Combined Bases Indicator'] = validation_cross_val_X['Top Bases Indicator'] + validation_cross_val_X['Bottom Bases Indicator']
#     validation_cross_val_X['Count_A'], validation_cross_val_X['Count_T'], validation_cross_val_X['Count_C'] = zip(*validation_cross_val_X['Bases'].apply(count_bases)) 
#     validation_cross_val_dummies = pd.get_dummies(validation_cross_val_X['Bases'], drop_first=True)
#     validation_cross_val_X = pd.concat([validation_cross_val_X, validation_cross_val_dummies],axis=1).drop(['Bases'],axis=1)

#     # Drop Position variable when not dealing with relative positions
#     training_cross_val_X = training_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1).drop(['Position'],axis=1)
#     validation_cross_val_X = validation_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1).drop(['Position'],axis=1)
#     print(validation_cross_val_X)
    
#     # First apply SMOTE to bring minority class distribution to 10% of majority class then use RandomUnderSampler to bring majority class down
#     # to 50 percent more than minority class before fitting model
#     over = SMOTE(sampling_strategy=0.1)
#     rus = RandomUnderSampler(random_state=4262, sampling_strategy = 0.5)
#     training_cross_val_X, training_cross_val_y = over.fit_resample(training_cross_val_X, training_cross_val_y)
#     training_cross_val_X, training_cross_val_y = rus.fit_resample(training_cross_val_X, training_cross_val_y)
    
#     # Fitting ML model
#     random.seed(4262)
#     model = GradientBoostingClassifier(random_state=4262, max_depth=5)
#     model.fit(training_cross_val_X, training_cross_val_y)
#     y_pred = model.predict_proba(validation_cross_val_X)
#     roc_auc_scores.append(roc_auc_score(validation_cross_val_y, y_pred[:,1], average=None))

#     # Data to plot precision - recall curve
#     precision, recall, thresholds = precision_recall_curve(validation_cross_val_y, y_pred[:,1])
#     # Use AUC function to calculate the area under the curve of precision recall curve
#     auc_precision_recall = auc(recall, precision)
#     pr_auc_scores.append(auc_precision_recall)
    
# print('Mean roc auc score is', np.mean(roc_auc_scores))
# print('Mean pr auc score is', np.mean(auc_precision_recall))

### Manually Combine SMOTE and Random Undersampling
https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/

*Note that smote and random undersampling will be applied to training dataset only*

In [17]:
training_X = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
validation_X = validation_X.drop(['Transcript Name', 'Gene Name'], axis=1)

In [18]:
validation_X

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,AACAG,AACAT,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,2.977180,108.360000,0.007340,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,1
1,2.608600,106.584000,0.007782,0,0,1,0,0,0,0,...,0,0,0,0,0,0,-1,3,1,1
2,1.888520,94.174000,0.007045,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,1,1
3,3.961489,118.638298,0.008988,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,6.045319,122.489362,0.007403,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108211,6.552982,123.263158,0.007419,0,0,0,0,0,0,0,...,1,0,0,0,0,0,-1,1,1,2
108212,2.540877,82.289474,0.006472,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,2
108213,4.090577,105.807692,0.008788,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,2,1
108214,8.702885,113.134615,0.006907,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,2,1,1


In [19]:
validation_y

0         0
1         0
2         0
3         0
4         0
         ..
108211    0
108212    0
108213    0
108214    0
108215    0
Name: Label, Length: 108216, dtype: int64

In [20]:
# First apply SMOTE to bring minority class distribution to 10% of majority class then use RandomUnderSampler to bring majority class down
# to 50 percent more than minority class before fitting model
over = SMOTE(sampling_strategy=0.1)
rus = RandomUnderSampler(random_state=4262, sampling_strategy = 0.5)
training_X, training_y = over.fit_resample(training_X, training_y)
training_X, training_y = rus.fit_resample(training_X, training_y)
training_X

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,AACAG,AACAT,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C
0,3.258396,82.822642,0.007042,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,1
1,7.993535,100.328956,0.008158,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
2,3.356057,81.044588,0.008305,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,1
3,6.414508,128.524590,0.010068,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,3,0,1
4,4.527273,121.545455,0.007291,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76081,7.551178,119.214028,0.009247,0,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,2,0,1
76082,3.196161,100.730095,0.006709,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,2,1,1
76083,5.330866,118.318560,0.006649,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,2,1
76084,4.904098,99.355053,0.005809,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,2,1,1


In [21]:
training_y.value_counts()

0    50724
1    25362
Name: Label, dtype: int64

In [22]:
validation_y.value_counts()

0    106413
1      1803
Name: Label, dtype: int64

### Model training

#### Adaboost algorithm

In [23]:
# random.seed(4262)
# model = AdaBoostClassifier(random_state=4262)
# model.fit(training_X, training_y)
# y_pred = model.predict_proba(validation_X)
# print('ROC_AUC score is', roc_auc_score(validation_y, y_pred[:,1], average=None))

In [24]:
# # Data to plot precision - recall curve
# precision, recall, thresholds = precision_recall_curve(validation_y, y_pred[:,1])
# # Use AUC function to calculate the area under the curve of precision recall curve
# auc_precision_recall = auc(recall, precision)
# print('PR AUC score is', auc_precision_recall)

#### GradientBoostingClassifier algorithm

In [25]:
random.seed(4262)
clf = GradientBoostingClassifier(random_state=4262, max_depth=5)
clf.fit(training_X, training_y)
y_pred = clf.predict_proba(validation_X)
print('ROC_AUC score is', roc_auc_score(validation_y, y_pred[:,1], average=None))

ROC_AUC score is 0.9572553883197656


In [26]:
# Data to plot precision - recall curve
precision, recall, thresholds = precision_recall_curve(validation_y, y_pred[:,1])
# Use AUC function to calculate the area under the curve of precision recall curve
auc_precision_recall = auc(recall, precision)
print('PR AUC score is', auc_precision_recall)

PR AUC score is 0.4130069420903716


In [30]:
import pickle
pickle.dump(clf, open('gradientboostingclassifier.sav', 'wb'))


#### RandomForestClassifier algorithm

In [28]:
# random.seed(4262)
# rf = RandomForestClassifier(random_state=4262)
# rf.fit(training_X, training_y)
# y_pred = rf.predict_proba(validation_X)
# print('ROC_AUC score is', roc_auc_score(validation_y, y_pred[:,1], average=None))

In [29]:
# # Data to plot precision - recall curve
# precision, recall, thresholds = precision_recall_curve(validation_y, y_pred[:,1])
# # Use AUC function to calculate the area under the curve of precision recall curve
# auc_precision_recall = auc(recall, precision)
# print('PR AUC score is', auc_precision_recall)