In [None]:
# !pip install numpy
# !pip install pandas
# !pip install seaborn
# !pip install imblearn

In [1]:
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

df = pd.read_csv('./Data/train_set_github.csv')
df

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Label
0,ENST00000000233,ENSG00000004059,243,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,0
1,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0
2,ENST00000000233,ENSG00000004059,245,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,0
3,ENST00000000233,ENSG00000004059,260,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,0
4,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0
...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.00,0.006810,0
365510,ENST00000641834,ENSG00000167747,1538,GACCA,2.540877,82.289474,0.006472,2.330,82.00,0.006310,0
365511,ENST00000641834,ENSG00000167747,1692,TTGAC,4.090577,105.807692,0.008788,3.160,107.00,0.007090,0
365512,ENST00000641834,ENSG00000167747,1693,TGACA,8.702885,113.134615,0.006907,8.675,113.00,0.006705,0


In [2]:
# Obtain all unique gene names in the dataframe
gene_names = []
for i in df['Gene Name']:
    if i not in gene_names:
        gene_names.append(i)

# Split dataset into training and validation data (70/30 ratio) based on the Gene name
random.seed(4262)
training_genes = random.sample(gene_names, int(0.7 * len(gene_names)) ) # sample 70% of genes to be used in training
validation_genes = list(set(gene_names) - set(training_genes)) # remainder of genes that are not sampled will be used in validation
training_data = df[df['Gene Name'].isin(training_genes)]
validation_data = df[df['Gene Name'].isin(validation_genes)]

training_y = training_data['Label'].reset_index(drop=True)
training_X = training_data.drop(['Label'], axis=1).reset_index(drop=True)
validation_y = validation_data['Label'].reset_index(drop=True)
validation_X = validation_data.drop(['Label'], axis=1).reset_index(drop=True)

# One hot encoding for bases column of train and test set
training_X_dummies = pd.get_dummies(training_X['Bases'], drop_first=True)
training_X = pd.concat([training_X, training_X_dummies],axis=1)
validation_X_dummies = pd.get_dummies(validation_X['Bases'], drop_first=True)
validation_X = pd.concat([validation_X, validation_X_dummies],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,ENST00000000233,ENSG00000004059,243,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,...,0,0,0,0,0,0,0,0,0,0
1,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,...,0,0,0,0,0,0,0,0,0,0
2,ENST00000000233,ENSG00000004059,245,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,...,0,0,0,0,0,0,0,0,0,0
3,ENST00000000233,ENSG00000004059,260,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,...,0,0,0,0,0,0,0,0,0,0
4,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,3243,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580,...,0,0,0,0,0,0,0,0,0,0
257294,ENST00000641784,ENSG00000284707,3244,AACAA,2.203750,88.439063,0.005190,2.130,88.70,0.004660,...,0,0,0,0,0,0,0,0,0,0
257295,ENST00000641784,ENSG00000284707,3265,CTAAC,1.874516,94.209677,0.005972,1.760,94.80,0.005065,...,0,0,0,0,0,0,0,0,0,0
257296,ENST00000641784,ENSG00000284707,3266,TAACT,2.194032,99.730645,0.006831,2.170,99.75,0.005785,...,0,1,0,0,0,0,0,0,0,0


In [3]:
training_data

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Label
0,ENST00000000233,ENSG00000004059,243,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,0
1,ENST00000000233,ENSG00000004059,244,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0
2,ENST00000000233,ENSG00000004059,245,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,0
3,ENST00000000233,ENSG00000004059,260,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,0
4,ENST00000000233,ENSG00000004059,261,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0
...,...,...,...,...,...,...,...,...,...,...,...
365446,ENST00000641784,ENSG00000284707,3243,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580,0
365447,ENST00000641784,ENSG00000284707,3244,AACAA,2.203750,88.439063,0.005190,2.130,88.70,0.004660,0
365448,ENST00000641784,ENSG00000284707,3265,CTAAC,1.874516,94.209677,0.005972,1.760,94.80,0.005065,0
365449,ENST00000641784,ENSG00000284707,3266,TAACT,2.194032,99.730645,0.006831,2.170,99.75,0.005785,0


In [4]:
validation_data

Unnamed: 0,Transcript Name,Gene Name,Position,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Label
54,ENST00000000412,ENSG00000003056,354,GAAAC,2.977180,108.360000,0.007340,2.495,109.00,0.005475,0
55,ENST00000000412,ENSG00000003056,355,AAACT,2.608600,106.584000,0.007782,2.635,108.00,0.006535,0
56,ENST00000000412,ENSG00000003056,356,AACTA,1.888520,94.174000,0.007045,1.835,94.15,0.006745,0
57,ENST00000000412,ENSG00000003056,366,GGGAC,3.961489,118.638298,0.008988,3.670,119.00,0.007970,0
58,ENST00000000412,ENSG00000003056,367,GGACC,6.045319,122.489362,0.007403,5.760,123.00,0.006930,0
...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,ENSG00000167747,1537,TGACC,6.552982,123.263158,0.007419,5.790,124.00,0.006810,0
365510,ENST00000641834,ENSG00000167747,1538,GACCA,2.540877,82.289474,0.006472,2.330,82.00,0.006310,0
365511,ENST00000641834,ENSG00000167747,1692,TTGAC,4.090577,105.807692,0.008788,3.160,107.00,0.007090,0
365512,ENST00000641834,ENSG00000167747,1693,TGACA,8.702885,113.134615,0.006907,8.675,113.00,0.006705,0


### Feature engineering functions

In [5]:
# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c,g=0,0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
        else:
            g+=1
    return a,t,c,g

In [6]:
# Relative positions
relative_positions = [1,2,3] * int(len(training_X)/3)
relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
training_X = pd.concat([training_X, relative_positions_df],axis=1)

training_X['Count_A'], training_X['Count_T'], training_X['Count_C'], training_X['Count_G'] = zip(*training_X['Bases'].apply(count_bases))
training_X = training_X.drop(['Bases'],axis=1)

training_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACA,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Relative Position,Count_A,Count_T,Count_C,Count_G
0,ENST00000000233,ENSG00000004059,243,4.223784,123.702703,0.008264,3.730,125.00,0.006970,0,...,0,0,0,0,0,1,3,0,1,1
1,ENST00000000233,ENSG00000004059,244,7.382162,125.913513,0.009373,6.650,126.00,0.007970,0,...,0,0,0,0,0,2,2,0,2,1
2,ENST00000000233,ENSG00000004059,245,4.386989,80.570270,0.007345,3.440,80.50,0.005980,0,...,0,0,0,0,0,3,2,0,2,1
3,ENST00000000233,ENSG00000004059,260,3.216424,109.681395,0.006609,2.880,110.00,0.005640,0,...,0,0,0,0,0,1,3,0,2,0
4,ENST00000000233,ENSG00000004059,261,3.226535,107.889535,0.006813,3.000,108.00,0.005885,0,...,0,0,0,0,0,2,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,3243,2.998125,97.346875,0.007648,2.675,97.60,0.006580,0,...,0,0,0,0,0,2,3,0,1,1
257294,ENST00000641784,ENSG00000284707,3244,2.203750,88.439063,0.005190,2.130,88.70,0.004660,0,...,0,0,0,0,0,3,4,0,1,0
257295,ENST00000641784,ENSG00000284707,3265,1.874516,94.209677,0.005972,1.760,94.80,0.005065,0,...,0,0,0,0,0,1,2,1,2,0
257296,ENST00000641784,ENSG00000284707,3266,2.194032,99.730645,0.006831,2.170,99.75,0.005785,0,...,0,0,0,0,0,2,2,2,1,0


In [7]:
# Relative positions
relative_positions = [1,2,3] * int(len(validation_X)/3)
relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
validation_X = pd.concat([validation_X, relative_positions_df],axis=1)

validation_X['Count_A'], validation_X['Count_T'], validation_X['Count_C'], validation_X['Count_G'] = zip(*validation_X['Bases'].apply(count_bases)) 
validation_X = validation_X.drop(['Bases'],axis=1)

validation_X

Unnamed: 0,Transcript Name,Gene Name,Position,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,AAACA,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Relative Position,Count_A,Count_T,Count_C,Count_G
0,ENST00000000412,ENSG00000003056,354,2.977180,108.360000,0.007340,2.495,109.00,0.005475,0,...,0,0,0,0,0,1,3,0,1,1
1,ENST00000000412,ENSG00000003056,355,2.608600,106.584000,0.007782,2.635,108.00,0.006535,0,...,0,0,0,0,0,2,3,1,1,0
2,ENST00000000412,ENSG00000003056,356,1.888520,94.174000,0.007045,1.835,94.15,0.006745,0,...,0,0,0,0,0,3,3,1,1,0
3,ENST00000000412,ENSG00000003056,366,3.961489,118.638298,0.008988,3.670,119.00,0.007970,0,...,0,0,0,0,0,1,1,0,1,3
4,ENST00000000412,ENSG00000003056,367,6.045319,122.489362,0.007403,5.760,123.00,0.006930,0,...,0,0,0,0,0,2,1,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108211,ENST00000641834,ENSG00000167747,1537,6.552982,123.263158,0.007419,5.790,124.00,0.006810,0,...,1,0,0,0,0,2,1,1,2,1
108212,ENST00000641834,ENSG00000167747,1538,2.540877,82.289474,0.006472,2.330,82.00,0.006310,0,...,0,0,0,0,0,3,2,0,2,1
108213,ENST00000641834,ENSG00000167747,1692,4.090577,105.807692,0.008788,3.160,107.00,0.007090,0,...,0,0,0,0,1,1,1,2,1,1
108214,ENST00000641834,ENSG00000167747,1693,8.702885,113.134615,0.006907,8.675,113.00,0.006705,0,...,0,0,0,0,0,2,2,1,1,1


## Feature Selection

### Feature selection using Pearson Correlation

In [8]:
df_cor = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
cor_matrix = df_cor.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)] # Drop features if correlation is > 0.75
df_cor = df_cor.drop(df_cor[to_drop], axis=1)
df_cor

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,Position,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,AACAG,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Relative Position,Count_A,Count_T,Count_C,Count_G
0,243,4.223784,123.702703,0.008264,0,0,0,0,0,0,...,0,0,0,0,0,1,3,0,1,1
1,244,7.382162,125.913513,0.009373,0,0,0,0,0,0,...,0,0,0,0,0,2,2,0,2,1
2,245,4.386989,80.570270,0.007345,0,0,0,0,0,0,...,0,0,0,0,0,3,2,0,2,1
3,260,3.216424,109.681395,0.006609,0,0,0,0,0,0,...,0,0,0,0,0,1,3,0,2,0
4,261,3.226535,107.889535,0.006813,0,0,1,0,0,0,...,0,0,0,0,0,2,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,3243,2.998125,97.346875,0.007648,0,0,0,0,0,0,...,0,0,0,0,0,2,3,0,1,1
257294,3244,2.203750,88.439063,0.005190,0,0,0,1,0,0,...,0,0,0,0,0,3,4,0,1,0
257295,3265,1.874516,94.209677,0.005972,0,0,0,0,0,0,...,0,0,0,0,0,1,2,1,2,0
257296,3266,2.194032,99.730645,0.006831,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,1,0


In [9]:
print(to_drop)

['Median SD', 'Median_Mean', 'Median Dwelling Time']


In [10]:
# Drop Position variable if not dealing with relative positions 
training_X = training_X.drop(training_X[to_drop], axis=1).drop(['Position'],axis=1)
validation_X = validation_X.drop(validation_X[to_drop], axis=1).drop(['Position'],axis=1)
training_X

Unnamed: 0,Transcript Name,Gene Name,Mean SD,Mean_Mean,Mean Dwelling Time,AAACA,AAACC,AAACT,AACAA,AACAC,...,TGACC,TGACT,TGGAC,TTAAC,TTGAC,Relative Position,Count_A,Count_T,Count_C,Count_G
0,ENST00000000233,ENSG00000004059,4.223784,123.702703,0.008264,0,0,0,0,0,...,0,0,0,0,0,1,3,0,1,1
1,ENST00000000233,ENSG00000004059,7.382162,125.913513,0.009373,0,0,0,0,0,...,0,0,0,0,0,2,2,0,2,1
2,ENST00000000233,ENSG00000004059,4.386989,80.570270,0.007345,0,0,0,0,0,...,0,0,0,0,0,3,2,0,2,1
3,ENST00000000233,ENSG00000004059,3.216424,109.681395,0.006609,0,0,0,0,0,...,0,0,0,0,0,1,3,0,2,0
4,ENST00000000233,ENSG00000004059,3.226535,107.889535,0.006813,0,0,1,0,0,...,0,0,0,0,0,2,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENST00000641784,ENSG00000284707,2.998125,97.346875,0.007648,0,0,0,0,0,...,0,0,0,0,0,2,3,0,1,1
257294,ENST00000641784,ENSG00000284707,2.203750,88.439063,0.005190,0,0,0,1,0,...,0,0,0,0,0,3,4,0,1,0
257295,ENST00000641784,ENSG00000284707,1.874516,94.209677,0.005972,0,0,0,0,0,...,0,0,0,0,0,1,2,1,2,0
257296,ENST00000641784,ENSG00000284707,2.194032,99.730645,0.006831,0,0,0,0,0,...,0,0,0,0,0,2,2,2,1,0


In [11]:
training_X.columns

Index(['Transcript Name', 'Gene Name', 'Mean SD', 'Mean_Mean',
       'Mean Dwelling Time', 'AAACA', 'AAACC', 'AAACT', 'AACAA', 'AACAC',
       'AACAG', 'AACAT', 'AACCA', 'AACCC', 'AACCG', 'AACCT', 'AACTA', 'AACTC',
       'AACTG', 'AACTT', 'AAGAC', 'AGAAC', 'AGACA', 'AGACC', 'AGACT', 'AGGAC',
       'ATAAC', 'ATGAC', 'CAAAC', 'CAGAC', 'CGAAC', 'CGGAC', 'CTAAC', 'CTGAC',
       'GAAAC', 'GAACA', 'GAACC', 'GAACT', 'GACAA', 'GACAC', 'GACAG', 'GACAT',
       'GACCA', 'GACCC', 'GACCG', 'GACCT', 'GACTA', 'GACTC', 'GACTG', 'GACTT',
       'GAGAC', 'GGAAC', 'GGACA', 'GGACC', 'GGACT', 'GGGAC', 'GTAAC', 'GTGAC',
       'TAAAC', 'TAACA', 'TAACC', 'TAACT', 'TAGAC', 'TGAAC', 'TGACA', 'TGACC',
       'TGACT', 'TGGAC', 'TTAAC', 'TTGAC', 'Relative Position', 'Count_A',
       'Count_T', 'Count_C', 'Count_G'],
      dtype='object')

### Feature selection using RandomForest's feature_importances_

In [12]:
rf = RandomForestClassifier(random_state=4262)
rf.fit(training_X.iloc[:, 2:], training_y) # Don't take into consideration the transcript name and gene name columns for training_X
rf.feature_importances_

array([3.10618244e-01, 3.49477086e-01, 2.47074809e-01, 1.72239514e-04,
       6.20628391e-05, 4.98285222e-04, 2.58038356e-05, 9.13409903e-07,
       1.38332047e-05, 3.93224025e-06, 2.69490085e-06, 3.05895195e-06,
       5.79356557e-07, 2.57991369e-06, 2.36318557e-06, 1.69937187e-06,
       4.94333387e-05, 1.37044329e-05, 6.68644094e-05, 7.11718880e-05,
       5.67196439e-04, 2.26481502e-04, 4.07238770e-03, 3.79430620e-04,
       9.64401356e-06, 1.77358922e-04, 6.08622919e-06, 3.47840274e-05,
       4.69486412e-06, 1.69786820e-04, 9.66614942e-07, 6.31380429e-05,
       3.98458913e-05, 2.54809821e-04, 2.05678484e-04, 9.69561797e-03,
       3.57367037e-05, 1.82102151e-06, 2.74179604e-04, 4.28845350e-05,
       2.00213246e-05, 6.43922993e-06, 6.02298198e-05, 6.50482256e-06,
       8.29822350e-06, 1.26904642e-05, 4.88687657e-04, 7.73299706e-05,
       1.69104270e-04, 1.94058121e-04, 4.93448396e-03, 2.18219341e-03,
       3.05503377e-02, 4.97822456e-04, 1.07763956e-05, 1.02884014e-03,
      

In [13]:
# Calculate the ranks of scores for feature importance
def calculate_rank(vector):
    a={}
    rank=1
    for num in sorted(vector, reverse=True):
        if num not in a:
            a[num]=rank
            rank=rank+1
    return[a[i] for i in vector]
feature_importance_list = sorted(list(zip(training_X.columns, rf.feature_importances_, calculate_rank(rf.feature_importances_))), key=lambda x:x[1], reverse=True)
feature_importance_list

[('Gene Name', 0.3494770861865757, 1),
 ('Transcript Name', 0.3106182440405313, 2),
 ('Mean SD', 0.24707480884401734, 3),
 ('GGACA', 0.030550337708675067, 4),
 ('TTAAC', 0.012983438472209128, 5),
 ('GAACA', 0.009695617972578296, 6),
 ('Count_T', 0.006869665384603187, 7),
 ('Relative Position', 0.004961368910707888, 8),
 ('GAGAC', 0.004934483957328817, 9),
 ('AGACA', 0.004072387703123804, 10),
 ('TTGAC', 0.003764415487620057, 11),
 ('GGAAC', 0.00218219340707441, 12),
 ('TGACA', 0.001993193103120508, 13),
 ('Count_A', 0.0016532538256689828, 14),
 ('GGGAC', 0.0010288401430825437, 15),
 ('TGACC', 0.001009206293809678, 16),
 ('TAGAC', 0.0007740952161461218, 17),
 ('TGAAC', 0.0005695603593077017, 18),
 ('AAGAC', 0.0005671964393031383, 19),
 ('AAACA', 0.000498285221611878, 20),
 ('GGACC', 0.0004978224561640547, 21),
 ('GACTA', 0.000488687656670476, 22),
 ('AGACC', 0.00037943061991683774, 23),
 ('GACAA', 0.0002741796037013448, 24),
 ('CTGAC', 0.00025480982050232124, 25),
 ('AGAAC', 0.000226481

### Change the number of features to be used by the model

Tweak the argument within the `range()` function

In [14]:
top_features = []
for i in range(38): 
    top_features.append(feature_importance_list[i][0])
top_features

['Gene Name',
 'Transcript Name',
 'Mean SD',
 'GGACA',
 'TTAAC',
 'GAACA',
 'Count_T',
 'Relative Position',
 'GAGAC',
 'AGACA',
 'TTGAC',
 'GGAAC',
 'TGACA',
 'Count_A',
 'GGGAC',
 'TGACC',
 'TAGAC',
 'TGAAC',
 'AAGAC',
 'AAACA',
 'GGACC',
 'GACTA',
 'AGACC',
 'GACAA',
 'CTGAC',
 'AGAAC',
 'TAACC',
 'GAAAC',
 'GACTT',
 'AGGAC',
 'TAACA',
 'Mean_Mean',
 'CAGAC',
 'GACTG',
 'TAACT',
 'TAAAC',
 'GACTC',
 'AACTT']

In [15]:
training_X = training_X[top_features]
validation_X = validation_X[top_features]

In [16]:
training_X

Unnamed: 0,Gene Name,Transcript Name,Mean SD,GGACA,TTAAC,GAACA,Count_T,Relative Position,GAGAC,AGACA,...,GACTT,AGGAC,TAACA,Mean_Mean,CAGAC,GACTG,TAACT,TAAAC,GACTC,AACTT
0,ENSG00000004059,ENST00000000233,4.223784,0,0,0,0,1,0,0,...,0,0,0,123.702703,0,0,0,0,0,0
1,ENSG00000004059,ENST00000000233,7.382162,0,0,0,0,2,0,0,...,0,0,0,125.913513,0,0,0,0,0,0
2,ENSG00000004059,ENST00000000233,4.386989,0,0,0,0,3,0,0,...,0,0,0,80.570270,0,0,0,0,0,0
3,ENSG00000004059,ENST00000000233,3.216424,0,0,0,0,1,0,0,...,0,0,0,109.681395,0,0,0,0,0,0
4,ENSG00000004059,ENST00000000233,3.226535,0,0,0,1,2,0,0,...,0,0,0,107.889535,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,ENSG00000284707,ENST00000641784,2.998125,0,0,1,0,2,0,0,...,0,0,0,97.346875,0,0,0,0,0,0
257294,ENSG00000284707,ENST00000641784,2.203750,0,0,0,0,3,0,0,...,0,0,0,88.439063,0,0,0,0,0,0
257295,ENSG00000284707,ENST00000641784,1.874516,0,0,0,1,1,0,0,...,0,0,0,94.209677,0,0,0,0,0,0
257296,ENSG00000284707,ENST00000641784,2.194032,0,0,0,2,2,0,0,...,0,0,0,99.730645,0,0,1,0,0,0


### K-fold validation 
Will be performing k cross-fold validation where k=10 using training dataset

In [22]:
# Break genes of training dataset up into n chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [23]:
# Obtain all unique genes in training dataset
unique_genes_training = []
for i in training_X['Gene Name']:
    if i not in unique_genes_training:
        unique_genes_training.append(i)
unique_genes_training2 = unique_genes_training.copy()        
unique_genes_training

['ENSG00000004059',
 'ENSG00000001630',
 'ENSG00000004779',
 'ENSG00000006451',
 'ENSG00000007520',
 'ENSG00000007376',
 'ENSG00000008988',
 'ENSG00000010270',
 'ENSG00000004975',
 'ENSG00000014919',
 'ENSG00000013583',
 'ENSG00000002587',
 'ENSG00000006453',
 'ENSG00000078668',
 'ENSG00000010017',
 'ENSG00000006118',
 'ENSG00000034713',
 'ENSG00000022277',
 'ENSG00000011485',
 'ENSG00000012061',
 'ENSG00000035141',
 'ENSG00000013588',
 'ENSG00000041357',
 'ENSG00000030582',
 'ENSG00000048544',
 'ENSG00000072849',
 'ENSG00000049541',
 'ENSG00000049449',
 'ENSG00000079616',
 'ENSG00000088256',
 'ENSG00000063241',
 'ENSG00000068697',
 'ENSG00000013275',
 'ENSG00000071539',
 'ENSG00000075336',
 'ENSG00000075975',
 'ENSG00000070669',
 'ENSG00000086232',
 'ENSG00000065518',
 'ENSG00000072110',
 'ENSG00000079999',
 'ENSG00000083845',
 'ENSG00000086504',
 'ENSG00000085721',
 'ENSG00000086589',
 'ENSG00000088356',
 'ENSG00000089048',
 'ENSG00000089693',
 'ENSG00000090487',
 'ENSG00000090470',


In [24]:
random.seed(4262)
random.shuffle(unique_genes_training)
cross_val_data = chunks(unique_genes_training, round(len(unique_genes_training)/10)) # Will be using k=10 folds
cross_val_data = list(cross_val_data)

# Check to see what the cross_val_data looks like
for i in cross_val_data:
    print(i)
    print(len(i))
    print('')

['ENSG00000206418', 'ENSG00000197070', 'ENSG00000128272', 'ENSG00000116096', 'ENSG00000171004', 'ENSG00000000419', 'ENSG00000113719', 'ENSG00000125378', 'ENSG00000167670', 'ENSG00000141582', 'ENSG00000163468', 'ENSG00000149823', 'ENSG00000051596', 'ENSG00000175166', 'ENSG00000105355', 'ENSG00000181885', 'ENSG00000169032', 'ENSG00000183955', 'ENSG00000164253', 'ENSG00000176624', 'ENSG00000171302', 'ENSG00000262919', 'ENSG00000133935', 'ENSG00000138629', 'ENSG00000053372', 'ENSG00000173020', 'ENSG00000197785', 'ENSG00000114416', 'ENSG00000122545', 'ENSG00000100316', 'ENSG00000124942', 'ENSG00000090061', 'ENSG00000189403', 'ENSG00000109390', 'ENSG00000143815', 'ENSG00000100902', 'ENSG00000136908', 'ENSG00000178999', 'ENSG00000005007', 'ENSG00000081721', 'ENSG00000113758', 'ENSG00000144029', 'ENSG00000116212', 'ENSG00000241837', 'ENSG00000206450', 'ENSG00000278081', 'ENSG00000044574', 'ENSG00000123064', 'ENSG00000169288', 'ENSG00000164880', 'ENSG00000197894', 'ENSG00000176248', 'ENSG000001

In [25]:
# training_X

In [26]:
roc_auc_scores = []
pr_auc_scores = []
for i in range(len(cross_val_data)):
    print(cross_val_data[i]) # The set that is selected will be used for validation
    print("testing", i)
    # Train test split on training set
    training_cross_val = training_data[~training_data['Gene Name'].isin(cross_val_data[i])]
    validation_cross_val = training_data[training_data['Gene Name'].isin(cross_val_data[i])]
    
    training_cross_val_y = training_cross_val['Label'].reset_index(drop=True)
    training_cross_val_X = training_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
    validation_cross_val_y = validation_cross_val['Label'].reset_index(drop=True)
    validation_cross_val_X = validation_cross_val.drop(['Label'], axis=1).reset_index(drop=True)
    
    # Feature engineering
    
    # Add in relative positions
    relative_positions = [1,2,3] * int(len(training_cross_val_X)/3)
    relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
    training_cross_val_X = pd.concat([training_cross_val_X, relative_positions_df],axis=1)
    
    training_cross_val_X['Count_A'], training_cross_val_X['Count_T'], training_cross_val_X['Count_C'], training_cross_val_X['Count_G'] = zip(*training_cross_val_X['Bases'].apply(count_bases)) 
    training_cross_val_dummies = pd.get_dummies(training_cross_val_X['Bases'], drop_first=True)
    training_cross_val_X = pd.concat([training_cross_val_X, training_cross_val_dummies],axis=1).drop(['Bases'],axis=1)
    
    # Add in relative positions
    relative_positions = [1,2,3] * int(len(validation_cross_val_X)/3)
    relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
    validation_cross_val_X = pd.concat([validation_cross_val_X, relative_positions_df],axis=1)

    validation_cross_val_X['Count_A'], validation_cross_val_X['Count_T'], validation_cross_val_X['Count_C'], validation_cross_val_X['Count_G'] = zip(*validation_cross_val_X['Bases'].apply(count_bases)) 
    validation_cross_val_dummies = pd.get_dummies(validation_cross_val_X['Bases'], drop_first=True)
    validation_cross_val_X = pd.concat([validation_cross_val_X, validation_cross_val_dummies],axis=1).drop(['Bases'],axis=1)
    
    # Drop additional variables via pearson correlation and feature_importances_
    training_cross_val_X = training_cross_val_X.drop(training_cross_val_X[to_drop], axis=1).drop(['Position'],axis=1)
    validation_cross_val_X = validation_cross_val_X.drop(validation_cross_val_X[to_drop], axis=1).drop(['Position'],axis=1)
    
    training_cross_val_X = training_cross_val_X[top_features]
    validation_cross_val_X = validation_cross_val_X[top_features]
    
    training_cross_val_X = training_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1)
    validation_cross_val_X = validation_cross_val_X.drop(['Transcript Name', 'Gene Name'], axis=1)
    print(validation_cross_val_X)
    
    # Use RandomUnderSampler to sample majority class until minority class makes up 5% of the majority class counts
    over = SMOTE(sampling_strategy=0.3)
    training_cross_val_X, training_cross_val_y = over.fit_resample(training_cross_val_X, training_cross_val_y)
    print("Number of columns in training_cross_val_X is", len(training_cross_val_X.columns))
    print("Number of columns in validation_cross_val_X is", len(validation_cross_val_X.columns))
    
    # Fitting ML model
    random.seed(4262)
    model = GradientBoostingClassifier(random_state=4262, max_depth=6, max_features = "log2", criterion = "friedman_mse", n_estimators = 500, loss = "log_loss", learning_rate = 0.1)
    model.fit(training_cross_val_X, training_cross_val_y)
    y_score = model.predict_proba(validation_cross_val_X)
    y_pred = model.predict(validation_cross_val_X)
    
    # Make sure that only DRACH sites are present before checking scores
    y_score = pd.DataFrame(y_score[:,1], columns = ['Predicted Score'])
    y_pred = pd.DataFrame(y_pred, columns=['Predicted'])
    validation_cross_val_y = validation_cross_val_y.to_frame()
    
    y_pred_and_validation_y = pd.concat([y_score, validation_cross_val_y, y_pred],axis=1)
    indicator = pd.DataFrame([0,1,0] * int(len(y_pred_and_validation_y) / 3), columns = ['Indicator'])
    y_pred_and_validation_y = pd.concat([y_pred_and_validation_y, indicator], axis=1)
    y_pred_and_validation_y = y_pred_and_validation_y[y_pred_and_validation_y['Indicator'] == 1]
    y_pred_and_validation_y = y_pred_and_validation_y.drop(['Indicator'], axis=1)
    
    # Convert DataFrame to Series before checking ROC AUC and PR AUC Score
    validation_cross_val_y = y_pred_and_validation_y['Label'].squeeze()
    y_score = y_pred_and_validation_y['Predicted Score'].squeeze()
    y_pred = y_pred_and_validation_y['Predicted'].squeeze()

    roc_auc_scores.append(roc_auc_score(validation_cross_val_y, y_score, average=None))

    # Data to plot precision - recall curve
    precision, recall, thresholds = precision_recall_curve(validation_cross_val_y, y_score)
    # Use AUC function to calculate the area under the curve of precision recall curve
    auc_precision_recall = auc(recall, precision)
    pr_auc_scores.append(auc_precision_recall)
    
print('Mean roc auc score is', np.mean(roc_auc_scores))
print('Mean pr auc score is', np.mean(auc_precision_recall))

['ENSG00000206418', 'ENSG00000197070', 'ENSG00000128272', 'ENSG00000116096', 'ENSG00000171004', 'ENSG00000000419', 'ENSG00000113719', 'ENSG00000125378', 'ENSG00000167670', 'ENSG00000141582', 'ENSG00000163468', 'ENSG00000149823', 'ENSG00000051596', 'ENSG00000175166', 'ENSG00000105355', 'ENSG00000181885', 'ENSG00000169032', 'ENSG00000183955', 'ENSG00000164253', 'ENSG00000176624', 'ENSG00000171302', 'ENSG00000262919', 'ENSG00000133935', 'ENSG00000138629', 'ENSG00000053372', 'ENSG00000173020', 'ENSG00000197785', 'ENSG00000114416', 'ENSG00000122545', 'ENSG00000100316', 'ENSG00000124942', 'ENSG00000090061', 'ENSG00000189403', 'ENSG00000109390', 'ENSG00000143815', 'ENSG00000100902', 'ENSG00000136908', 'ENSG00000178999', 'ENSG00000005007', 'ENSG00000081721', 'ENSG00000113758', 'ENSG00000144029', 'ENSG00000116212', 'ENSG00000241837', 'ENSG00000206450', 'ENSG00000278081', 'ENSG00000044574', 'ENSG00000123064', 'ENSG00000169288', 'ENSG00000164880', 'ENSG00000197894', 'ENSG00000176248', 'ENSG000001

        Mean SD  GGACA  TTAAC  GAACA  Count_T  Relative Position  GAGAC  \
0      6.685581      0      0      0        0                  1      0   
1      6.863488      0      0      0        1                  2      0   
2      2.480233      0      0      0        1                  3      0   
3      6.838333      0      0      0        0                  1      0   
4      2.934286      0      0      0        0                  2      0   
...         ...    ...    ...    ...      ...                ...    ...   
23260  3.227105      0      0      1        0                  2      0   
23261  2.454632      0      0      0        0                  3      0   
23262  6.463061      0      0      0        0                  1      0   
23263  5.443878      0      0      0        1                  2      0   
23264  3.151612      0      0      0        1                  3      0   

       AGACA  TTGAC  GGAAC  ...  GACTT  AGGAC  TAACA   Mean_Mean  CAGAC  \
0          0      0     

Number of columns in training_cross_val_X is 36
Number of columns in validation_cross_val_X is 36
['ENSG00000229684', 'ENSG00000090372', 'ENSG00000099995', 'ENSG00000091542', 'ENSG00000038274', 'ENSG00000187079', 'ENSG00000100744', 'ENSG00000182628', 'ENSG00000180879', 'ENSG00000165733', 'ENSG00000117868', 'ENSG00000115641', 'ENSG00000113649', 'ENSG00000104442', 'ENSG00000123416', 'ENSG00000100116', 'ENSG00000247077', 'ENSG00000181704', 'ENSG00000188846', 'ENSG00000084072', 'ENSG00000148773', 'ENSG00000033050', 'ENSG00000169714', 'ENSG00000160007', 'ENSG00000113269', 'ENSG00000154146', 'ENSG00000095261', 'ENSG00000065154', 'ENSG00000144566', 'ENSG00000273594', 'ENSG00000185043', 'ENSG00000136819', 'ENSG00000106049', 'ENSG00000135916', 'ENSG00000241945', 'ENSG00000110492', 'ENSG00000174243', 'ENSG00000165501', 'ENSG00000100075', 'ENSG00000123066', 'ENSG00000039123', 'ENSG00000111845', 'ENSG00000146731', 'ENSG00000134531', 'ENSG00000133265', 'ENSG00000114738', 'ENSG00000128059', 'ENSG000

        Mean SD  GGACA  TTAAC  GAACA  Count_T  Relative Position  GAGAC  \
0      4.411081      0      0      0        0                  1      0   
1      5.808378      0      0      0        0                  2      0   
2      3.928378      0      0      0        0                  3      0   
3      6.111176      0      0      0        0                  1      0   
4      5.637059      0      0      0        0                  2      0   
...         ...    ...    ...    ...      ...                ...    ...   
27124  9.656250      0      0      0        1                  2      0   
27125  3.318750      0      0      0        0                  3      0   
27126  3.293750      0      0      0        0                  1      0   
27127  5.548438      0      0      0        0                  2      0   
27128  3.479375      0      0      0        0                  3      0   

       AGACA  TTGAC  GGAAC  ...  GACTT  AGGAC  TAACA   Mean_Mean  CAGAC  \
0          0      0     

Number of columns in training_cross_val_X is 36
Number of columns in validation_cross_val_X is 36


KeyboardInterrupt: 

In [None]:
roc_auc_scores

### SMOTE

In [None]:
training_X = training_X.drop(['Transcript Name', 'Gene Name'], axis=1)
validation_X = validation_X.drop(['Transcript Name', 'Gene Name'], axis=1)

In [None]:
training_X

In [None]:
validation_X

In [None]:
validation_y

In [None]:
# # Use RandomUnderSampler to sample majority class until minority class makes up 5% of the majority class counts
over = SMOTE(sampling_strategy=0.3)
training_X, training_y = over.fit_resample(training_X, training_y)
training_X

### Model training for Non-DRACH + DRACH sites

In [None]:
random.seed(4262)
clf = GradientBoostingClassifier(random_state=4262, max_depth=6, max_features = "log2", criterion = "friedman_mse", n_estimators = 500, loss = "log_loss", learning_rate = 0.1)
clf.fit(training_X, training_y)
y_score = clf.predict_proba(validation_X)
y_pred = clf.predict(validation_X)

In [None]:
y_score = pd.DataFrame(y_score[:,1], columns = ['Predicted Score'])
y_pred = pd.DataFrame(y_pred, columns=['Predicted'])
validation_y = validation_y.to_frame()
y_score

In [None]:
validation_y

In [None]:
# Make sure that only DRACH sites are present before checking scores
y_pred_and_validation_y = pd.concat([y_score, validation_y, y_pred],axis=1)
indicator = pd.DataFrame([0,1,0] * int(len(y_pred_and_validation_y) / 3), columns = ['Indicator'])
y_pred_and_validation_y = pd.concat([y_pred_and_validation_y, indicator], axis=1)
y_pred_and_validation_y = y_pred_and_validation_y[y_pred_and_validation_y['Indicator'] == 1]
y_pred_and_validation_y = y_pred_and_validation_y.drop(['Indicator'], axis=1)
y_pred_and_validation_y

In [None]:
# Convert DataFrame to Series before checking ROC AUC and PR AUC Score
validation_y = y_pred_and_validation_y['Label'].squeeze()
y_score = y_pred_and_validation_y['Predicted Score'].squeeze()
y_pred = y_pred_and_validation_y['Predicted'].squeeze()

In [None]:
print('ROC_AUC score is', roc_auc_score(validation_y, y_score, average=None))

In [None]:
# Data to plot precision - recall curve
precision, recall, thresholds = precision_recall_curve(validation_y, y_score)
# Use AUC function to calculate the area under the curve of precision recall curve
auc_precision_recall = auc(recall, precision)
print('PR AUC score is', auc_precision_recall)

In [None]:
from sklearn.metrics import average_precision_score
average_precision_score(validation_y, y_score)

In [None]:
from sklearn.metrics import precision_score, recall_score
print("Precision Score is", precision_score(validation_y, y_pred))
print("Recall Score is", recall_score(validation_y, y_pred))

In [None]:
import pickle
pickle.dump(clf, open('gradientboostingclassifier.sav', 'wb'))