In [1]:
!pip install numpy
!pip install pandas
!pip install seaborn
!pip install imblearn



In [2]:
import pickle
import pandas as pd
df = pd.read_csv('test_set_github.csv')
transcript_name_and_position = df[['Transcript Name', 'Position']]
df = df.drop(['Transcript Name','Position'], axis=1)
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time
0,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970
1,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970
2,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980
3,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640
4,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885
...,...,...,...,...,...,...,...
257293,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580
257294,AACAA,2.203750,88.439063,0.005190,2.130,88.70,0.004660
257295,CTAAC,1.874516,94.209677,0.005972,1.760,94.80,0.005065
257296,TAACT,2.194032,99.730645,0.006831,2.170,99.75,0.005785


### Pre-processing data to make sure that data has columns that are accepted by ML model

In [3]:
# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c,g=0,0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
        else:
            g+=1
    return a,t,c,g

In [4]:
# Relative positions
relative_positions = [1,2,3] * int(len(df)/3)
relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
df = pd.concat([df, relative_positions_df],axis=1)

df['Count_A'], df['Count_T'], df['Count_C'], df['Count_G'] = zip(*df['Bases'].apply(count_bases))
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Relative Position,Count_A,Count_T,Count_C,Count_G
0,AAGAC,4.223784,123.702703,0.008264,3.730,125.00,0.006970,1,3,0,1,1
1,AGACC,7.382162,125.913513,0.009373,6.650,126.00,0.007970,2,2,0,2,1
2,GACCA,4.386989,80.570270,0.007345,3.440,80.50,0.005980,3,2,0,2,1
3,CAAAC,3.216424,109.681395,0.006609,2.880,110.00,0.005640,1,3,0,2,0
4,AAACT,3.226535,107.889535,0.006813,3.000,108.00,0.005885,2,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
257293,GAACA,2.998125,97.346875,0.007648,2.675,97.60,0.006580,2,3,0,1,1
257294,AACAA,2.203750,88.439063,0.005190,2.130,88.70,0.004660,3,4,0,1,0
257295,CTAAC,1.874516,94.209677,0.005972,1.760,94.80,0.005065,1,2,1,2,0
257296,TAACT,2.194032,99.730645,0.006831,2.170,99.75,0.005785,2,2,2,1,0


### Remove columns based on Pearson Correlation 
Refer to `Project 2 - 3. Model training (OHE for bases + No relative positions)`

In [5]:
df = df.drop(['Median Dwelling Time', 'Median SD', 'Median_Mean'], axis=1)
df_dummies = pd.get_dummies(df['Bases'], drop_first=True)
df = pd.concat([df, df_dummies],axis=1).drop(['Bases'],axis=1)
df

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,Relative Position,Count_A,Count_T,Count_C,Count_G,AAACA,AAACC,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,4.223784,123.702703,0.008264,1,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7.382162,125.913513,0.009373,2,2,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.386989,80.570270,0.007345,3,2,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.216424,109.681395,0.006609,1,3,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.226535,107.889535,0.006813,2,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,2.998125,97.346875,0.007648,2,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
257294,2.203750,88.439063,0.005190,3,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257295,1.874516,94.209677,0.005972,1,2,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257296,2.194032,99.730645,0.006831,2,2,2,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Select columns based on RandomForest

Obtain the columns from `Project 2 - 3. Model training (OHE for bases + Relative positions + No bases indicator + DRACH&No-DRACH)`

In [6]:
df = df[[
 'Mean SD',
 'GGACA',
 'TTAAC',
 'GAACA',
 'Count_T',
 'TTGAC',
 'Relative Position',
 'GAGAC',
 'AGACA',
 'GGAAC',
 'TGACA',
 'Count_A',
 'TGACC',
 'TAGAC',
 'AAACA',
 'GGACC',
 'GACTA',
 'TGAAC',
 'AAGAC',
 'AGACC',
 'GAAAC',
 'CTGAC',
 'GGGAC',
 'GACAA',
 'Mean_Mean',
 'Mean Dwelling Time',
 'GACTG',
 'TAACA',
 'AGAAC',
 'GACTT',
 'TAACT',
 'AACTG',
 'AACTT',
 'TAACC',
 'AGGAC',
 'CAGAC']]
df

Unnamed: 0,Mean SD,GGACA,TTAAC,GAACA,Count_T,TTGAC,Relative Position,GAGAC,AGACA,GGAAC,...,GACTG,TAACA,AGAAC,GACTT,TAACT,AACTG,AACTT,TAACC,AGGAC,CAGAC
0,4.223784,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7.382162,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.386989,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.216424,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.226535,0,0,0,1,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257293,2.998125,0,0,1,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257294,2.203750,0,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257295,1.874516,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257296,2.194032,0,0,0,2,0,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### Import ML Model to predict on test dataset

In [7]:
pickled_model = pickle.load(open('gradientboostingclassifier.sav', 'rb'))
predicted_scores = pickled_model.predict_proba(df)
predicted_scores

array([[9.99999982e-01, 1.78323768e-08],
       [9.57038154e-01, 4.29618461e-02],
       [9.99985073e-01, 1.49269479e-05],
       ...,
       [9.99910647e-01, 8.93528992e-05],
       [9.04332303e-01, 9.56676965e-02],
       [9.99999961e-01, 3.89255011e-08]])

In [8]:
predicted_full_df = pd.concat([transcript_name_and_position, pd.DataFrame(predicted_scores[:,1], columns = ['score'])], axis=1)
predicted_full_df['transcript_id'], predicted_full_df['transcript_position'] = predicted_full_df['Transcript Name'], predicted_full_df['Position']
predicted_full_df = predicted_full_df.drop(['Transcript Name', 'Position'], axis=1)
predicted_full_df = predicted_full_df[['transcript_id', 'transcript_position', 'score']]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
0,ENST00000000233,243,1.783238e-08
1,ENST00000000233,244,4.296185e-02
2,ENST00000000233,245,1.492695e-05
3,ENST00000000233,260,1.446433e-06
4,ENST00000000233,261,2.701639e-02
...,...,...,...
257293,ENST00000641784,3243,1.388091e-01
257294,ENST00000641784,3244,4.456944e-08
257295,ENST00000641784,3265,8.935290e-05
257296,ENST00000641784,3266,9.566770e-02


### Post-processing to obtain rows for DRACH sites only

In [9]:
indicator = pd.DataFrame([0,1,0] * int(len(predicted_full_df) / 3), columns = ['Indicator'])
predicted_full_df = pd.concat([predicted_full_df, indicator], axis=1)
predicted_full_df = predicted_full_df[predicted_full_df['Indicator'] == 1]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score,Indicator
1,ENST00000000233,244,0.042962,1
4,ENST00000000233,261,0.027016,1
7,ENST00000000233,316,0.011106,1
10,ENST00000000233,332,0.034682,1
13,ENST00000000233,368,0.101827,1
...,...,...,...,...
257284,ENST00000641784,3122,0.746597,1
257287,ENST00000641784,3142,0.054431,1
257290,ENST00000641784,3224,0.836040,1
257293,ENST00000641784,3243,0.138809,1


In [10]:
predicted_full_df = predicted_full_df.drop(['Indicator'], axis=1)
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
1,ENST00000000233,244,0.042962
4,ENST00000000233,261,0.027016
7,ENST00000000233,316,0.011106
10,ENST00000000233,332,0.034682
13,ENST00000000233,368,0.101827
...,...,...,...
257284,ENST00000641784,3122,0.746597
257287,ENST00000641784,3142,0.054431
257290,ENST00000641784,3224,0.836040
257293,ENST00000641784,3243,0.138809


In [11]:
predicted_full_df.to_csv('predictions.csv',index=False)