### Columns present in ML model - obtained from Project 2 - 3. Model training (OHE for bases + No relative positions)
`['Mean SD', 'Mean_Mean', 'Mean Dwelling Time', 'AAACA', 'AAACC', 'AAACT', 'AACAA', 'AACAC',
  'AACAG', 'AACAT', 'AACCA', 'AACCC', 'AACCG', 'AACCT', 'AACTA', 'AACTC', 'AACTG', 'AACTT', 'AAGAC', 'AGAAC', 'AGACA', 'AGACC',   'AGACT', 'AGGAC', 'ATAAC', 'ATGAC', 'CAAAC', 'CAGAC', 'CGAAC', 'CGGAC', 'CTAAC', 'CTGAC', 'GAAAC', 'GAACA', 'GAACC', 'GAACT',   'GACAA', 'GACAC', 'GACAG', 'GACAT', 'GACCA', 'GACCC', 'GACCG', 'GACCT', 'GACTA', 'GACTC', 'GACTG', 'GACTT', 'GAGAC', 'GGAAC',   'GGACA', 'GGACC', 'GGACT', 'GGGAC', 'GTAAC', 'GTGAC', 'TAAAC', 'TAACA', 'TAACC', 'TAACT', 'TAGAC', 'TGAAC', 'TGACA', 'TGACC',
  'TGACT', 'TGGAC', 'TTAAC', 'TTGAC', 'Top Bases Indicator', 'Bottom Bases Indicator', 'Count_A', 'Count_T', 'Count_C']`

In [1]:
import pickle
import pandas as pd
df = pd.read_csv('./Data/dataset2.csv')
transcript_name_and_position = df[['Transcript Name', 'Position']]
df = df.drop(['Transcript Name','Position'], axis=1)
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time
0,GAAAC,2.557500,109.888889,0.006574,2.090,110.00,0.005310
1,AAACT,2.278611,108.716667,0.007237,2.130,110.00,0.006360
2,AACTA,1.804167,95.641667,0.006340,1.720,95.50,0.005585
3,ATAAC,2.715526,84.363158,0.006866,2.465,84.35,0.005705
4,TAACC,3.054474,93.942105,0.006421,3.030,93.80,0.006130
...,...,...,...,...,...,...,...
296485,AAACC,3.918317,99.866337,0.005711,3.890,99.40,0.004960
296486,AACCT,2.483366,82.502970,0.006016,2.380,82.50,0.005030
296487,AGAAC,6.446333,132.900000,0.008041,6.425,134.00,0.007360
296488,GAACT,3.094667,102.260000,0.006118,2.330,102.00,0.005605


### Pre-processing data to make sure that data has columns that are accepted by ML model

In [2]:
# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a positive sites - as such, I will use a greater weight when those bases are present
def indicate_top_bases(bases):
    if bases in ['GGACT', 'GAACT']:
        return 2
    elif bases in ['GGACA', 'GGACC', 'AGACT', 'TGACT']:
        return 1
    return 0

# From the EDA, it appears that bases ['GGACT', 'GAACT', 'GGACA', 'GGACC', 'AGACT', 'TGACT'] appear the highest amount of 
# times in m6a negative sites - as such, I will use a smaller weight when those bases are present
def indicate_bottom_bases(bases):
    if bases in ['AAACT', 'GAACA', 'TGACA', 'AGACA', 'TGACC', 'GGACA']:
        return -1
    return 0

# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c=0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
    return a,t,c

In [3]:
df['Top Bases Indicator'] = df['Bases'].apply(indicate_top_bases)
df['Bottom Bases Indicator'] = df['Bases'].apply(indicate_bottom_bases)
df['Combined Bases Indicator'] = df['Top Bases Indicator'] + df['Bottom Bases Indicator']
df['Count_A'], df['Count_T'], df['Count_C'] = zip(*df['Bases'].apply(count_bases))
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Top Bases Indicator,Bottom Bases Indicator,Combined Bases Indicator,Count_A,Count_T,Count_C
0,GAAAC,2.557500,109.888889,0.006574,2.090,110.00,0.005310,0,0,0,3,0,1
1,AAACT,2.278611,108.716667,0.007237,2.130,110.00,0.006360,0,-1,-1,3,1,1
2,AACTA,1.804167,95.641667,0.006340,1.720,95.50,0.005585,0,0,0,3,1,1
3,ATAAC,2.715526,84.363158,0.006866,2.465,84.35,0.005705,0,0,0,3,1,1
4,TAACC,3.054474,93.942105,0.006421,3.030,93.80,0.006130,0,0,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
296485,AAACC,3.918317,99.866337,0.005711,3.890,99.40,0.004960,0,0,0,3,0,2
296486,AACCT,2.483366,82.502970,0.006016,2.380,82.50,0.005030,0,0,0,2,1,2
296487,AGAAC,6.446333,132.900000,0.008041,6.425,134.00,0.007360,0,0,0,3,0,1
296488,GAACT,3.094667,102.260000,0.006118,2.330,102.00,0.005605,2,0,2,2,1,1


### Remove columns based on Pearson Correlation 
Refer to `Project 2 - 3. Model training (OHE for bases + No relative positions)`

In [4]:
df = df.drop(['Median Dwelling Time', 'Median SD', 'Median_Mean', 'Combined Bases Indicator'], axis=1)
df_dummies = pd.get_dummies(df['Bases'], drop_first=True)
df = pd.concat([df, df_dummies],axis=1).drop(['Bases'],axis=1)
df

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,Top Bases Indicator,Bottom Bases Indicator,Count_A,Count_T,Count_C,AAACA,AAACC,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,2.557500,109.888889,0.006574,0,0,3,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.278611,108.716667,0.007237,0,-1,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.804167,95.641667,0.006340,0,0,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.715526,84.363158,0.006866,0,0,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.054474,93.942105,0.006421,0,0,2,1,2,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296485,3.918317,99.866337,0.005711,0,0,3,0,2,0,1,...,0,0,0,0,0,0,0,0,0,0
296486,2.483366,82.502970,0.006016,0,0,2,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
296487,6.446333,132.900000,0.008041,0,0,3,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
296488,3.094667,102.260000,0.006118,2,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Import ML Model to predict on test dataset

In [5]:
pickled_model = pickle.load(open('gradientboostingclassifier.sav', 'rb'))
predicted_scores = pickled_model.predict_proba(df)
predicted_scores

Feature names must be in the same order as they were in fit.



array([[0.99190339, 0.00809661],
       [0.99198065, 0.00801935],
       [0.98622676, 0.01377324],
       ...,
       [0.99645494, 0.00354506],
       [0.94516933, 0.05483067],
       [0.94538094, 0.05461906]])

In [6]:
predicted_full_df = pd.concat([transcript_name_and_position, pd.DataFrame(predicted_scores[:,1], columns = ['score'])], axis=1)
predicted_full_df['transcript_id'], predicted_full_df['transcript_position'] = predicted_full_df['Transcript Name'], predicted_full_df['Position']
predicted_full_df = predicted_full_df.drop(['Transcript Name', 'Position'], axis=1)
predicted_full_df = predicted_full_df[['transcript_id', 'transcript_position', 'score']]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
0,AT1G01050.1,154,0.008097
1,AT1G01050.1,155,0.008019
2,AT1G01050.1,156,0.013773
3,AT1G01050.1,164,0.003201
4,AT1G01050.1,165,0.025502
...,...,...,...
296485,AT5G67590.1,663,0.017535
296486,AT5G67590.1,664,0.003201
296487,AT5G67600.1,153,0.003545
296488,AT5G67600.1,154,0.054831


### Post-processing to obtain rows for DRACH sites only

In [7]:
indicator = pd.DataFrame([0,1,0] * int(len(predicted_full_df) / 3), columns = ['Indicator'])
predicted_full_df = pd.concat([predicted_full_df, indicator], axis=1)
predicted_full_df = predicted_full_df[predicted_full_df['Indicator'] == 1]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score,Indicator
1,AT1G01050.1,155,0.008019,1
4,AT1G01050.1,165,0.025502,1
7,AT1G01050.1,347,0.005463,1
10,AT1G01050.1,435,0.007918,1
13,AT1G01050.2,463,0.007758,1
...,...,...,...,...
296476,AT5G67590.1,367,0.007361,1
296479,AT5G67590.1,444,0.087061,1
296482,AT5G67590.1,465,0.058009,1
296485,AT5G67590.1,663,0.017535,1


In [8]:
predicted_full_df = predicted_full_df.drop(['Indicator'], axis=1)
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
1,AT1G01050.1,155,0.008019
4,AT1G01050.1,165,0.025502
7,AT1G01050.1,347,0.005463
10,AT1G01050.1,435,0.007918
13,AT1G01050.2,463,0.007758
...,...,...,...
296476,AT5G67590.1,367,0.007361
296479,AT5G67590.1,444,0.087061
296482,AT5G67590.1,465,0.058009
296485,AT5G67590.1,663,0.017535


In [9]:
# predicted_full_df.to_csv('./Data/dataset2_prediction.csv', index=False)