### Columns present in ML model - obtained from Project 2 - 3. Model training (OHE for bases + Relative positions)
`['Mean SD', 'Mean_Mean', 'Mean Dwelling Time', 'AAACA', 'AAACC', 'AAACT', 'AACAA', 'AACAC', 'AACAG', 'AACAT', 'AACCA', 'AACCC', 'AACCG', 'AACCT', 'AACTA', 'AACTC', 'AACTG', 'AACTT', 'AAGAC', 'AGAAC', 'AGACA', 'AGACC', 'AGACT', 'AGGAC', 'ATAAC', 'ATGAC', 'CAAAC', 'CAGAC', 'CGAAC', 'CGGAC', 'CTAAC', 'CTGAC', 'GAAAC', 'GAACA', 'GAACC', 'GAACT', 'GACAA', 'GACAC', 'GACAG', 'GACAT', 'GACCA', 'GACCC', 'GACCG', 'GACCT', 'GACTA', 'GACTC', 'GACTG', 'GACTT', 'GAGAC', 'GGAAC', 'GGACA', 'GGACC', 'GGACT', 'GGGAC', 'GTAAC', 'GTGAC', 'TAAAC', 'TAACA', 'TAACC', 'TAACT', 'TAGAC', 'TGAAC', 'TGACA', 'TGACC', 'TGACT', 'TGGAC', 'TTAAC', 'TTGAC', 'Relative Position', 'Count_A', 'Count_T', 'Count_C', 'Count_G']`

In [1]:
import pickle
import pandas as pd
# df = pd.read_csv('./Data/full_df mastercopy.csv') # dataset0
df = pd.read_csv('./Data/dataset2.csv')
transcript_name_and_position = df[['Transcript Name', 'Position']]

try: # will run this when dataset is dataset0
    df = df.drop(['Transcript Name','Position','Gene Name'], axis=1)
except:
    df = df.drop(['Transcript Name','Position'], axis=1)
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time
0,GAAAC,2.557500,109.888889,0.006574,2.090,110.00,0.005310
1,AAACT,2.278611,108.716667,0.007237,2.130,110.00,0.006360
2,AACTA,1.804167,95.641667,0.006340,1.720,95.50,0.005585
3,ATAAC,2.715526,84.363158,0.006866,2.465,84.35,0.005705
4,TAACC,3.054474,93.942105,0.006421,3.030,93.80,0.006130
...,...,...,...,...,...,...,...
296485,AAACC,3.918317,99.866337,0.005711,3.890,99.40,0.004960
296486,AACCT,2.483366,82.502970,0.006016,2.380,82.50,0.005030
296487,AGAAC,6.446333,132.900000,0.008041,6.425,134.00,0.007360
296488,GAACT,3.094667,102.260000,0.006118,2.330,102.00,0.005605


### Pre-processing data to make sure that data has columns that are accepted by ML model

In [2]:
# Obtain the counts of the individual bases and use them as features
def count_bases(bases):
    a,t,c,g=0,0,0,0
    for i in bases:
        if i == 'A':
            a+=1
        elif i == 'T':
            t+=1
        elif i == 'C':
            c+=1
        else:
            g+=1
    return a,t,c,g

In [3]:
# Relative positions
relative_positions = [1,2,3] * int(len(df)/3)
relative_positions_df = pd.DataFrame(relative_positions, columns=['Relative Position'])
df = pd.concat([df, relative_positions_df],axis=1)

df['Count_A'], df['Count_T'], df['Count_C'], df['Count_G'] = zip(*df['Bases'].apply(count_bases))
df

Unnamed: 0,Bases,Mean SD,Mean_Mean,Mean Dwelling Time,Median SD,Median_Mean,Median Dwelling Time,Relative Position,Count_A,Count_T,Count_C,Count_G
0,GAAAC,2.557500,109.888889,0.006574,2.090,110.00,0.005310,1,3,0,1,1
1,AAACT,2.278611,108.716667,0.007237,2.130,110.00,0.006360,2,3,1,1,0
2,AACTA,1.804167,95.641667,0.006340,1.720,95.50,0.005585,3,3,1,1,0
3,ATAAC,2.715526,84.363158,0.006866,2.465,84.35,0.005705,1,3,1,1,0
4,TAACC,3.054474,93.942105,0.006421,3.030,93.80,0.006130,2,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
296485,AAACC,3.918317,99.866337,0.005711,3.890,99.40,0.004960,2,3,0,2,0
296486,AACCT,2.483366,82.502970,0.006016,2.380,82.50,0.005030,3,2,1,2,0
296487,AGAAC,6.446333,132.900000,0.008041,6.425,134.00,0.007360,1,3,0,1,1
296488,GAACT,3.094667,102.260000,0.006118,2.330,102.00,0.005605,2,2,1,1,1


### Remove columns based on Pearson Correlation 
Refer to `Project 2 - 3. Model training (OHE for bases + No relative positions)`

In [4]:
df = df.drop(['Median Dwelling Time', 'Median SD', 'Median_Mean'], axis=1)
df_dummies = pd.get_dummies(df['Bases'], drop_first=True)
df = pd.concat([df, df_dummies],axis=1).drop(['Bases'],axis=1)
df

Unnamed: 0,Mean SD,Mean_Mean,Mean Dwelling Time,Relative Position,Count_A,Count_T,Count_C,Count_G,AAACA,AAACC,...,TAACC,TAACT,TAGAC,TGAAC,TGACA,TGACC,TGACT,TGGAC,TTAAC,TTGAC
0,2.557500,109.888889,0.006574,1,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.278611,108.716667,0.007237,2,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.804167,95.641667,0.006340,3,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.715526,84.363158,0.006866,1,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.054474,93.942105,0.006421,2,2,1,2,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296485,3.918317,99.866337,0.005711,2,3,0,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
296486,2.483366,82.502970,0.006016,3,2,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296487,6.446333,132.900000,0.008041,1,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
296488,3.094667,102.260000,0.006118,2,2,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Import ML Model to predict on test dataset

In [5]:
pickled_model = pickle.load(open('gradientboostingclassifier.sav', 'rb'))
predicted_scores = pickled_model.predict_proba(df)
predicted_scores

Feature names must be in the same order as they were in fit.



array([[9.99446904e-01, 5.53095958e-04],
       [9.97360500e-01, 2.63950032e-03],
       [9.92361565e-01, 7.63843473e-03],
       ...,
       [9.99801092e-01, 1.98907965e-04],
       [9.99115903e-01, 8.84096767e-04],
       [7.00450227e-01, 2.99549773e-01]])

In [6]:
predicted_full_df = pd.concat([transcript_name_and_position, pd.DataFrame(predicted_scores[:,1], columns = ['score'])], axis=1)
predicted_full_df['transcript_id'], predicted_full_df['transcript_position'] = predicted_full_df['Transcript Name'], predicted_full_df['Position']
predicted_full_df = predicted_full_df.drop(['Transcript Name', 'Position'], axis=1)
predicted_full_df = predicted_full_df[['transcript_id', 'transcript_position', 'score']]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
0,AT1G01050.1,154,0.000553
1,AT1G01050.1,155,0.002640
2,AT1G01050.1,156,0.007638
3,AT1G01050.1,164,0.001980
4,AT1G01050.1,165,0.022792
...,...,...,...
296485,AT5G67590.1,663,0.000633
296486,AT5G67590.1,664,0.003380
296487,AT5G67600.1,153,0.000199
296488,AT5G67600.1,154,0.000884


### Post-processing to obtain rows for DRACH sites only

In [7]:
indicator = pd.DataFrame([0,1,0] * int(len(predicted_full_df) / 3), columns = ['Indicator'])
predicted_full_df = pd.concat([predicted_full_df, indicator], axis=1)
predicted_full_df = predicted_full_df[predicted_full_df['Indicator'] == 1]
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score,Indicator
1,AT1G01050.1,155,0.002640,1
4,AT1G01050.1,165,0.022792,1
7,AT1G01050.1,347,0.000458,1
10,AT1G01050.1,435,0.000314,1
13,AT1G01050.2,463,0.000305,1
...,...,...,...,...
296476,AT5G67590.1,367,0.000535,1
296479,AT5G67590.1,444,0.000421,1
296482,AT5G67590.1,465,0.000426,1
296485,AT5G67590.1,663,0.000633,1


In [8]:
predicted_full_df = predicted_full_df.drop(['Indicator'], axis=1)
predicted_full_df

Unnamed: 0,transcript_id,transcript_position,score
1,AT1G01050.1,155,0.002640
4,AT1G01050.1,165,0.022792
7,AT1G01050.1,347,0.000458
10,AT1G01050.1,435,0.000314
13,AT1G01050.2,463,0.000305
...,...,...,...
296476,AT5G67590.1,367,0.000535
296479,AT5G67590.1,444,0.000421
296482,AT5G67590.1,465,0.000426
296485,AT5G67590.1,663,0.000633


In [9]:
predicted_full_df.to_csv('./Data/geneandtonic_dataset2_1.csv', index=False)