In [1]:
# Import all the necessary libraries
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import ast
import json

# Do not print warnings
warnings.filterwarnings('ignore')


##### Read the necessary datasets


In [2]:
causal_prior = pd.read_csv('data/CausalPriorFiltered.csv')
gene_status = pd.read_csv('data/OncoKBCancerGeneList.csv')
site_effects = pd.read_csv('data/SiteEffects.csv')


In [3]:
def getGeneType(geneName):
    ''' This function returns -1 -> Tumor Supressor Gene
                                1 -> Oncogene
                                0 -> For Both
                                -2 -> For Not Both
                                -3 -> For Unknown
    '''
    DownGeneLabel = pd.read_csv('data/OncoKBCancerGeneList.csv')
    a = DownGeneLabel.loc[(DownGeneLabel['HugoSymbol']) == (geneName)]

    if(a.empty):
        return -3

    isOncogene = (a['IsOncogene']).iloc[0]
    isTumorSupressor = (a['IsTumorSuppressorGene']).iloc[0]

    if(isOncogene == 'No' and isTumorSupressor == 'Yes'):
        return -1
    elif(isOncogene == 'Yes' and isTumorSupressor == 'No'):
        return 1
    elif(isOncogene == 'Yes' and isTumorSupressor == 'Yes'):
        return 0
    elif(isOncogene == 'No' and isTumorSupressor == 'No'):
        return -2


def upstream_gene_status(geneName):
    # Return if gene is neither Oncogene or Tumor Supressor
    geneType = getGeneType(geneName)
    if(geneType == 0 or geneType == -2 or geneType == -3):
        print('The gene is neither Oncogene or Tumor Supressor')
        return

    # Get upstream genes
    up_genes = causal_prior[causal_prior['interaction'] == geneName]
    up_genes.reset_index(drop=True, inplace=True)

    # Decode InteractionType to 1 and -1
    up_genes['InteractCode'] = 0
    up_genes['InteractCode'] = up_genes['InteractionType'].apply(lambda x: 1 if str(
        x) == 'phosphorylates' or str(x) == 'acetylates' or str(x) == 'methylates' else -1)

    # Construct a new column i.e. 'Sites' splitting sites of each gene
    up_genes['Sites'] = ''
    up_genes['Sites'] = up_genes['Location'].str.split(';')

    # Extract site information of that Gene geneName
    gene_sites = site_effects[site_effects['HugoSymbol'] == geneName]
    gene_sites.reset_index(drop=True, inplace=True)

    # Take the Activating or Inhibiting status(1, -1) from gene_sites and put into up_genes['Sites']
    up_genes['SiteStatus'] = ''
    sites = {}
    for index, row in gene_sites.iterrows():
        sites[row['Location']] = row['Status']

    for index, row in up_genes.iterrows():
        temp = []
        for site in row['Sites']:
            if(site in sites):
                temp.append(sites[site])
            else:
                temp.append(0)
        up_genes.at[index, 'SiteStatus'] = temp

    # Add another column for Tumor Suppressor or Oncogene label
    up_genes['DownGeneLabel'] = geneType

    # Now finally doing computation
    up_genes['Result'] = ''
    for index, row in up_genes.iterrows():
        temp = []
        for site in row['SiteStatus']:
            result = int(site) * \
                int(row['InteractCode']) * int(row['DownGeneLabel'])
            temp.append(result)

        up_genes.at[index, 'Result'] = temp

    # Labal genes type
    up_genes['RootGeneLabel'] = ''
    for index, row in up_genes.iterrows():
        up_genes.at[index, 'RootGeneLabel'] = getGeneType(row['Gene'])

    return up_genes


In [4]:
def get_information_from_down_gene(gene):
    # Lets get all the downstream gene of gene
    down_nodes = causal_prior[causal_prior['Gene'] == gene]
    down_nodes.reset_index(drop=True, inplace=True)

    # filter genes which is listed in OncoKBCancerGeneList
    col1 = []
    col2 = []
    for i in down_nodes['Output']:
        if(getGeneType(i) == 1 or getGeneType(i) == -1):
            col1.append(i)
            col2.append(getGeneType(i))
    down_known_gene = pd.DataFrame(
        list(zip(col1, col2)), columns=['Gene', 'Type'])

    # final step is to get status from genes
    final_result = pd.DataFrame()
    for i in down_known_gene['Gene']:
        output = upstream_gene_status(i)
        result = output.loc[output['Gene'] == gene]
        final_result = final_result.append(result, ignore_index=True)

    return final_result


In [5]:
get_information_from_down_gene("RB1")


KeyError: 'interaction'

In [7]:
output = pd.DataFrame()
genes = causal_prior['Gene'].unique()
for gene in genes:
    a = get_information_from_down_gene(gene)
    output = output.append(a, ignore_index=True)
output


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel
0,RAF1,phosphorylates,RB1,S608,1,[S608],[0],-1,[0],1
1,RAF1,phosphorylates,MYC,T8,1,[T8],[0],1,[0],1
2,RAF1,phosphorylates,HRAS,Y137,1,[Y137],[0],1,[0],1
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"[S226, S222]","[1, 1]",1,"[1, 1]",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"[S218, T292, S298, S222, T286, T386]","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1
...,...,...,...,...,...,...,...,...,...,...
5984,ELF1,dephosphorylates,RB1,T821;T826,-1,"[T821, T826]","[1, -1]",-1,"[1, -1]",-3
5985,FOXM1,acetylates,TP53,K373,1,[K373],[-1],-1,[1],-3
5986,KAT6A,acetylates,TP53,K382;K120,1,"[K382, K120]","[-1, -1]",-1,"[1, 1]",-2
5987,DNMT3A,phosphorylates,MECOM,S436,1,[S436],[0],1,[0],-1


In [6]:
output.to_csv('data/outputs/final/final_result.csv', index=False)


NameError: name 'output' is not defined

In [6]:
get_information_from_down_gene('RAF1')


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel
0,RAF1,phosphorylates,RB1,S608,1,[S608],[0],-1,[0],1
1,RAF1,phosphorylates,MYC,T8,1,[T8],[0],1,[0],1
2,RAF1,phosphorylates,HRAS,Y137,1,[Y137],[0],1,[0],1
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"[S226, S222]","[1, 1]",1,"[1, 1]",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"[S218, T292, S298, S222, T286, T386]","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1
5,RAF1,phosphorylates,BCL2L11,S602;T599,1,"[S602, T599]","[0, 0]",-1,"[0, 0]",1
6,RAF1,phosphorylates,MAPK1,T182;S29;T185;Y184;Y187,1,"[T182, S29, T185, Y184, Y187]","[0, 1, 1, 0, 1]",1,"[0, 1, 1, 0, 1]",1
7,RAF1,phosphorylates,MAPK3,Y204;Y203;T202;T201,1,"[Y204, Y203, T202, T201]","[1, 0, 1, 0]",1,"[1, 0, 1, 0]",1
8,RAF1,dephosphorylates,BRAF,T753;T401;S750;S151,-1,"[T753, T401, S750, S151]","[-1, -1, -1, -1]",1,"[1, 1, 1, 1]",1
9,RAF1,dephosphorylates,KRAS,S181,-1,[S181],[1],1,[-1],1


##### Doing various analysis on the resultant dataset


In [6]:
output = pd.read_csv('data/outputs/final/final_result.csv')

output['LabelStat'] = ''
output['Predicted'] = ''
for index, row in output.iterrows():
    d = {1: 0, -1: 0, 0: 0}
    for item in ast.literal_eval(row['Result']):
        if(int(item) == 1):
            d[1] += 1
        if(int(item) == -1):
            d[-1] += 1
        if(int(item) == 0):
            d[0] += 1
    output.at[index, 'LabelStat'] = d
    output.at[index, 'Predicted'] = max(d, key=d.get)

output


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,RAF1,phosphorylates,RB1,S608,1,['S608'],[0],-1,[0],1,"{1: 0, -1: 0, 0: 1}",0
1,RAF1,phosphorylates,MYC,T8,1,['T8'],[0],1,[0],1,"{1: 0, -1: 0, 0: 1}",0
2,RAF1,phosphorylates,HRAS,Y137,1,['Y137'],[0],1,[0],1,"{1: 0, -1: 0, 0: 1}",0
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0, 0: 0}",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1,"{1: 3, -1: 2, 0: 1}",1
...,...,...,...,...,...,...,...,...,...,...,...,...
5984,ELF1,dephosphorylates,RB1,T821;T826,-1,"['T821', 'T826']","[1, -1]",-1,"[1, -1]",-3,"{1: 1, -1: 1, 0: 0}",1
5985,FOXM1,acetylates,TP53,K373,1,['K373'],[-1],-1,[1],-3,"{1: 1, -1: 0, 0: 0}",1
5986,KAT6A,acetylates,TP53,K382;K120,1,"['K382', 'K120']","[-1, -1]",-1,"[1, 1]",-2,"{1: 2, -1: 0, 0: 0}",1
5987,DNMT3A,phosphorylates,MECOM,S436,1,['S436'],[0],1,[0],-1,"{1: 0, -1: 0, 0: 1}",0


In [21]:
output[output['Gene'] == 'RAF1']


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,RAF1,phosphorylates,RB1,S608,1,['S608'],[0],-1,[0],1,"{1: 0, -1: 0, 0: 1}",0
1,RAF1,phosphorylates,MYC,T8,1,['T8'],[0],1,[0],1,"{1: 0, -1: 0, 0: 1}",0
2,RAF1,phosphorylates,HRAS,Y137,1,['Y137'],[0],1,[0],1,"{1: 0, -1: 0, 0: 1}",0
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0, 0: 0}",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1,"{1: 3, -1: 2, 0: 1}",1
5,RAF1,phosphorylates,BCL2L11,S602;T599,1,"['S602', 'T599']","[0, 0]",-1,"[0, 0]",1,"{1: 0, -1: 0, 0: 2}",0
6,RAF1,phosphorylates,MAPK1,T182;S29;T185;Y184;Y187,1,"['T182', 'S29', 'T185', 'Y184', 'Y187']","[0, 1, 1, 0, 1]",1,"[0, 1, 1, 0, 1]",1,"{1: 3, -1: 0, 0: 2}",1
7,RAF1,phosphorylates,MAPK3,Y204;Y203;T202;T201,1,"['Y204', 'Y203', 'T202', 'T201']","[1, 0, 1, 0]",1,"[1, 0, 1, 0]",1,"{1: 2, -1: 0, 0: 2}",1
8,RAF1,dephosphorylates,BRAF,T753;T401;S750;S151,-1,"['T753', 'T401', 'S750', 'S151']","[-1, -1, -1, -1]",1,"[1, 1, 1, 1]",1,"{1: 4, -1: 0, 0: 0}",1
9,RAF1,dephosphorylates,KRAS,S181,-1,['S181'],[1],1,[-1],1,"{1: 0, -1: 1, 0: 0}",-1


In [23]:
summary = pd.DataFrame(columns=[
                       'Gene', 'OncoPredictsGenes', 'TMPredictsGenes', 'BothCount', 'ActualLabel'])


for gene in output['Gene'].unique():
    d = {1: 0, -1: 0, 0: 0}
    gene_list = output[output['Gene'] == gene]

    for index, row in gene_list.iterrows():
        if(row['Predicted'] == 1):
            d[1] += 1
        if(row['Predicted'] == -1):
            d[-1] += 1
        if(row['Predicted'] == 0):
            d[0] += 1

    temp = pd.DataFrame(data={'Gene': [row['Gene']], 'OncoPredictsGenes': [d[1]], 'TMPredictsGenes': [
                        d[-1]], 'BothCount': [d[0]], 'ActualLabel': [getGeneType(row['Gene'])]})
    summary = summary.append(temp, ignore_index=True)

summary


Unnamed: 0,Gene,OncoPredictsGenes,TMPredictsGenes,BothCount,ActualLabel
0,RAF1,6,1,4,1
1,TEAD1,2,0,2,-3
2,ATM,3,16,14,-1
3,OXSR1,0,0,1,-3
4,RBPJ,0,0,1,-3
...,...,...,...,...,...
1495,ELF1,1,0,0,-3
1496,FOXM1,1,0,0,-3
1497,KAT6A,1,0,0,-2
1498,DNMT3A,0,0,1,-1


In [24]:
summary.to_csv('data/outputs/final/summarized_genes.csv', index=False)


#### Calculate the accuracy


In [26]:
filter_summary = summary[summary['ActualLabel'] != -3]
filter_summary = filter_summary[filter_summary['ActualLabel'] != -2]
filter_summary = filter_summary[filter_summary['ActualLabel'] != 0]
filter_summary.reset_index(drop=True, inplace=True)
filter_summary


Unnamed: 0,Gene,OncoPredictsGenes,TMPredictsGenes,BothCount,ActualLabel
0,RAF1,6,1,4,1
1,ATM,3,16,14,-1
2,ATR,0,13,9,-1
3,SGK1,6,6,0,1
4,GNB1,7,0,1,1
...,...,...,...,...,...
213,SMG1,0,1,0,-1
214,RAC2,1,0,1,1
215,SMAD2,1,1,0,-1
216,DUSP4,0,2,0,-1


In [27]:
total_count = filter_summary.shape[0]
correct_count = 0
wrong_count = 0
for index, row in filter_summary.iterrows():
    d = {1: row['OncoPredictsGenes'], -1: row['TMPredictsGenes']}
    if(int(row['ActualLabel']) == max(d, key=d.get)):
        correct_count += 1
    else:
        wrong_count += 1

print(total_count)
print(correct_count)
print(wrong_count)


218
166
52


#### Accuracy percentage


In [28]:
accuracy = (correct_count / total_count) * 100
accuracy


76.14678899082568

In [58]:
wrong_pred_genes = pd.DataFrame()
correct_pred_genes = pd.DataFrame()

for index, row in filter_summary.iterrows():
    d = {1: row['OncoPredictsGenes'], -1: row['TMPredictsGenes']}
    if(int(row['ActualLabel']) == max(d, key=d.get)):
        correct_pred_genes = correct_pred_genes.append(row, ignore_index=True)
    else:
        wrong_pred_genes = wrong_pred_genes.append(row, ignore_index=True)

wrong_pred_genes.to_csv('data/outputs/final/wrong_pred_genes.csv', index=False)
correct_pred_genes.to_csv(
    'data/outputs/final/correct_pred_genes.csv', index=False)


In [59]:
correct_pred_genes


Unnamed: 0,Gene,OncoPredictsGenes,TMPredictsGenes,ActualLabel
0,RAF1,6.0,1.0,1.0
1,ATM,4.0,18.0,-1.0
2,ATR,0.0,14.0,-1.0
3,SGK1,6.0,6.0,1.0
4,GNB1,7.0,0.0,1.0
...,...,...,...,...
157,PPM1D,1.0,0.0,1.0
158,IL7R,2.0,0.0,1.0
159,SMG1,0.0,1.0,-1.0
160,RAC2,1.0,0.0,1.0


In [60]:
correct_genes = pd.DataFrame()
for index, row in correct_pred_genes.iterrows():
    temp = output[output['Gene'] == row['Gene']]
    correct_genes = correct_genes.append(temp, ignore_index=True)

correct_genes.to_csv('data/outputs/final/correct_genes.csv', index=False)


In [61]:
wrong_genes = pd.DataFrame()
for index, row in wrong_pred_genes.iterrows():
    temp = output[output['Gene'] == row['Gene']]
    wrong_genes = wrong_genes.append(temp, ignore_index=True)

wrong_genes.to_csv('data/outputs/final/wrong_genes.csv', index=False)


In [62]:
correct_genes


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0}",1
1,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1,"{1: 3, -1: 2}",1
2,RAF1,phosphorylates,MAPK1,T182;S29;T185;Y184;Y187,1,"['T182', 'S29', 'T185', 'Y184', 'Y187']","[0, 1, 1, 0, 1]",1,"[0, 1, 1, 0, 1]",1,"{1: 3, -1: 0}",1
3,RAF1,phosphorylates,MAPK3,Y204;Y203;T202;T201,1,"['Y204', 'Y203', 'T202', 'T201']","[1, 0, 1, 0]",1,"[1, 0, 1, 0]",1,"{1: 2, -1: 0}",1
4,RAF1,dephosphorylates,BRAF,T753;T401;S750;S151,-1,"['T753', 'T401', 'S750', 'S151']","[-1, -1, -1, -1]",1,"[1, 1, 1, 1]",1,"{1: 4, -1: 0}",1
...,...,...,...,...,...,...,...,...,...,...,...,...
1013,IL7R,phosphorylates,STAT5A,Y694,1,['Y694'],[1],1,[1],1,"{1: 1, -1: 0}",1
1014,SMG1,phosphorylates,TP53,S15,1,['S15'],[1],-1,[-1],-1,"{1: 0, -1: 1}",-1
1015,RAC2,phosphorylates,VAV2,Y172,1,['Y172'],[1],1,[1],1,"{1: 1, -1: 0}",1
1016,DUSP4,dephosphorylates,MAPK3,Y204;T202,-1,"['Y204', 'T202']","[1, 1]",1,"[-1, -1]",-1,"{1: 0, -1: 2}",-1


In [63]:
wrong_genes


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,MAPK1,phosphorylates,GTF2I,S668;S633;S674;S627,1,"['S668', 'S633', 'S674', 'S627']","[1, 0, 1, 0]",1,"[1, 0, 1, 0]",1,"{1: 2, -1: 0}",1
1,MAPK1,phosphorylates,FGFR1,S777,1,['S777'],[-1],1,[-1],1,"{1: 0, -1: 1}",-1
2,MAPK1,phosphorylates,LRP6,S1490;T1572,1,"['S1490', 'T1572']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0}",1
3,MAPK1,phosphorylates,MAPK3,Y204,1,['Y204'],[1],1,[1],1,"{1: 1, -1: 0}",1
4,MAPK1,dephosphorylates,MAP2K1,S218;S222,-1,"['S218', 'S222']","[1, 1]",1,"[-1, -1]",1,"{1: 0, -1: 2}",-1
...,...,...,...,...,...,...,...,...,...,...,...,...
274,SMAD4,dephosphorylates,SMAD3,S425;S423,-1,"['S425', 'S423']","[1, 1]",-1,"[1, 1]",-1,"{1: 2, -1: 0}",1
275,SMAD4,phosphorylates,SMAD2,S467;T8;S245;S465;S464;S255;T220;S250,1,"['S467', 'T8', 'S245', 'S465', 'S464', 'S255',...","[1, 1, 1, 1, 1, 1, 1, 1]",-1,"[-1, -1, -1, -1, -1, -1, -1, -1]",-1,"{1: 0, -1: 8}",-1
276,DNMT1,acetylates,TET2,K111;K110,1,"['K111', 'K110']","[1, 1]",-1,"[-1, -1]",1,"{1: 0, -1: 2}",-1
277,SMAD2,phosphorylates,CTNNB1,Y654,1,['Y654'],[1],1,[1],-1,"{1: 1, -1: 0}",1


---


### Next steps (New Approach)

##### Ignore type 0 from "LabelStat" column and put zero predicted if both 1 and -1 are equal.


In [40]:
output = pd.read_csv('data/outputs/final/final_result.csv')

output['LabelStat'] = ''
output['Predicted'] = ''
for index, row in output.iterrows():
    d = {1: 0, -1: 0}
    for item in ast.literal_eval(row['Result']):
        if(int(item) == 1):
            d[1] += 1
        if(int(item) == -1):
            d[-1] += 1
        if(int(item) == 0):
            pass
    output.at[index, 'LabelStat'] = d

    if(d[1] == d[-1]):
        output.at[index, 'Predicted'] = 0
    else:
        output.at[index, 'Predicted'] = max(d, key=d.get)
output


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,RAF1,phosphorylates,RB1,S608,1,['S608'],[0],-1,[0],1,"{1: 0, -1: 0}",0
1,RAF1,phosphorylates,MYC,T8,1,['T8'],[0],1,[0],1,"{1: 0, -1: 0}",0
2,RAF1,phosphorylates,HRAS,Y137,1,['Y137'],[0],1,[0],1,"{1: 0, -1: 0}",0
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0}",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1,"{1: 3, -1: 2}",1
...,...,...,...,...,...,...,...,...,...,...,...,...
5984,ELF1,dephosphorylates,RB1,T821;T826,-1,"['T821', 'T826']","[1, -1]",-1,"[1, -1]",-3,"{1: 1, -1: 1}",0
5985,FOXM1,acetylates,TP53,K373,1,['K373'],[-1],-1,[1],-3,"{1: 1, -1: 0}",1
5986,KAT6A,acetylates,TP53,K382;K120,1,"['K382', 'K120']","[-1, -1]",-1,"[1, 1]",-2,"{1: 2, -1: 0}",1
5987,DNMT3A,phosphorylates,MECOM,S436,1,['S436'],[0],1,[0],-1,"{1: 0, -1: 0}",0


In [51]:
output = output[output['Predicted'] != 0]
output.reset_index(drop=True, inplace=True)


In [52]:
output[output['Gene'] == 'RAF1']


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel,LabelStat,Predicted
0,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1,"{1: 2, -1: 0}",1
1,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1,"{1: 3, -1: 2}",1
2,RAF1,phosphorylates,MAPK1,T182;S29;T185;Y184;Y187,1,"['T182', 'S29', 'T185', 'Y184', 'Y187']","[0, 1, 1, 0, 1]",1,"[0, 1, 1, 0, 1]",1,"{1: 3, -1: 0}",1
3,RAF1,phosphorylates,MAPK3,Y204;Y203;T202;T201,1,"['Y204', 'Y203', 'T202', 'T201']","[1, 0, 1, 0]",1,"[1, 0, 1, 0]",1,"{1: 2, -1: 0}",1
4,RAF1,dephosphorylates,BRAF,T753;T401;S750;S151,-1,"['T753', 'T401', 'S750', 'S151']","[-1, -1, -1, -1]",1,"[1, 1, 1, 1]",1,"{1: 4, -1: 0}",1
5,RAF1,dephosphorylates,KRAS,S181,-1,['S181'],[1],1,[-1],1,"{1: 0, -1: 1}",-1
6,RAF1,phosphorylates,NRAS,S89,1,['S89'],[1],1,[1],1,"{1: 1, -1: 0}",1


In [53]:
summary = pd.DataFrame(
    columns=['Gene', 'OncoPredictsGenes', 'TMPredictsGenes', 'ActualLabel'])

for gene in output['Gene'].unique():
    d = {1: 0, -1: 0}
    gene_list = output[output['Gene'] == gene]

    for index, row in gene_list.iterrows():
        if(row['Predicted'] == 1):
            d[1] += 1
        if(row['Predicted'] == -1):
            d[-1] += 1
        if(row['Predicted'] == 0):
            pass

    temp = pd.DataFrame(data={'Gene': [row['Gene']], 'OncoPredictsGenes': [
                        d[1]], 'TMPredictsGenes': [d[-1]], 'ActualLabel': [getGeneType(row['Gene'])]})
    summary = summary.append(temp, ignore_index=True)

summary


Unnamed: 0,Gene,OncoPredictsGenes,TMPredictsGenes,ActualLabel
0,RAF1,6,1,1
1,TEAD1,2,0,-3
2,ATM,4,18,-1
3,GRAP2,7,0,-3
4,ATR,0,14,-1
...,...,...,...,...
1337,PPP2R2C,1,0,-3
1338,PSPN,1,0,-3
1339,ACTR1A,1,0,-3
1340,FOXM1,1,0,-3


In [54]:
summary.to_csv('data/outputs/final/new_summarized_genes.csv', index=False)


In [55]:
filter_summary = summary[summary['ActualLabel'] != -3]
filter_summary = filter_summary[filter_summary['ActualLabel'] != -2]
filter_summary = filter_summary[filter_summary['ActualLabel'] != 0]
filter_summary.reset_index(drop=True, inplace=True)
filter_summary


Unnamed: 0,Gene,OncoPredictsGenes,TMPredictsGenes,ActualLabel
0,RAF1,6,1,1
1,ATM,4,18,-1
2,ATR,0,14,-1
3,SGK1,6,6,1
4,GNB1,7,0,1
...,...,...,...,...
196,IL7R,2,0,1
197,SMG1,0,1,-1
198,RAC2,1,0,1
199,SMAD2,1,1,-1


In [56]:
total_count = filter_summary.shape[0]
correct_count = 0
wrong_count = 0
for index, row in filter_summary.iterrows():
    d = {1: row['OncoPredictsGenes'], -1: row['TMPredictsGenes']}
    if(int(row['ActualLabel']) == max(d, key=d.get)):
        correct_count += 1
    else:
        wrong_count += 1

print(total_count)
print(correct_count)
print(wrong_count)


201
162
39


In [57]:
accuracy = (correct_count / total_count) * 100
accuracy


80.59701492537313

---


#### Detection of Feedback Interaction


In [3]:
interaction = pd.read_csv('data/outputs/final/final_result.csv')

interaction = interaction[interaction['DownGeneLabel'] != -3]
interaction = interaction[interaction['DownGeneLabel'] != 0]
interaction = interaction[interaction['DownGeneLabel'] != -2]

interaction = interaction[interaction['RootGeneLabel'] != -3]
interaction = interaction[interaction['RootGeneLabel'] != 0]
interaction = interaction[interaction['RootGeneLabel'] != -2]

interaction.reset_index(drop=True, inplace=True)
interaction


Unnamed: 0,Gene,InteractionType,Output,Location,InteractCode,Sites,SiteStatus,DownGeneLabel,Result,RootGeneLabel
0,RAF1,phosphorylates,RB1,S608,1,['S608'],[0],-1,[0],1
1,RAF1,phosphorylates,MYC,T8,1,['T8'],[0],1,[0],1
2,RAF1,phosphorylates,HRAS,Y137,1,['Y137'],[0],1,[0],1
3,RAF1,phosphorylates,MAP2K2,S226;S222,1,"['S226', 'S222']","[1, 1]",1,"[1, 1]",1
4,RAF1,phosphorylates,MAP2K1,S218;T292;S298;S222;T286;T386,1,"['S218', 'T292', 'S298', 'S222', 'T286', 'T386']","[1, -1, 1, 1, -1, 0]",1,"[1, -1, 1, 1, -1, 0]",1
...,...,...,...,...,...,...,...,...,...,...
1744,SMAD2,phosphorylates,CTNNB1,Y654,1,['Y654'],[1],1,[1],-1
1745,SMAD2,phosphorylates,SMAD3,S425;S423,1,"['S425', 'S423']","[1, 1]",-1,"[-1, -1]",-1
1746,DUSP4,dephosphorylates,MAPK3,Y204;T202,-1,"['Y204', 'T202']","[1, 1]",1,"[-1, -1]",-1
1747,DUSP4,dephosphorylates,MAPK1,T185;Y187,-1,"['T185', 'Y187']","[1, 1]",1,"[-1, -1]",-1


In [4]:
def prepare_row(row, idx):
    temp = pd.DataFrame(data={
        'Gene': [row['Gene']],
        'InteractionType': [row['InteractionType']],
        'Output': [row['Output']],
        'Site': [ast.literal_eval(row['Sites'])[idx]],
        'RootGeneLabel': [row['RootGeneLabel']],
        'OutputGeneLabel': [row['DownGeneLabel']],
        'InteractionLabel': [row['InteractCode']],
        'SiteStatus': [ast.literal_eval(row['SiteStatus'])[idx]]
    })
    return temp


In [5]:
expected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
unexpected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])

for index, row in interaction.iterrows():
    # Both genes are Tumor Suppressor or Oncogene
    if(row['DownGeneLabel'] == row['RootGeneLabel']):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    unexpected_interaction = unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    expected_interaction = expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    unexpected_interaction = unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    expected_interaction = expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

    # Both genes are not same
    if(row['DownGeneLabel'] != row['RootGeneLabel']):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    unexpected_interaction = unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    expected_interaction = expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    unexpected_interaction = unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    expected_interaction = expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

expected_interaction


Unnamed: 0,Gene,InteractionType,Output,Site,RootGeneLabel,OutputGeneLabel,InteractionLabel,SiteStatus
0,RAF1,phosphorylates,MAP2K2,S226,1,1,1,1
1,RAF1,phosphorylates,MAP2K2,S222,1,1,1,1
2,RAF1,phosphorylates,MAP2K1,S218,1,1,1,1
3,RAF1,phosphorylates,MAP2K1,S298,1,1,1,1
4,RAF1,phosphorylates,MAP2K1,S222,1,1,1,1
...,...,...,...,...,...,...,...,...
1825,SMAD2,phosphorylates,SMAD3,S423,-1,-1,1,1
1826,DUSP4,dephosphorylates,MAPK3,Y204,-1,1,-1,1
1827,DUSP4,dephosphorylates,MAPK3,T202,-1,1,-1,1
1828,DUSP4,dephosphorylates,MAPK1,T185,-1,1,-1,1


In [6]:
expected_interaction.to_csv(
    'data/outputs/final2/expected_interaction.csv', index=False)
unexpected_interaction.to_csv(
    'data/outputs/final2/unexpected_interaction.csv', index=False)


In [7]:
grouped_unexpected_interaction = unexpected_interaction.groupby(
    ['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()

grouped_unexpected_interaction.to_csv('data/outputs/final2/unexpected_interaction_grouped.csv', index=False)

In [8]:
grouped_expected_interaction = expected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()

grouped_expected_interaction.to_csv('data/outputs/final2/expected_interaction_grouped.csv', index=False)

#### To create seperate file for each of the following
From OG to OG

From OG to TS

From TS to TS

From TS to OG

In [12]:
og_og_expected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
og_ts_expected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
ts_ts_expected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
ts_og_expected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
    
og_og_unexpected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
og_ts_unexpected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
ts_ts_unexpected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])
ts_og_unexpected_interaction = pd.DataFrame(
    columns=['Gene', 'InteractionType', 'Output', 'Site', 'RootGeneLabel', 'OutputGeneLabel', 'InteractionLabel', 'SiteStatus'])


for index, row in interaction.iterrows():
    # Both genes are Tumor Suppressor (TS -> TS)
    if(row['DownGeneLabel'] == -1 and row['RootGeneLabel'] == -1):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    ts_ts_unexpected_interaction = ts_ts_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    ts_ts_expected_interaction = ts_ts_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    ts_ts_unexpected_interaction = ts_ts_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    ts_ts_expected_interaction = ts_ts_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

    # Both genes are Oncogene (OG -> OG)
    if(row['DownGeneLabel'] == 1 and row['RootGeneLabel'] == 1):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    og_og_unexpected_interaction = og_og_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    og_og_expected_interaction = og_og_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    og_og_unexpected_interaction = og_og_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    og_og_expected_interaction = og_og_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

    # One gene is Tumor Suppressor and other is Oncogene (TS -> OG)
    if(row['RootGeneLabel'] == -1 and row['DownGeneLabel'] == 1):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    ts_og_unexpected_interaction = ts_og_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    ts_og_expected_interaction = ts_og_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    ts_og_unexpected_interaction = ts_og_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    ts_og_expected_interaction = ts_og_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

    # One gene is Oncogene and other is Tumor Suppressor (OG -> TS)
    if(row['RootGeneLabel'] == 1 and row['DownGeneLabel'] == -1 ):
        if(row['InteractCode'] == 1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == 1):
                    og_ts_unexpected_interaction = og_ts_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == -1):
                    og_ts_expected_interaction = og_ts_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

        if(row['InteractCode'] == -1):
            for idx, site in enumerate(json.loads(row['SiteStatus'])):
                # This is condition for Unexpected Interaction
                if(site == -1):
                    og_ts_unexpected_interaction = og_ts_unexpected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)

                # This is condition for Expected Interaction
                if(site == 1):
                    og_ts_expected_interaction = og_ts_expected_interaction.append(
                        prepare_row(row, idx), ignore_index=True)


In [13]:
ts_ts_expected_interaction.to_csv('data/outputs/final3/ts_ts_expected_interaction.csv', index=False)
ts_ts_unexpected_interaction.to_csv('data/outputs/final3/ts_ts_unexpected_interaction.csv', index=False)
og_og_expected_interaction.to_csv('data/outputs/final3/og_og_expected_interaction.csv', index=False)
og_og_unexpected_interaction.to_csv('data/outputs/final3/og_og_unexpected_interaction.csv', index=False)
ts_og_expected_interaction.to_csv('data/outputs/final3/ts_og_expected_interaction.csv', index=False)
ts_og_unexpected_interaction.to_csv('data/outputs/final3/ts_og_unexpected_interaction.csv', index=False)
og_ts_expected_interaction.to_csv('data/outputs/final3/og_ts_expected_interaction.csv', index=False)
og_ts_unexpected_interaction.to_csv('data/outputs/final3/og_ts_unexpected_interaction.csv', index=False)


In [15]:
grouped_ts_ts_expected_interaction = ts_ts_expected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_ts_ts_unexpected_interaction = ts_ts_unexpected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_og_og_expected_interaction = og_og_expected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_og_og_unexpected_interaction = og_og_unexpected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_ts_og_expected_interaction = ts_og_expected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_ts_og_unexpected_interaction = ts_og_unexpected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_og_ts_expected_interaction = og_ts_expected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()
grouped_og_ts_unexpected_interaction = og_ts_unexpected_interaction.groupby(['Gene', 'InteractionType', 'Output'])['Site'].apply(', '.join).reset_index()


In [16]:
grouped_ts_ts_expected_interaction.to_csv('data/outputs/final3/grouped_ts_ts_expected_interaction.csv', index=False)
grouped_ts_ts_unexpected_interaction.to_csv('data/outputs/final3/grouped_ts_ts_unexpected_interaction.csv', index=False)
grouped_og_og_expected_interaction.to_csv('data/outputs/final3/grouped_og_og_expected_interaction.csv', index=False)
grouped_og_og_unexpected_interaction.to_csv('data/outputs/final3/grouped_og_og_unexpected_interaction.csv', index=False)
grouped_ts_og_expected_interaction.to_csv('data/outputs/final3/grouped_ts_og_expected_interaction.csv', index=False)
grouped_ts_og_unexpected_interaction.to_csv('data/outputs/final3/grouped_ts_og_unexpected_interaction.csv', index=False)
grouped_og_ts_expected_interaction.to_csv('data/outputs/final3/grouped_og_ts_expected_interaction.csv', index=False)
grouped_og_ts_unexpected_interaction.to_csv('data/outputs/final3/grouped_og_ts_unexpected_interaction.csv', index=False)


|  OG-OG        |   TS-TS   |   TS-OG   |   OG-TS |  OG-OG |
| --------      | -------- | ---------- | -------- | ------ |
| expected      |   2      |   2       |   2       |   2 | 
| un-expected   |  2       |   2       |   2       |   2 |