In [1]:
import pandas as pd
import os

In [2]:
class task_1_1(object):
    def __init__(self):
        '''
        Inputs:
        
            - Summary df = DisGeNET dataset (saved on tsv file) filtered by disease (the ours is C0007193).
            
            - Multi-entry gene = gene with 2 comma-separated uniprot entries on the same row.
            
            - HGNC df = HGNC dataset (saved on csv file)
            
            - Uni df = UniProt dataset (saved on excel file) filtered by reviewed and human organism.
        '''
        self.summary_df = pd.read_csv('downloads/C0007193_disease_gda_summary_CURATED.tsv', sep='\t')
        self.multi_entry_gene = 'TMPO'
        self.hgnc_df = pd.read_csv('downloads/hgnc-symbol-check.csv', header= 1)
        self.uni_df = pd.read_excel('downloads/uniprot-yourlist_M202001086746803381A1F0E0DB47453E0216320D568506G-filtered-org--.xlsx')
    
    
    
    def seed_genes_generator(self):
        '''
        Returns:
        
            - list of seed genes approved by HGNC database.
        '''
    
        # save approved genes
        seed_genes = []
        for index,row in self.hgnc_df.iterrows():
            if row['Match type'] == 'Approved symbol':
                seed_genes.append(row['Input'])

        return seed_genes
    
    def check_seeds_approvation(self):
        '''
        Returns:
        
            - print genes approvation (or not approvation).
        '''
        # seeds
        approved_genes = self.seed_genes_generator()
        #check if all my input genes are approved
        if len(set(approved_genes)) == len(set(self.summary_df.Gene)):
            print('All genes are approved! They are', len(set(approved_genes)))
        else:
            print('Not all genes are approved. Check!')
            
    def uniprot_filter(self):
        '''
        Returns:
        
            - final dataframe of seed genes, with informations extracted from UniProt dataframe
        '''
        
        seeds = self.seed_genes_generator()
        # considering seeds in uniprot
        final_df = self.uni_df[self.uni_df['Gene names  (primary )'].isin(seeds)]
        #renaming cols
        original_cols = list(final_df.columns)
        new_cols = ['Uniprot AC','Gene Symbol', 'Protein Name', 'Organism', 'Entrez ID', 'Function']
        final_df = final_df.rename(columns=dict(zip(original_cols, new_cols)))
        # resetting indeces
        final_df.reset_index(drop=True, inplace= True)
        final_df.head()
        final_df = final_df[['Gene Symbol', 'Uniprot AC', 'Protein Name', 'Entrez ID', 'Function', 'Organism']]
        return final_df
    
    def uniprot_duplicated_genes(self):
        '''
        Returns:
        
            - set of seed genes having more uniprot entries in the final df.
        '''
        uniprot_df = self.uniprot_filter()
        dupl = uniprot_df['Gene Symbol'].duplicated()
        idx = list(dupl[dupl==True].index)
        duplicated_genes = list(uniprot_df.loc[idx]['Gene Symbol'])
        return duplicated_genes
    
    def save_results(self):
        '''
        Returns:
        
            - file .csv with the result dataframe
        '''
        final_df = self.uniprot_filter()
        # saving to csv
        try:
            os.remove('results/seed_gene_table.csv')
        except:
            pass

        final_df.to_csv('results/seed_gene_table.csv')
        print('Results saved!')

In [3]:
### task 1.1
task_1_1 = task_1_1()
# check if all disgenet seeds are approved by hgnc
task_1_1.check_seeds_approvation()
# approved genes
approved_genes = task_1_1.seed_genes_generator()
# set of seed genes having more uniprot entries
duplicated_genes = task_1_1.uniprot_duplicated_genes()
# save task 1.1 results
task_1_1.save_results()

All genes are approved! They are 101
Results saved!


In [4]:
class task_1_2_IID(object):
    
    def __init__(self):
        '''
        Inputs:
        
            - iid df = dataframe with iid interactions with at least one seed gene (filtered by heart tissue, experimental data and human organism);
            
            - iid no seeds df = dataframe with interactions among only no seeds which are involved also in interactions with seeds in the other df
                                        (df filtered by heart tissue, experimental data and human organism);
            
            - approved genes = list of seed genes approved by hgnc;
            
            - n interactions seeds = number of interactions with at least one seed;
            
            - n interactions no seeds = number of interactions among no seeds.
        '''
        # iid df of interactions with at least one seed
        self.iid_df = pd.read_csv('downloads/PPIs.txt', sep='\t')
        # df with interactions among the no seeds which have also interactions with seeds in the file above
        self.iid_no_seeds_df = pd.read_csv('downloads/PPI_no_seed.txt', sep='\t')
        # list of approved seed genes
        self.approved_genes = approved_genes
        # number of interactions with at least one seed gene
        self.n_interactions_seeds = len(self.iid_df)
        self.n_interactions_no_seeds = len(self.iid_no_seeds_df)
        
    def no_seeds_generator(self):
        '''
        Returns:
        
            - no seeds found = list of no seeds which interact with our seed genes;
            
            - n seeds = number of seed genes found on iid;
            
            - n no seeds = number of no seeds genes (which interact with our seeds) found on iid;
            
            - n tot proteins = total number of proteins found on iid (seeds + no seeds).
        '''
        # list of genes in the first symbols col
        symbols_1 = list(self.iid_df.symbol1)
        # list of genes in the second symbols col
        symbols_2 = list(self.iid_df.symbol2)

        # set of tot proteins (seed and no-seed genes)
        tot_proteins = symbols_1 + symbols_2 
        tot_proteins = set(tot_proteins)
        no_seeds_found = []
        # seed genes found in the dataset
        seeds_iid_found = []
        for gene in tot_proteins:
            if gene in self.approved_genes:
                seeds_iid_found.append(gene)
            else:
                no_seeds_found.append(gene)
        
        # sets of seeds and no-seeds
        seeds_iid_found = set(seeds_iid_found)
        no_seeds_found = set(no_seeds_found)
        # number of seeds and no-seeds
        n_seeds = len(seeds_iid_found)
        n_no_seeds= len(no_seeds_found)
        # number of total proteins
        n_tot_proteins = len(tot_proteins)
        return no_seeds_found, n_seeds, n_no_seeds, n_tot_proteins

    def save_no_seeds(self):
        '''
        Returns:
        
            - file .txt with all no seeds genes in order to copy and paste them on iid (on which we search for their interactions).
        '''
        no_seeds_found,_,_,_ = self.no_seeds_generator()
        try:
            os.remove('data/iid_no_seeds.txt')
        except:
            pass

        f = open('data/iid_no_seeds.txt', 'w')
        for gene in no_seeds_found:
              f.write(gene+'\n')

        f.close()
        
    def build_final_df(self):
        '''
        Returns:
        
            - iid interactions = dataframe with all the iid interactions (among only seeds, with at least one seed and among only no seeds);
            
            - n interactions = total number of iid interactions.
            
        '''
        # final df
        iid_interactions = pd.concat([self.iid_df, self.iid_no_seeds_df], axis = 0)[['symbol1', 'symbol2','UniProt1', 'UniProt2']]

        #saving dataset
        try:
            os.remove('results/iid_interactions.csv')
        except:
            pass

        iid_interactions.to_csv('results/iid_interactions.csv', sep=';')
        # number of total interactions
        n_interactions = len(iid_interactions)
        
        return iid_interactions,n_interactions
    
    def build_table_results(self):
        '''
        Returns:
        
            - .csv file with the iid interactions df;
            
            - table results = dataframe with iid results (total interactions, total proteins, etc.).
            
        '''
        _, tot_interactions = self.build_final_df()
        _, n_seeds, n_no_seeds, tot_proteins = self.no_seeds_generator()
        
        table_results = pd.DataFrame([tot_interactions, self.n_interactions_seeds, self.n_interactions_no_seeds, tot_proteins, n_seeds, n_no_seeds])
        
        rename_dict = dict(zip([x for x in range(6)], ['Total interactions', 'Interactions with at least one seed', 'Interactions among no-seeds', 
                                                      'Total proteins', 'Seed genes', 'No-seed genes']))
        table_results.rename(rename_dict, inplace = True)
        
        # save on csv
        try:
            os.remove('results/iid_results.csv')
        except:
            pass
        
        table_results.to_csv('results/iid_results.csv')
        return table_results
        
        

In [5]:
IID = task_1_2_IID()
# save no seeds on txt file in order to copy and paste them on iid (to find interactions among no seeds)
IID.save_no_seeds()
# iid df (saved on csv file)
iid_interactions,_ = IID.build_final_df()
# iid table results (saved on csv file)
iid_results = IID.build_table_results()


In [6]:
iid_interactions.head()

Unnamed: 0,symbol1,symbol2,UniProt1,UniProt2
0,TCAP,TTN,O15273,Q8WZ42
1,TCAP,CSRP3,O15273,P50461
2,TCAP,ENO3,O15273,P13929
3,TCAP,ENO1,O15273,P06733
4,TCAP,MYOZ1,O15273,Q9NP98


In [7]:
iid_results.T

Unnamed: 0,Total interactions,Interactions with at least one seed,Interactions among no-seeds,Total proteins,Seed genes,No-seed genes
0,100232,6952,93280,3945,93,3852


## BIOGRID

Here we download the file of human organism from biogrid, and consider just the rows where appear at least one of our seed-genes. So on this way we should consider interactions among only seed-genes and among seed and no-seed genes.

Then we will check if there also interactions among only these no-seed genes.

In [26]:
#load biogrid dataset
biogrid_df = pd.read_table('BIOGRID-ORGANISM-Homo_sapiens-3.5.179.tab2.txt', low_memory = False)
biogrid_df.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


### Results

In [27]:
# consider just interactions in which appear at least one seed-gene
symbols_A = list(biogrid_df['Official Symbol Interactor A'])
symbols_B = list(biogrid_df['Official Symbol Interactor B'])

kept_indeces = []
for i in range(len(biogrid_df)):
    if symbols_A[i] in approved_genes or symbols_B[i] in approved_genes: 
        kept_indeces.append(i)

seeds_interactions = biogrid_df.loc[kept_indeces]
#seeds_interactions_A = biogrid_df[biogrid_df['Official Symbol Interactor A'].isin(approved_genes)]
#seeds_interactions_B= biogrid_df[biogrid_df['Official Symbol Interactor B'].isin(approved_genes)]
#seeds_interactions = pd.concat([seeds_interactions_A, seeds_interactions_B], axis = 0)

In [28]:
# tot number of proteins
A = list(seeds_interactions['Official Symbol Interactor A'])
B = list(seeds_interactions['Official Symbol Interactor B'])
tot_A_B = set(A + B)
tot_len = len(tot_A_B)

print('Total number of proteins (including seed genes): ', tot_len)

Total number of proteins (including seed genes):  5034


In [29]:
# compute number of seeds and no-seeds
no_seed_genes = []
for i, row in seeds_interactions.iterrows():
    if row['Official Symbol Interactor A'] not in approved_genes:
        no_seed_genes.append(row['Official Symbol Interactor A'])
    elif row['Official Symbol Interactor B'] not in approved_genes:
        no_seed_genes.append(row['Official Symbol Interactor B'])
no_seed_genes = list(set(no_seed_genes))
print('Number of No-Seed Genes: ', len(no_seed_genes)) 
n_seeds = tot_len - len(no_seed_genes)
print('Number of Seed Genes: ', n_seeds)

Number of No-Seed Genes:  4936
Number of Seed Genes:  98


In [30]:
# file txt biogrid genes
try:
    os.remove('data/biogrid_genes.txt')
except:
    pass

ff = open('data/biogrid_genes.txt', 'w')

for gene in tot_A_B:
    ff.write(gene+'\n')
ff.close()

In [31]:
# compute interactions among no-seeds
idx = []
for i in range(len(biogrid_df)):
    if symbols_A[i] in no_seed_genes and symbols_B[i] in no_seed_genes:
        
        idx.append(i)
        
# df with just interactions among no-seed-gene
no_seeds_interactions = biogrid_df.loc[idx]       
        
# total interactions df
biogrid_interactions = pd.concat([seeds_interactions,no_seeds_interactions], axis = 0)

# compute number of inetractions among just seed genes
ccc = 0
symbols_a = list(seeds_interactions['Official Symbol Interactor A'])
symbols_b = list(seeds_interactions['Official Symbol Interactor B'])
for i in range(len(seeds_interactions)):
    if symbols_a[i] in approved_genes and symbols_b[i] in approved_genes:
        ccc +=1
        
# print results
print('We have in total ',len(biogrid_interactions), ' interactions:')
print(len(seeds_interactions), 'interactions with at least one seed-gene involved.')
print(ccc, 'interactions among only seed genes')
print(len(no_seeds_interactions),' interactions among no-seed genes')


We have in total  218358  interactions:
12657 interactions with at least one seed-gene involved.
322 interactions among only seed genes
205701  interactions among no-seed genes


### UniProt AC for Biogrid proteins

In [32]:
# read filtered uniprot dataset
uniprot_for_biogrid = pd.read_excel('uniprot_for_biogrid.xlsx')[['Entry', 'Gene names  (primary )']]

In [33]:
# boolean list: True if i-gene is duplicated
dupl = uniprot_for_biogrid['Gene names  (primary )'].duplicated()
# positional indeces of duplicates
dupl_indeces = list(dupl[dupl == True].index)
# extracting names of genes with more than one uniprot ac
genes_with_more_entries = set(uniprot_for_biogrid.loc[dupl_indeces]['Gene names  (primary )'])

In [34]:
# add uniprot ac for interactors A
biogrid_interactions = biogrid_interactions.merge(uniprot_for_biogrid, left_on='Official Symbol Interactor A', right_on = 'Gene names  (primary )', how= 'left').drop(['Gene names  (primary )'], axis=1)
biogrid_interactions.rename(columns = {'Entry': 'Interactor A Uniprot AC'}, inplace= True)
# add uniprot ac for interactors B
biogrid_interactions = biogrid_interactions.merge(uniprot_for_biogrid, left_on='Official Symbol Interactor B', right_on = 'Gene names  (primary )', how= 'left').drop(['Gene names  (primary )'], axis=1)
biogrid_interactions.rename(columns = {'Entry': 'Interactor B Uniprot AC'}, inplace= True)


In [39]:
null_A = biogrid_interactions['Interactor A Uniprot AC'].isnull()
null_idx_B = list(null_A[null_A==True].index)
null_A_list=list(biogrid_interactions.loc[null_idx_B]['Official Symbol Interactor A'])

null_B = biogrid_interactions['Interactor B Uniprot AC'].isnull()
null_idx_B = list(null_B[null==True].index)
null_B_list = list(biogrid_interactions.loc[null_idx_B]['Official Symbol Interactor B'])


not_mapped_genes = set(null_A_list+null_B_list)
#biogrid_interactions[biogrid_interactions['Official Symbol Interactor A']=='SDHAF3']

In [85]:
# save to csv
try:
    os.remove('results/biogrid_interactions.csv')
except:
    pass

biogrid_interactions.to_csv('results/biogrid_interactions.csv')

# Task 1.3


# A 

Seed genes interactome: interactions that involve seed genes only, from all DBs, in the format:

* interactor A gene symbol, 

* interactor B gene symbol, 

* interactor A Uniprot AC, 

* interactor B Uniprot AC, 

* database source

# B

Union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.

In [88]:
# it renames the cols on the requested way
def rename_cols(df):
    original_cols = list(df.columns)
    new_cols = ['interactor A gene symbol', 'interactor B gene symbol', 'interactor A Uniprot AC', 'interactor B Uniprot AC', 'database source']
    df = df.rename(columns=dict(zip(original_cols, new_cols)))
    return df



In [89]:
# turning off warning
pd.set_option('mode.chained_assignment', None)

# renaming cols for iid_df
iid_data = iid_interactions[['symbol1','symbol2', 'UniProt1', 'UniProt2' ]]
source_iid = ['IID' for x in range(len(iid_data))]
iid_data['Source'] = source_iid

iid_data = rename_cols(iid_data)

In [90]:
biogrid_interactions.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database,Interactor A Uniprot AC,Interactor B Uniprot AC
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P45985,Q14315
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86TC9,P35609
2,2006,153,10755,106662,115978,-,-,ADRB1,GIPC1,ADRB1R|B1AR|BETA1AR|RHR,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P08588,O14908
3,2765,5664,823,111643,107273,-,PIG30,PSEN2,CAPN1,AD3L|AD4|CMD1V|PS2|STM2,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P49810,P07384
4,2785,825,7273,107275,113124,-,-,CAPN3,TTN,CANP3|CANPL3|LGMD2|LGMD2A|nCL-1|p94,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P20807,Q8WZ42


In [91]:
# turning off warning
pd.set_option('mode.chained_assignment', None)

# renaming cols for iid_df
bio_data = biogrid_interactions[['Official Symbol Interactor A','Official Symbol Interactor B', 'Interactor A Uniprot AC', 'Interactor B Uniprot AC' ]]
source_bio = ['Biogrid' for x in range(len(bio_data))]
bio_data['Source'] = source_bio
bio_data = rename_cols(bio_data)

In [35]:

'''
# adding to biogrid df, uniprot for the genes A,B
bio_data = biogrid_interactions[['Official Symbol Interactor A', 'Official Symbol Interactor B']]

df1= final_df[final_df['Gene_Symbol'].isin(list(bio_data['Official Symbol Interactor A']))]
df2 = final_df[final_df['Gene_Symbol'].isin(list(bio_data['Official Symbol Interactor B']))]
df = pd.concat([df1,df2], axis= 0)

df = df[['Gene_Symbol', 'Uniprot AC']]

bio_data = pd.merge(bio_data, df, left_on='Official Symbol Interactor A', right_on='Gene_Symbol', how='outer')
bio_data = bio_data.drop(['Gene_Symbol'], axis=1)
old_cols = list(bio_data.columns)
new_cols = ['Official Symbol Interactor A', 'Official Symbol Interactor B', 'UniA']
bio_data = bio_data.rename(columns= dict(zip(old_cols, new_cols)))
bio_data = pd.merge(bio_data, df, left_on='Official Symbol Interactor B', right_on='Gene_Symbol', how='outer')
bio_data = bio_data.drop(['Gene_Symbol'], axis=1)
'''

In [98]:
# concatenate df of the 2 databases
total_df = pd.concat([iid_data, bio_data], axis = 0)
# resetting indeces
total_df.reset_index(drop=True, inplace= True)

In [99]:
# task A: seed genes interactome SGI
genes_A = list(total_df['interactor A gene symbol'])
genes_B = list(total_df['interactor B gene symbol'])

sgi_idx = []
for i in range(len(total_df)):
    if genes_A[i] in approved_genes and genes_B[i] in approved_genes:
        sgi_idx.append(i)
        
sgi_df = total_df.loc[sgi_idx]
# resetting indeces
sgi_df.reset_index(drop=True, inplace= True)


In [100]:
# task B : union interactome

# interactions in which there is at least one seed-gene
union_idx = []
for i in range(len(total_df)):
    if genes_A[i] in approved_genes or genes_B[i] in approved_genes:
        union_idx.append(i)

union_df = total_df.loc[union_idx]       
        
# resetting indeces
union_df.reset_index(drop=True, inplace= True)



# C

Intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the format:

* interactor A gene symbol, 

* interactor B gene symbol, 

* interactor A Uniprot AC, 

* interactor B Uniprot AC


In [101]:
intersection_df = pd.merge(iid_data.drop(['database source'], axis = 1), bio_data.drop(['database source'], axis = 1), how='inner')
intersection_df.head()

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC
0,TCAP,TTN,O15273,Q8WZ42
1,TCAP,TTN,O15273,Q8WZ42
2,TCAP,MYOZ1,O15273,Q9NP98
3,TCAP,MDM2,O15273,Q00987
4,TCAP,MDM2,O15273,Q00987


In [102]:
#saving files

try:
    os.remove('results/interactome.csv')
    os.remove('results/union.csv')
    os.remove('results/intersection.csv')
    
except:
    pass

sgi_df.to_csv('results/sgi.csv')
union_df.to_csv('results/union.csv')
intersection_df.to_csv('results/intersection.csv')

# TASK 1.4

In [103]:
# saving on txt file seed genes and union interactome genes
# in order to copy and paste genes from txt file to EnrichR 
try:
    os.remove('data/seed_genes.txt')
    os.remove('data/union_genes.txt')
except:
    pass

file_seed = open('data/seed_genes.txt', 'w')


for gene in approved_genes:
    file_seed.write(gene+'\n')
 
file_seed.close()


file_union = open('data/union_genes.txt', 'w')
union_genes_list = list(set(list(union_df['interactor A gene symbol']) + list(union_df['interactor B gene symbol'])))                           
for gene in union_genes_list:
    file_union.write(gene+'\n')
                            
file_union.close()                        