In [1]:
import pandas as pd
import os

# TASK 1.1


## DisGeNET

In [72]:
class task_1_1(object):
    def __init__(self):
        '''
        Inputs:
        
            - Summary file = DisGeNET dataset (saved on tsv file) filtered by disease (the ours is C0007193).
            
            - Multi-entry gene = gene with 2 comma-separated uniprot entries on the same row.
            
            - HGNC file = HGNC dataset (saved on csv file)
            
            - UniProt file = UniProt dataset (saved on excel file) 
        '''
        self.summary_file = 'downloads/C0007193_disease_gda_summary_CURATED.tsv'
        self.multi_entry_gene = 'TMPO'
        self.hgnc_file = 'downloads/hgnc-symbol-check.csv'
        self.uniprot_file = 'downloads/uniprot-yourlist_M202001086746803381A1F0E0DB47453E0216320D568506G-filtered-org--.xlsx'
    def gda_dataframe(self):
        '''
        Returns:
        
            - gene-disease association dataframe.
        '''
        summary_df = pd.read_csv(self.summary_file, sep='\t')
        return summary_df
    
    def multy_entry_rows(self):
        '''
        Returns:
        
            - gene-disease association dataframe in which genes with 2 uniprot entries on the same row
                are distributed on as many rows as the number of their entries (in our case, the 
                gene is TMPO and it has 2 entries: we build 2 rows for the same gene, each one with one of
                its entry).
        '''
        summary_df = self.gda_dataframe()
        # list of UniProt col values
        col_uniprot = list(summary_df.UniProt)
        # index of the itnerested gene
        idx = summary_df.index[summary_df['Gene'] == self.multi_entry_gene][0]
        # value corresponfing to TMPO gene
        double_entries_list = col_uniprot[idx].split(';')
        # first UniProt entry
        first_entry = double_entries_list[0]
        # Second UniProt entry
        second_entry = double_entries_list[1]
        # updating UniProt column with just the first entry for TMPO gene 
        col_uniprot[idx] = first_entry
        summary_df.UniProt = col_uniprot
        # listing the TMPO row
        row_to_update = list((summary_df.loc[idx]))
        # updating the UniProt value with second entry
        row_to_update[4] = second_entry
        # adding the new row to df
        summary_df.loc[len(summary_df)] = row_to_update
        return summary_df
    
    def hgnc_dataframe(self):
        '''
        Returns:
        
            - HGNC dataframe
        '''
        hgnc_df = pd.read_csv(self.hgnc_file, header= 1)
        return hgnc_df
    
    def check_genes_approvation(self):
        '''
        Returns:
        
            - list of seed genes approved by HGNC database.
        '''
        # hgnc df
        hgnc_df = self.hgnc_dataframe()
        # save approved genes
        approved_genes = []
        for index,row in hgnc_df.iterrows():
            if row['Match type'] == 'Approved symbol':
                approved_genes.append(row['Input'])

        #check if all my input genes are approved
        if len(set(approved_genes)) == len(set(summary_df.Gene)):
            print('All genes are approved! They are', len(set(approved_genes)))
        else:
            print('Not all genes are approved. Check!')
        return approved_genes
    
    def uniprot_dataframe(self):
        '''
        Returns:
        
            - Uniprot dataframe.
        '''
        uni_df = pd.read_excel(self.uniprot_file)
        return uni_df
    
    def uniprot_filter(self):
        '''
        Returns:
        
            - final dataframe of seed genes, with informations extracted from UniProt dataframe
        '''
        final_df = self.uniprot_dataframe()
        #renaming cols
        original_cols = list(final_df.columns)
        new_cols = ['Uniprot AC','Gene Symbol', 'Protein Name', 'Organism', 'Entrez ID', 'Function']
        final_df = final_df.rename(columns=dict(zip(original_cols, new_cols)))
        # resetting indeces
        final_df.reset_index(drop=True, inplace= True)
        final_df.head()
        final_df = final_df[['Gene Symbol', 'Uniprot AC', 'Protein Name', 'Entrez ID', 'Function', 'Organism']]
        return final_df
    
    def save_results(self):
        '''
        Returns:
        
            - file .csv with the result dataframe
        '''
        final_df = self.uniprot_filter()
        # saving to csv
        try:
            os.remove('results/seed_gene_table.csv')
        except:
            pass

        final_df.to_csv('results/seed_gene_table.csv')

In [65]:
### task 1.1
task_1_1 = task_1_1()
# list of seed genes and check hgnc seeds approvation
approved_genes = task_1_1.check_genes_approvation()
# save task 1.1 results
task_1_1.save_results()

All genes are approved! They are 101


# TASK 1.2

## IID

In [21]:
# laod iid dataset, where we have interactions with at least one seed
iid_df = pd.read_csv('PPIs.txt', sep='\t')
iid_df.head()

Unnamed: 0,UniProt1,UniProt2,symbol1,symbol2,evidence type,heart
0,O15273,Q8WZ42,TCAP,TTN,exp;pred,1
1,O15273,P50461,TCAP,CSRP3,exp;pred,1
2,O15273,P13929,TCAP,ENO3,exp;pred,1
3,O15273,P06733,TCAP,ENO1,exp,1
4,O15273,Q9NP98,TCAP,MYOZ1,exp;pred,1


In this case we must use experimental data: if we find the tags: e.g. "experimental;predicted;ortolog", they ARE considered as experimental.

### Results

We must consider also interactions among no-seed proteins.
Then I must consider the no-seed genes among the symbol 1 and symbol 2 cols and find their interactions.

In [22]:
print('We have in total ',len(iid_df), ' interactions.')

# list of genes in the first symbols col
symbols_1 = list(iid_df.symbol1)
# list of genes in the second symbols col
symbols_2 = list(iid_df.symbol2)

# list of tot proteins (seed and no-seed genes)
tot_proteins = symbols_1 + symbols_2 
tot_proteins = list(set(tot_proteins))
no_seeds_found = []
# seed genes found in the dataset
seeds_iid_found = []
for gene in tot_proteins:
    if gene in approved_genes:
        seeds_iid_found.append(gene)
    else:
        no_seeds_found.append(gene)
        
seeds_iid_found = list(set(seeds_iid_found))
no_seeds_found = list(set(no_seeds_found))

print('We found ', len(seeds_iid_found), ' seed genes and ', len(no_seeds_found), ' no seed genes')
print('We have a total of ', len(tot_proteins), ' proteins')

We have in total  6952  interactions.
We found  93  seed genes and  3852  no seed genes
We have a total of  3945  proteins


In [23]:
# saving no seeds to txt file in order to copy and paste on IID

try:
    os.remove('data/iid_no_seeds.txt')
except:
    pass

f = open('data/iid_no_seeds.txt', 'w')
for gene in no_seeds_found:
      f.write(gene+'\n')
        
f.close()

In [24]:
# load ppi among only no-seeds
iid_no_seeds = pd.read_csv('PPI_no_seed.txt', sep='\t')
print('We have ', len(iid_no_seeds), ' interactions among only no-seeds.')


We have  93280  interactions among only no-seeds.


In [25]:
iid_interactions = pd.concat([iid_df, iid_no_seeds], axis = 0)

#saving dataset
try:
    os.remove('results/iid_interactions.csv')
except:
    pass

iid_interactions.to_csv('results/iid_interactions.csv')

## BIOGRID

Here we download the file of human organism from biogrid, and consider just the rows where appear at least one of our seed-genes. So on this way we should consider interactions among only seed-genes and among seed and no-seed genes.

Then we will check if there also interactions among only these no-seed genes.

In [26]:
#load biogrid dataset
biogrid_df = pd.read_table('BIOGRID-ORGANISM-Homo_sapiens-3.5.179.tab2.txt', low_memory = False)
biogrid_df.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


### Results

In [27]:
# consider just interactions in which appear at least one seed-gene
symbols_A = list(biogrid_df['Official Symbol Interactor A'])
symbols_B = list(biogrid_df['Official Symbol Interactor B'])

kept_indeces = []
for i in range(len(biogrid_df)):
    if symbols_A[i] in approved_genes or symbols_B[i] in approved_genes: 
        kept_indeces.append(i)

seeds_interactions = biogrid_df.loc[kept_indeces]
#seeds_interactions_A = biogrid_df[biogrid_df['Official Symbol Interactor A'].isin(approved_genes)]
#seeds_interactions_B= biogrid_df[biogrid_df['Official Symbol Interactor B'].isin(approved_genes)]
#seeds_interactions = pd.concat([seeds_interactions_A, seeds_interactions_B], axis = 0)

In [28]:
# tot number of proteins
A = list(seeds_interactions['Official Symbol Interactor A'])
B = list(seeds_interactions['Official Symbol Interactor B'])
tot_A_B = set(A + B)
tot_len = len(tot_A_B)

print('Total number of proteins (including seed genes): ', tot_len)

Total number of proteins (including seed genes):  5034


In [29]:
# compute number of seeds and no-seeds
no_seed_genes = []
for i, row in seeds_interactions.iterrows():
    if row['Official Symbol Interactor A'] not in approved_genes:
        no_seed_genes.append(row['Official Symbol Interactor A'])
    elif row['Official Symbol Interactor B'] not in approved_genes:
        no_seed_genes.append(row['Official Symbol Interactor B'])
no_seed_genes = list(set(no_seed_genes))
print('Number of No-Seed Genes: ', len(no_seed_genes)) 
n_seeds = tot_len - len(no_seed_genes)
print('Number of Seed Genes: ', n_seeds)

Number of No-Seed Genes:  4936
Number of Seed Genes:  98


In [30]:
# file txt biogrid genes
try:
    os.remove('data/biogrid_genes.txt')
except:
    pass

ff = open('data/biogrid_genes.txt', 'w')

for gene in tot_A_B:
    ff.write(gene+'\n')
ff.close()

In [31]:
# compute interactions among no-seeds
idx = []
for i in range(len(biogrid_df)):
    if symbols_A[i] in no_seed_genes and symbols_B[i] in no_seed_genes:
        
        idx.append(i)
        
# df with just interactions among no-seed-gene
no_seeds_interactions = biogrid_df.loc[idx]       
        
# total interactions df
biogrid_interactions = pd.concat([seeds_interactions,no_seeds_interactions], axis = 0)

# compute number of inetractions among just seed genes
ccc = 0
symbols_a = list(seeds_interactions['Official Symbol Interactor A'])
symbols_b = list(seeds_interactions['Official Symbol Interactor B'])
for i in range(len(seeds_interactions)):
    if symbols_a[i] in approved_genes and symbols_b[i] in approved_genes:
        ccc +=1
        
# print results
print('We have in total ',len(biogrid_interactions), ' interactions:')
print(len(seeds_interactions), 'interactions with at least one seed-gene involved.')
print(ccc, 'interactions among only seed genes')
print(len(no_seeds_interactions),' interactions among no-seed genes')


We have in total  218358  interactions:
12657 interactions with at least one seed-gene involved.
322 interactions among only seed genes
205701  interactions among no-seed genes


### UniProt AC for Biogrid proteins

In [32]:
# read filtered uniprot dataset
uniprot_for_biogrid = pd.read_excel('uniprot_for_biogrid.xlsx')[['Entry', 'Gene names  (primary )']]

In [33]:
# boolean list: True if i-gene is duplicated
dupl = uniprot_for_biogrid['Gene names  (primary )'].duplicated()
# positional indeces of duplicates
dupl_indeces = list(dupl[dupl == True].index)
# extracting names of genes with more than one uniprot ac
genes_with_more_entries = set(uniprot_for_biogrid.loc[dupl_indeces]['Gene names  (primary )'])

In [34]:
# add uniprot ac for interactors A
biogrid_interactions = biogrid_interactions.merge(uniprot_for_biogrid, left_on='Official Symbol Interactor A', right_on = 'Gene names  (primary )', how= 'left').drop(['Gene names  (primary )'], axis=1)
biogrid_interactions.rename(columns = {'Entry': 'Interactor A Uniprot AC'}, inplace= True)
# add uniprot ac for interactors B
biogrid_interactions = biogrid_interactions.merge(uniprot_for_biogrid, left_on='Official Symbol Interactor B', right_on = 'Gene names  (primary )', how= 'left').drop(['Gene names  (primary )'], axis=1)
biogrid_interactions.rename(columns = {'Entry': 'Interactor B Uniprot AC'}, inplace= True)


In [39]:
null_A = biogrid_interactions['Interactor A Uniprot AC'].isnull()
null_idx_B = list(null_A[null_A==True].index)
null_A_list=list(biogrid_interactions.loc[null_idx_B]['Official Symbol Interactor A'])

null_B = biogrid_interactions['Interactor B Uniprot AC'].isnull()
null_idx_B = list(null_B[null==True].index)
null_B_list = list(biogrid_interactions.loc[null_idx_B]['Official Symbol Interactor B'])


not_mapped_genes = set(null_A_list+null_B_list)
#biogrid_interactions[biogrid_interactions['Official Symbol Interactor A']=='SDHAF3']

In [85]:
# save to csv
try:
    os.remove('results/biogrid_interactions.csv')
except:
    pass

biogrid_interactions.to_csv('results/biogrid_interactions.csv')

# Task 1.3


# A 

Seed genes interactome: interactions that involve seed genes only, from all DBs, in the format:

* interactor A gene symbol, 

* interactor B gene symbol, 

* interactor A Uniprot AC, 

* interactor B Uniprot AC, 

* database source

# B

Union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.

In [88]:
# it renames the cols on the requested way
def rename_cols(df):
    original_cols = list(df.columns)
    new_cols = ['interactor A gene symbol', 'interactor B gene symbol', 'interactor A Uniprot AC', 'interactor B Uniprot AC', 'database source']
    df = df.rename(columns=dict(zip(original_cols, new_cols)))
    return df



In [89]:
# turning off warning
pd.set_option('mode.chained_assignment', None)

# renaming cols for iid_df
iid_data = iid_interactions[['symbol1','symbol2', 'UniProt1', 'UniProt2' ]]
source_iid = ['IID' for x in range(len(iid_data))]
iid_data['Source'] = source_iid

iid_data = rename_cols(iid_data)

In [90]:
biogrid_interactions.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database,Interactor A Uniprot AC,Interactor B Uniprot AC
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P45985,Q14315
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86TC9,P35609
2,2006,153,10755,106662,115978,-,-,ADRB1,GIPC1,ADRB1R|B1AR|BETA1AR|RHR,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P08588,O14908
3,2765,5664,823,111643,107273,-,PIG30,PSEN2,CAPN1,AD3L|AD4|CMD1V|PS2|STM2,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P49810,P07384
4,2785,825,7273,107275,113124,-,-,CAPN3,TTN,CANP3|CANPL3|LGMD2|LGMD2A|nCL-1|p94,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P20807,Q8WZ42


In [91]:
# turning off warning
pd.set_option('mode.chained_assignment', None)

# renaming cols for iid_df
bio_data = biogrid_interactions[['Official Symbol Interactor A','Official Symbol Interactor B', 'Interactor A Uniprot AC', 'Interactor B Uniprot AC' ]]
source_bio = ['Biogrid' for x in range(len(bio_data))]
bio_data['Source'] = source_bio
bio_data = rename_cols(bio_data)

In [35]:

'''
# adding to biogrid df, uniprot for the genes A,B
bio_data = biogrid_interactions[['Official Symbol Interactor A', 'Official Symbol Interactor B']]

df1= final_df[final_df['Gene_Symbol'].isin(list(bio_data['Official Symbol Interactor A']))]
df2 = final_df[final_df['Gene_Symbol'].isin(list(bio_data['Official Symbol Interactor B']))]
df = pd.concat([df1,df2], axis= 0)

df = df[['Gene_Symbol', 'Uniprot AC']]

bio_data = pd.merge(bio_data, df, left_on='Official Symbol Interactor A', right_on='Gene_Symbol', how='outer')
bio_data = bio_data.drop(['Gene_Symbol'], axis=1)
old_cols = list(bio_data.columns)
new_cols = ['Official Symbol Interactor A', 'Official Symbol Interactor B', 'UniA']
bio_data = bio_data.rename(columns= dict(zip(old_cols, new_cols)))
bio_data = pd.merge(bio_data, df, left_on='Official Symbol Interactor B', right_on='Gene_Symbol', how='outer')
bio_data = bio_data.drop(['Gene_Symbol'], axis=1)
'''

In [98]:
# concatenate df of the 2 databases
total_df = pd.concat([iid_data, bio_data], axis = 0)
# resetting indeces
total_df.reset_index(drop=True, inplace= True)

In [99]:
# task A: seed genes interactome SGI
genes_A = list(total_df['interactor A gene symbol'])
genes_B = list(total_df['interactor B gene symbol'])

sgi_idx = []
for i in range(len(total_df)):
    if genes_A[i] in approved_genes and genes_B[i] in approved_genes:
        sgi_idx.append(i)
        
sgi_df = total_df.loc[sgi_idx]
# resetting indeces
sgi_df.reset_index(drop=True, inplace= True)


In [100]:
# task B : union interactome

# interactions in which there is at least one seed-gene
union_idx = []
for i in range(len(total_df)):
    if genes_A[i] in approved_genes or genes_B[i] in approved_genes:
        union_idx.append(i)

union_df = total_df.loc[union_idx]       
        
# resetting indeces
union_df.reset_index(drop=True, inplace= True)



# C

Intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the format:

* interactor A gene symbol, 

* interactor B gene symbol, 

* interactor A Uniprot AC, 

* interactor B Uniprot AC


In [101]:
intersection_df = pd.merge(iid_data.drop(['database source'], axis = 1), bio_data.drop(['database source'], axis = 1), how='inner')
intersection_df.head()

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC
0,TCAP,TTN,O15273,Q8WZ42
1,TCAP,TTN,O15273,Q8WZ42
2,TCAP,MYOZ1,O15273,Q9NP98
3,TCAP,MDM2,O15273,Q00987
4,TCAP,MDM2,O15273,Q00987


In [102]:
#saving files

try:
    os.remove('results/interactome.csv')
    os.remove('results/union.csv')
    os.remove('results/intersection.csv')
    
except:
    pass

sgi_df.to_csv('results/sgi.csv')
union_df.to_csv('results/union.csv')
intersection_df.to_csv('results/intersection.csv')

# TASK 1.4

In [103]:
# saving on txt file seed genes and union interactome genes
# in order to copy and paste genes from txt file to EnrichR 
try:
    os.remove('data/seed_genes.txt')
    os.remove('data/union_genes.txt')
except:
    pass

file_seed = open('data/seed_genes.txt', 'w')


for gene in approved_genes:
    file_seed.write(gene+'\n')
 
file_seed.close()


file_union = open('data/union_genes.txt', 'w')
union_genes_list = list(set(list(union_df['interactor A gene symbol']) + list(union_df['interactor B gene symbol'])))                           
for gene in union_genes_list:
    file_union.write(gene+'\n')
                            
file_union.close()                        