In [1]:
import pandas as pd
import os

In [2]:
class task_1_1(object):
    '''
    Task 1.1: build the seed-gene table from the DisGeNET, HGNC and UniProt downloads.
    '''

    def __init__(self):
        '''
        Inputs:

            - Summary df = DisGeNET dataset (saved on tsv file) filtered by disease (ours is C0007193).

            - Multi-entry gene = gene with 2 comma-separated uniprot entries on the same row.

            - HGNC df = HGNC dataset (saved on csv file).

            - Uni df = UniProt dataset (saved on excel file) filtered by reviewed and human organism.
        '''
        self.summary_df = pd.read_csv('downloads/C0007193_disease_gda_summary_CURATED.tsv', sep='\t')
        self.multi_entry_gene = 'TMPO'
        self.hgnc_df = pd.read_csv('downloads/hgnc-symbol-check.csv', header= 1)
        self.uni_df = pd.read_excel('downloads/uniprot-yourlist_M202001086746803381A1F0E0DB47453E0216320D568506G-filtered-org--.xlsx')

    def seed_genes_generator(self):
        '''
        Returns:

            - list of seed genes approved by HGNC database, i.e. the 'Input' value of every
              HGNC row whose 'Match type' is 'Approved symbol' (row order is preserved).
        '''
        is_approved = self.hgnc_df['Match type'] == 'Approved symbol'
        return self.hgnc_df.loc[is_approved, 'Input'].tolist()

    def check_seeds_approvation(self):
        '''
        Returns:

            - nothing; prints whether all the DisGeNET genes are approved by HGNC.

        Note: only the NUMBER of distinct approved genes is compared with the number of
        distinct genes in the summary df, not the gene names themselves.
        '''
        n_approved = len(set(self.seed_genes_generator()))
        n_disgenet = len(set(self.summary_df.Gene))
        if n_approved == n_disgenet:
            print('All genes are approved! They are', n_approved)
        else:
            print('Not all genes are approved. Check!')

    def uniprot_filter(self):
        '''
        Returns:

            - final dataframe of seed genes, with information extracted from the UniProt dataframe.

        The UniProt columns are renamed BY POSITION (first column -> 'Uniprot AC', second ->
        'Gene Symbol', ...), so the column order of the downloaded file matters.
        '''
        seeds = self.seed_genes_generator()
        # keep only the UniProt rows whose primary gene name is a seed gene
        final_df = self.uni_df[self.uni_df['Gene names  (primary )'].isin(seeds)]
        # positional renaming of the columns
        new_cols = ['Uniprot AC', 'Gene Symbol', 'Protein Name', 'Organism', 'Entrez ID', 'Function']
        final_df = final_df.rename(columns=dict(zip(final_df.columns, new_cols)))
        # fresh 0..n-1 index; a new frame is built instead of mutating a slice in place
        final_df = final_df.reset_index(drop=True)
        return final_df[['Gene Symbol', 'Uniprot AC', 'Protein Name', 'Entrez ID', 'Function', 'Organism']]

    def uniprot_duplicated_genes(self):
        '''
        Returns:

            - list of seed genes having more than one uniprot entry in the final df
              (one item per repeated row, so a gene with 3 entries appears twice).
        '''
        uniprot_df = self.uniprot_filter()
        repeated = uniprot_df['Gene Symbol'].duplicated()
        return uniprot_df.loc[repeated, 'Gene Symbol'].tolist()

    def save_results(self):
        '''
        Returns:

            - nothing; writes the result dataframe to 'results/seed_gene_table.csv'.
        '''
        final_df = self.uniprot_filter()
        # to_csv overwrites an existing file, so no explicit removal is needed;
        # make sure the output folder exists instead.
        os.makedirs('results', exist_ok=True)
        final_df.to_csv('results/seed_gene_table.csv')
        print('Results saved!')

In [3]:
### task 1.1
# Build the task 1.1 object (its constructor loads the DisGeNET, HGNC and UniProt files).
# NOTE(review): this rebinds the name `task_1_1` from the class to an instance, so
# re-running this cell without re-running the class cell first will fail.
task_1_1 = task_1_1()
# check if all disgenet seeds are approved by hgnc
task_1_1.check_seeds_approvation()
# approved genes (module-level list that the later task classes read as a global)
approved_genes = task_1_1.seed_genes_generator()
# list of seed genes having more than one uniprot entry
duplicated_genes = task_1_1.uniprot_duplicated_genes()
# save task 1.1 results to results/seed_gene_table.csv
task_1_1.save_results()

All genes are approved! They are 101
Results saved!


In [4]:
class task_1_2_IID(object):
    '''
    Task 1.2 (IID): collect the IID interactions of the seed genes and summarise them.
    '''

    def __init__(self):
        '''
        Inputs:

            - iid df = dataframe with iid interactions with at least one seed gene (filtered by heart tissue, experimental data and human organism);

            - iid no seeds df = dataframe with interactions among only no seeds which are involved also in interactions with seeds in the other df
                                        (df filtered by heart tissue, experimental data and human organism);

            - approved genes = list of seed genes approved by hgnc (read from the module-level `approved_genes`);

            - n interactions seeds = number of interactions with at least one seed;

            - n interactions no seeds = number of interactions among no seeds.
        '''
        # iid df of interactions with at least one seed
        self.iid_df = pd.read_csv('downloads/PPIs.txt', sep='\t')
        # df with interactions among the no seeds which have also interactions with seeds in the file above
        self.iid_no_seeds_df = pd.read_csv('downloads/PPI_no_seed.txt', sep='\t')
        # list of approved seed genes
        self.approved_genes = approved_genes
        # number of interactions with at least one seed gene / among no seeds only
        self.n_interactions_seeds = len(self.iid_df)
        self.n_interactions_no_seeds = len(self.iid_no_seeds_df)

    def no_seeds_generator(self):
        '''
        Returns:

            - no seeds found = set of no seeds which interact with our seed genes;

            - n seeds = number of seed genes found on iid;

            - n no seeds = number of no seeds genes (which interact with our seeds) found on iid;

            - n tot proteins = total number of proteins found on iid (seeds + no seeds).
        '''
        # every distinct symbol appearing in either interactor column
        tot_proteins = set(self.iid_df.symbol1) | set(self.iid_df.symbol2)
        # set membership instead of scanning the approved list once per protein
        seed_set = set(self.approved_genes)
        seeds_iid_found = tot_proteins & seed_set
        no_seeds_found = tot_proteins - seed_set
        return no_seeds_found, len(seeds_iid_found), len(no_seeds_found), len(tot_proteins)

    def save_no_seeds(self):
        '''
        Returns:

            - nothing; writes 'data/iid_no_seeds.txt' with one no-seed gene per line, in order to
              copy and paste them on iid (on which we search for their interactions).
        '''
        no_seeds_found, _, _, _ = self.no_seeds_generator()
        os.makedirs('data', exist_ok=True)
        # mode 'w' truncates an existing file; sorting makes the output reproducible across runs
        with open('data/iid_no_seeds.txt', 'w') as out_file:
            for gene in sorted(no_seeds_found, key=str):
                out_file.write(gene + '\n')

    def build_final_df(self):
        '''
        Returns:

            - iid interactions = dataframe with all the iid interactions (among only seeds, with at least one seed and among only no seeds);

            - n interactions = total number of iid interactions.

        Side effect: the dataframe is also written to 'results/iid_interactions.csv' (';' separated).
        The original row labels of the two input frames are kept, so index labels may repeat.
        '''
        wanted_cols = ['symbol1', 'symbol2', 'UniProt1', 'UniProt2']
        iid_interactions = pd.concat([self.iid_df, self.iid_no_seeds_df], axis=0)[wanted_cols]

        # to_csv overwrites an existing file; only the folder has to exist
        os.makedirs('results', exist_ok=True)
        iid_interactions.to_csv('results/iid_interactions.csv', sep=';')

        return iid_interactions, len(iid_interactions)

    def build_table_results(self):
        '''
        Returns:

            - table results = one-column dataframe with iid results (total interactions, total proteins, etc.).

        Side effects: writes 'results/iid_results.csv' and, through build_final_df,
        rewrites 'results/iid_interactions.csv'.
        '''
        _, tot_interactions = self.build_final_df()
        _, n_seeds, n_no_seeds, tot_proteins = self.no_seeds_generator()

        labels = ['Total interactions', 'Interactions with at least one seed', 'Interactions among no-seeds',
                  'Total proteins', 'Seed genes', 'No-seed genes']
        values = [tot_interactions, self.n_interactions_seeds, self.n_interactions_no_seeds,
                  tot_proteins, n_seeds, n_no_seeds]
        # labelled rows are built directly instead of renaming a 0..5 index afterwards
        table_results = pd.DataFrame(values, index=labels)

        os.makedirs('results', exist_ok=True)
        table_results.to_csv('results/iid_results.csv')
        return table_results
        
        

In [5]:
### Task 1.2 IID
# The constructor loads the two IID exports and reads the module-level `approved_genes`
# defined by the task 1.1 cell, so that cell must have been run first.
IID = task_1_2_IID()
# save no seeds on txt file in order to copy and paste them on iid (to find interactions among no seeds)
IID.save_no_seeds()
# iid df (also saved to results/iid_interactions.csv by build_final_df)
iid_interactions,_ = IID.build_final_df()
# iid table results (saved on csv file); this calls build_final_df again internally,
# so results/iid_interactions.csv is written a second time
iid_results = IID.build_table_results()


In [6]:
# Preview the first rows of the combined IID interactions table.
iid_interactions.head()

Unnamed: 0,symbol1,symbol2,UniProt1,UniProt2
0,TCAP,TTN,O15273,Q8WZ42
1,TCAP,CSRP3,O15273,P50461
2,TCAP,ENO3,O15273,P13929
3,TCAP,ENO1,O15273,P06733
4,TCAP,MYOZ1,O15273,Q9NP98


In [7]:
# Show the IID summary transposed, so each metric becomes a column of a single row.
iid_results.T

Unnamed: 0,Total interactions,Interactions with at least one seed,Interactions among no-seeds,Total proteins,Seed genes,No-seed genes
0,100232,6952,93280,3945,93,3852


In [190]:
class task_1_2_Biogrid(object):
    '''
    Task 1.2 (BioGRID): extract the BioGRID interactions of the seed genes, attach the
    UniProt accessions of the interactors and summarise the result.
    '''

    # BioGRID column names used throughout the class
    _SYMBOL_A = 'Official Symbol Interactor A'
    _SYMBOL_B = 'Official Symbol Interactor B'
    _ID_A = 'BioGRID ID Interactor A'
    _ID_B = 'BioGRID ID Interactor B'

    def __init__(self):
        '''
        Inputs:

            - biogrid df = entire biogrid dataset for human organism (respect to iid, here we have to filter by seed genes);

            - approved genes = list of seed genes approved by hgnc (read from the module-level `approved_genes`);

            - uniprot for biogrid = uniprot dataset containing uniprot entries for each biogrid id gene in our biogrid dataset.
        '''
        self.biogrid_df = pd.read_table('downloads/BIOGRID-ORGANISM-Homo_sapiens-3.5.179.tab2.txt', low_memory = False)
        self.approved_genes = approved_genes
        self.uniprot_for_biogrid = pd.read_excel('downloads/uniprot_for_biogrid.xlsx')

    def build_ppi_one_two_seeds(self):
        '''
        Returns:

            - seeds interactions = dataframe of interactions with at least one seed gene (so are included also interactions among just seeds).

            - symbols A = list of genes of symbols interactors A column (from original biogrid df)

            - symbols B = list of genes of symbols interactors B column (from original biogrid df)
        '''
        col_a = self.biogrid_df[self._SYMBOL_A]
        col_b = self.biogrid_df[self._SYMBOL_B]
        # boolean mask instead of a per-row scan of the approved list; it also avoids
        # feeding positional indices to the label-based .loc
        touches_seed = col_a.isin(self.approved_genes) | col_b.isin(self.approved_genes)
        seeds_interactions = self.biogrid_df[touches_seed]
        return seeds_interactions, list(col_a), list(col_b)

    def _split_proteins(self, seeds_interactions):
        '''Partition the proteins of the seed-touching interactions into (all, seeds, no-seeds) sets.'''
        all_proteins = set(seeds_interactions[self._SYMBOL_A]) | set(seeds_interactions[self._SYMBOL_B])
        seed_set = set(self.approved_genes)
        return all_proteins, all_proteins & seed_set, all_proteins - seed_set

    def genes_data(self):
        '''
        Returns:

            - tot proteins = total number of biogrid proteins (seeds and no-seeds which interact with our seeds);

            - n seeds = number of seeds genes found on Biogrid;

            - n no seeds = number of no seed genes (which interact with our seeds) found on Biogrid.

            - tot A B = set of biogrid proteins (seeds+no seeds)
        '''
        seeds_interactions, _, _ = self.build_ppi_one_two_seeds()
        tot_A_B, seed_genes, no_seed_genes = self._split_proteins(seeds_interactions)
        return len(tot_A_B), len(seed_genes), len(no_seed_genes), tot_A_B

    def save_biogrid_proteins(self):
        '''
        Returns:

            - nothing; writes 'data/biogrid_genes.txt' with all genes (seeds and no seeds which interact with seeds)
              found on Biogrid, one per line. It is useful to copy and paste genes on Uniprot in order to find the entries.
        '''
        _, _, _, tot_A_B = self.genes_data()
        os.makedirs('data', exist_ok=True)
        # mode 'w' truncates an existing file; sorted for a reproducible file
        with open('data/biogrid_genes.txt', 'w') as out_file:
            for gene in sorted(tot_A_B, key=str):
                out_file.write(gene + '\n')

    def build_total_df(self):
        '''
        Returns:

           - no seeds interactions = df with interactions among no seeds only;

           - biogrid interaction = df containing all the interactions (among no seeds and with at least one seed),
             restricted to the symbol and BioGRID id columns of the two interactors.

        "No seeds" are the non-seed proteins that take part in at least one interaction with a
        seed gene; they are derived here from the seed interactions (the previous version read
        an undefined module-level `no_seed_genes`).
        '''
        seeds_interactions, _, _ = self.build_ppi_one_two_seeds()
        _, _, no_seed_genes = self._split_proteins(seeds_interactions)
        no_seed_list = list(no_seed_genes)

        # rows of the full biogrid df where BOTH interactors are no-seed genes
        among_no_seeds = (self.biogrid_df[self._SYMBOL_A].isin(no_seed_list)
                          & self.biogrid_df[self._SYMBOL_B].isin(no_seed_list))
        no_seeds_interactions = self.biogrid_df[among_no_seeds]

        # total interactions df
        biogrid_interactions = pd.concat([seeds_interactions, no_seeds_interactions], axis = 0)
        biogrid_interactions = biogrid_interactions[[self._SYMBOL_A, self._SYMBOL_B, self._ID_A, self._ID_B]]
        return no_seeds_interactions, biogrid_interactions

    def _attach_uniprot_ac(self, interactions, side):
        '''Left-join the UniProt accession of interactor `side` ('A' or 'B') on its BioGRID id.'''
        merged = interactions.merge(self.uniprot_for_biogrid,
                                    left_on='BioGRID ID Interactor ' + side,
                                    right_on='Cross-reference (BioGrid)',
                                    how='left')
        merged = merged.drop(['Cross-reference (BioGrid)', 'Gene names  (primary )'], axis=1)
        return merged.rename(columns={'Entry': 'Interactor ' + side + ' Uniprot AC'})

    def add_uniprot_entries(self):
        '''
        Returns:

            - biogrid interactions = dataframe of biogrid interactions with 2 more columns representing Uniprot entries for the interactors.
                Note that it's possible to have for same genes different uniprot entries (so distinct interactions will be considered).

        Side effects: strips the ';' from the 'Cross-reference (BioGrid)' column of
        self.uniprot_for_biogrid (harmless when repeated) and writes 'results/biogrid_interactions.csv'.
        '''
        _, biogrid_interactions = self.build_total_df()
        # cleaning df column from punctuation (literal replacement, no regex involved)
        self.uniprot_for_biogrid['Cross-reference (BioGrid)'] = (
            self.uniprot_for_biogrid['Cross-reference (BioGrid)'].str.replace(';', '', regex=False))
        # BioGRID ids become strings so they can be matched against the UniProt cross-references
        biogrid_interactions = biogrid_interactions.astype({self._ID_A: 'str', self._ID_B: 'str'})
        # add uniprot ac for interactors A, then B
        biogrid_interactions = self._attach_uniprot_ac(biogrid_interactions, 'A')
        biogrid_interactions = self._attach_uniprot_ac(biogrid_interactions, 'B')

        # to_csv overwrites an existing file; only the folder has to exist
        os.makedirs('results', exist_ok=True)
        biogrid_interactions.to_csv('results/biogrid_interactions.csv')

        return biogrid_interactions

    def compute_results(self):
        '''
        Returns:

            - table results: one-column dataframe with some important results about biogrid (number of interactions, number of proteins, etc)

        Side effects: writes 'results/biogrid_results.csv' and, through add_uniprot_entries,
        rewrites 'results/biogrid_interactions.csv'.

        Note: the row labelled 'Interactions with at least one seed' counts the interactions with
        EXACTLY one seed; seed-seed interactions are reported separately under
        'Interactions among only seeds'.
        '''
        tot_proteins, n_seeds, n_no_seeds, _ = self.genes_data()
        # final df of interactions, including eventual genes with more uniprot ac
        final_df = self.add_uniprot_entries()

        a_is_seed = final_df[self._SYMBOL_A].isin(self.approved_genes)
        b_is_seed = final_df[self._SYMBOL_B].isin(self.approved_genes)
        n_ppi_only_seeds = int((a_is_seed & b_is_seed).sum())
        n_ppi_seeds = int((a_is_seed ^ b_is_seed).sum())
        n_ppi_no_seeds = int((~a_is_seed & ~b_is_seed).sum())
        tot_ppi = len(final_df)

        labels = ['Total interactions', 'Interactions with at least one seed', 'Interactions among no-seeds',
                  'Interactions among only seeds', 'Total proteins', 'Seed genes', 'No seed genes']
        values = [tot_ppi, n_ppi_seeds, n_ppi_no_seeds, n_ppi_only_seeds, tot_proteins, n_seeds, n_no_seeds]
        table_results = pd.DataFrame(values, index=labels)

        os.makedirs('results', exist_ok=True)
        table_results.to_csv('results/biogrid_results.csv')

        return table_results

    def not_mapped_uniprot(self):
        '''
        Returns:

            - not mapped genes = set of genes not mapped by Uniprot. These genes don't have Uniprot AC in our final biogrid interactions df.

        Side effects: writes the set to 'data/biogrid_not_mapped_genes.txt' (one gene per line) and,
        through add_uniprot_entries, rewrites 'results/biogrid_interactions.csv'.
        '''
        biogrid_interactions = self.add_uniprot_entries()
        missing_a = biogrid_interactions.loc[biogrid_interactions['Interactor A Uniprot AC'].isnull(), self._SYMBOL_A]
        missing_b = biogrid_interactions.loc[biogrid_interactions['Interactor B Uniprot AC'].isnull(), self._SYMBOL_B]
        # set of not mapped genes
        not_mapped_genes = set(missing_a) | set(missing_b)

        os.makedirs('data', exist_ok=True)
        with open('data/biogrid_not_mapped_genes.txt', 'w') as out_file:
            for gene in sorted(not_mapped_genes, key=str):
                out_file.write(gene + '\n')

        return not_mapped_genes

In [191]:
# The constructor loads the BioGRID dump and the UniProt mapping, and reads the
# module-level `approved_genes` defined by the task 1.1 cell.
Biogrid = task_1_2_Biogrid()
# save biogrid proteins in order to find their uniprot ac
Biogrid.save_biogrid_proteins()
# biogrid interactions df (also written to results/biogrid_interactions.csv)
biogrid_interactions = Biogrid.add_uniprot_entries()
# table of biogrid results; calls add_uniprot_entries again internally
biogrid_results = Biogrid.compute_results()
# save not mapped biogrid genes by uniprot on txt file; calls add_uniprot_entries again internally
biogrid_not_mapped_genes = Biogrid.not_mapped_uniprot()

In [198]:
# Preview the first rows of the BioGRID interactions with their UniProt accessions.
biogrid_interactions.head()

Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Interactor A Uniprot AC,Interactor B Uniprot AC
0,MAP2K4,FLNC,112315,108607,P45985,Q14315
1,MYPN,ACTN2,124185,106603,Q86TC9,P35609
2,ADRB1,GIPC1,106662,115978,P08588,O14908
3,PSEN2,CAPN1,111643,107273,P49810,P07384
4,CAPN3,TTN,107275,113124,P20807,Q8WZ42


In [187]:
# Display the BioGRID summary table (one metric per row).
biogrid_results

Unnamed: 0,0
Total interactions,219620
Interactions with at least one seed,12781
Interactions among no-seeds,206499
Interactions among only seeds,340
Total proteins,5034
Seed genes,98
No seed genes,4936


In [67]:
# NOTE(review): the block below is a string literal, not executed code. It keeps a disabled
# exploratory snippet (genes with more than one UniProt entry) together with the result it
# once produced. It refers to a module-level `uniprot_for_biogrid`, which none of the visible
# cells define (the frame only exists as an attribute of the Biogrid object).
'''
#### FIND ON UNIPROT DF GENES WITH MORE ENTRIES
# boolean list: True if i-gene is duplicated
dupl = uniprot_for_biogrid['Gene names  (primary )'].duplicated()
# positional indeces of duplicates
dupl_indeces = list(dupl[dupl == True].index)
# extracting names of genes with more than one uniprot ac
genes_with_more_entries = set(uniprot_for_biogrid.loc[dupl_indeces]['Gene names  (primary )'])
genes_with_more_entries

{'BBC3', 'CDKN2A', 'DUSP13', 'GNAS', 'PRNP', 'SLC35A4', 'TMPO', 'TSPO'}
'''

{'BBC3', 'CDKN2A', 'DUSP13', 'GNAS', 'PRNP', 'SLC35A4', 'TMPO', 'TSPO'}

In [257]:
class interactome:
    '''
    Combine the BioGRID and IID interaction tables into union, intersection and
    seed-gene-only (sgi) interactomes.
    '''

    def __init__(self):
        '''
        Inputs:

            - bio data = biogrid interactions df with just column of symbols and uniprot entries interactors;

            - iid data = iid interactions df;

            - approved genes = list of seed genes approved by hgnc.

        All three are read from module-level variables (`biogrid_interactions`,
        `iid_interactions`, `approved_genes`) created by the previous cells.
        '''
        self.bio_data = biogrid_interactions[['Official Symbol Interactor A', 'Official Symbol Interactor B', 'Interactor A Uniprot AC','Interactor B Uniprot AC']]
        self.iid_data = iid_interactions
        self.approved_genes = approved_genes

    @staticmethod
    def rename_cols(df):
        '''
        Inputs:

            - df = dataframe of interactions (biogrid or iid one).


        Returns:

            - dataframe with renamed columns. Columns are renamed BY POSITION: the first five
              columns become gene symbol A/B, Uniprot AC A/B and database source.
        '''
        new_cols = ['interactor A gene symbol', 'interactor B gene symbol', 'interactor A Uniprot AC', 'interactor B Uniprot AC', 'database source']
        return df.rename(columns=dict(zip(df.columns, new_cols)))

    @staticmethod
    def df_to_defualt_format(df, source):
        '''
        Inputs:

            - df = dataframe of interactions (biogrid or iid one);

            - source = string representing the source db name (IID or Biogrid).


        Returns:

            - new dataframe with an added column about the source database (iid or biogrid)
              and the common column names. The input dataframe is left untouched.
        '''
        # assign() builds a new frame, so the caller's dataframe is not modified and there is
        # no need to silence pandas' chained-assignment warning globally
        tagged = df.assign(Source=source)
        return interactome.rename_cols(tagged)

    def union(self):
        '''
        Returns:

            - total df = union dataframe of all the interactions (biogrid df concatenated to iid df, in which are considered
                    interactions with at least one seed gene and interactions among no-seeds which interact with our seed genes).
                    IID rows come first; the index is reset to 0..n-1.
        '''
        bio_df = self.df_to_defualt_format(self.bio_data, 'Biogrid')
        iid_df = self.df_to_defualt_format(self.iid_data, 'IID')

        # concatenate df of the 2 databases and renumber the rows
        return pd.concat([iid_df, bio_df], axis = 0).reset_index(drop=True)

    def sgi(self):
        '''
        Returns:

            - sgi df = dataframe with interactions among only seed genes from both the sources (biogrid and iid).
        '''
        total_df = self.union()
        both_seeds = (total_df['interactor A gene symbol'].isin(self.approved_genes)
                      & total_df['interactor B gene symbol'].isin(self.approved_genes))
        return total_df[both_seeds].reset_index(drop=True)

    def intersection(self):
        '''
        Returns:

            - intersection df = intersection dataframe with all the common interactions btw iid df and biogrid df (we consider as in union
                    interactions with at least one seed gene and interactions among no-seeds which interact with our seed genes).

        NOTE(review): this is an inner merge on the four shared columns, so an interaction that is
        repeated in one of the sources appears repeated in the result, and an A-B row is not
        matched with a B-A row. Confirm whether a drop_duplicates() is wanted.
        '''
        bio_data = self.df_to_defualt_format(self.bio_data, 'Biogrid')
        iid_data = self.df_to_defualt_format(self.iid_data, 'IID')
        return pd.merge(iid_data.drop(['database source'], axis = 1),
                        bio_data.drop(['database source'], axis = 1),
                        how='inner')

    def save_results(self):
        '''
        Returns:

            - nothing; writes 'results/sgi.csv', 'results/union.csv' and 'results/intersection.csv'.
        '''
        sgi_df = self.sgi()
        union_df = self.union()
        intersection_df = self.intersection()

        # to_csv overwrites existing files, so nothing has to be removed beforehand
        os.makedirs('results', exist_ok=True)
        sgi_df.to_csv('results/sgi.csv')
        union_df.to_csv('results/union.csv')
        intersection_df.to_csv('results/intersection.csv')

    def save_genes(self):
        '''
        Returns:

            - nothing; writes 'data/seed_genes.txt' (approved seed genes, in list order) and
              'data/union_genes.txt' (every distinct gene symbol of the union interactome), one gene per line.
        '''
        union_df = self.union()
        os.makedirs('data', exist_ok=True)

        with open('data/seed_genes.txt', 'w') as file_seed:
            for gene in self.approved_genes:
                file_seed.write(gene + '\n')

        union_genes_set = set(union_df['interactor A gene symbol']) | set(union_df['interactor B gene symbol'])
        # sorted for a reproducible file
        with open('data/union_genes.txt', 'w') as file_union:
            for gene in sorted(union_genes_set, key=str):
                file_union.write(gene + '\n')


In [258]:
# Build the interactome object from the module-level `biogrid_interactions`,
# `iid_interactions` and `approved_genes` created by the previous cells.
# NOTE(review): this rebinds the name `interactome` from the class to an instance, so
# re-running this cell without re-running the class cell first will fail.
interactome = interactome()
# interactions among seed genes only (both sources)
sgi_df = interactome.sgi()
# all interactions of both sources, IID rows first
union_df = interactome.union()
# interactions found in both sources
intersection_df = interactome.intersection()
# save dfs on csv files
interactome.save_results()
# save union genes and seed genes on txt files
interactome.save_genes()

In [259]:
# Preview the first rows of the union interactome.
union_df.head()

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC,database source
0,TCAP,TTN,O15273,Q8WZ42,IID
1,TCAP,CSRP3,O15273,P50461,IID
2,TCAP,ENO3,O15273,P13929,IID
3,TCAP,ENO1,O15273,P06733,IID
4,TCAP,MYOZ1,O15273,Q9NP98,IID


In [260]:
# Preview the first rows of the seed-gene-only interactome.
sgi_df.head()

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC,database source
0,TCAP,TTN,O15273,Q8WZ42,IID
1,TCAP,CSRP3,O15273,P50461,IID
2,TCAP,FAS,O15273,P25445,IID
3,LDB3,ACTN2,O75112,P35609,IID
4,NEBL,MYPN,O76041,Q86TC9,IID


In [261]:
# Preview the first rows of the intersection interactome (repeated rows are visible in the output).
intersection_df.head()

Unnamed: 0,interactor A gene symbol,interactor B gene symbol,interactor A Uniprot AC,interactor B Uniprot AC
0,TCAP,TTN,O15273,Q8WZ42
1,TCAP,TTN,O15273,Q8WZ42
2,TCAP,MYOZ1,O15273,Q9NP98
3,TCAP,MDM2,O15273,Q00987
4,TCAP,MDM2,O15273,Q00987
