In [1]:
import pandas as pd
import os

# DisGeNET

In [2]:
# Read the curated gene-disease association *evidence* table (tab-separated)
# and preview its first rows.
evidence_path = 'C0007193_disease_gda_evidences_CURATED.tsv'
evid_df = pd.read_csv(evidence_path, sep='\t')
evid_df.head()

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,Score_gda,Association_Type,Type,Original_DB,Sentence,PMID,PMID_Year
0,"Cardiomyopathy, Dilated",C0007193,TNNI3,7137,0.66,Biomarker,,GENOMICS_ENGLAND,,,
1,"Cardiomyopathy, Dilated",C0007193,EPG5,57724,0.4,Biomarker,,GENOMICS_ENGLAND,Role of Epg5 in selective neurodegeneration an...,23674064.0,2013.0
2,"Cardiomyopathy, Dilated",C0007193,FASLG,356,0.3,Biomarker,,CTD_human,Myocardial Fas ligand expression increases sus...,17943461.0,2008.0
3,"Cardiomyopathy, Dilated",C0007193,LAMP2,3920,0.41,Biomarker,,GENOMICS_ENGLAND,,,
4,"Cardiomyopathy, Dilated",C0007193,LMNA,4000,0.7,Biomarker,,CTD_human,Missense mutations in the rod domain of the <s...,10580070.0,1999.0


In [3]:
# Read the curated gene-disease association *summary* table (tab-separated)
# and preview its first rows.
summary_path = 'C0007193_disease_gda_summary_CURATED.tsv'
summary_df = pd.read_csv(summary_path, sep='\t')
summary_df.head()

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,N_diseases_g,DSI_g,DPI_g,pLI,Score_gda,EL_gda,EI_gda,N_PMIDs,N_SNPs_gda,First_Ref,Last_Ref
0,"Cardiomyopathy, Dilated",C0007193,DMD,1756,P11532,dystrophin,,48,0.477,0.724,1.0,0.9,strong,0.9375,1,0,1993.0,2016.0
1,"Cardiomyopathy, Dilated",C0007193,TNNT2,7139,P45379,"troponin T2, cardiac type",cytoskeletal protein,46,0.611,0.241,0.002605,0.9,strong,1.0,2,11,2000.0,2018.0
2,"Cardiomyopathy, Dilated",C0007193,TTN,7273,Q8WZ42,titin,,37,0.534,0.655,6.433300000000001e-84,0.9,strong,1.0,3,201,1995.0,2017.0
3,"Cardiomyopathy, Dilated",C0007193,ACTC1,70,P68032,"actin, alpha, cardiac muscle 1",cytoskeletal protein,21,0.656,0.414,0.77606,0.84,strong,0.75,0,1,2002.0,2014.0
4,"Cardiomyopathy, Dilated",C0007193,MYH6,4624,P13533,myosin heavy chain 6,,20,0.621,0.552,3.2416000000000003e-31,0.84,strong,1.0,1,0,2006.0,2016.0


In [4]:
# print the gene symbols so they can be given as input to HGNC
#for gene in list(summary_df.Gene):
    #print(gene)

### NB

Difference between summary and evidence:

* Summary has a different gene in each row.

* Evidence has the same genes as Summary, but some of them are repeated.

If we look at the sizes of the two sets of genes, they are the same!

In [5]:
# Number of distinct gene symbols in the summary table, then in the evidence table
for frame in (summary_df, evid_df):
    print(len(set(frame['Gene'])))

101
101


### Clean Summary Dataframe

The TMPO gene has an irregular UniProt value: it holds two entries separated by ';'. For this reason we change this row, keeping just one of the two entries. Then we add a new row for the same gene with the other entry. In the end the same gene has 2 rows, which differ only in the UniProt value.

In [6]:
# Show the summary row(s) whose gene symbol is TMPO
is_tmpo = summary_df['Gene'].eq('TMPO')
summary_df.loc[is_tmpo]

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,N_diseases_g,DSI_g,DPI_g,pLI,Score_gda,EL_gda,EI_gda,N_PMIDs,N_SNPs_gda,First_Ref,Last_Ref
15,"Cardiomyopathy, Dilated",C0007193,TMPO,7112,P42166;P42167,thymopoietin,,6,0.752,0.241,1.7737e-07,0.61,limited,1.0,1,0,2006.0,2006.0


In [7]:
# Some UniProt cells (TMPO here) hold several accessions separated by ';'.
# Keep the first accession on the original row and append one copy of the row
# for every further accession, so each row ends up with a single accession.
# Affected rows are found by content instead of a hard-coded row index and
# column position, and re-running the cell changes nothing because no ';'
# is left after the first run.
uniprot_parts = summary_df['UniProt'].str.split(';')
# missing values yield NaN lengths, which compare False here
has_multiple = uniprot_parts.str.len() > 1

if has_multiple.any():
    # one additional row per accession after the first one
    extra_rows = (
        summary_df.loc[has_multiple]
        .assign(UniProt=uniprot_parts[has_multiple].str[1:])
        .explode('UniProt')
    )
    # the original rows keep only their first accession
    summary_df.loc[has_multiple, 'UniProt'] = uniprot_parts[has_multiple].str[0]
    # additional rows are appended at the end, continuing the integer index
    summary_df = pd.concat([summary_df, extra_rows], ignore_index=True)

In [8]:
# After the split, TMPO is expected on two rows (one per UniProt entry)
is_tmpo = summary_df['Gene'].eq('TMPO')
summary_df.loc[is_tmpo]

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,N_diseases_g,DSI_g,DPI_g,pLI,Score_gda,EL_gda,EI_gda,N_PMIDs,N_SNPs_gda,First_Ref,Last_Ref
15,"Cardiomyopathy, Dilated",C0007193,TMPO,7112,P42166,thymopoietin,,6,0.752,0.241,1.7737e-07,0.61,limited,1.0,1,0,2006.0,2006.0
101,"Cardiomyopathy, Dilated",C0007193,TMPO,7112,P42167,thymopoietin,,6,0.752,0.241,1.7737e-07,0.61,limited,1.0,1,0,2006.0,2006.0


# HGNC


In [9]:
# Read the HGNC symbol-check table; the column names are taken from the
# second line of the file (header=1).
hgnc_path = 'hgnc-symbol-check.csv'
hgnc_df = pd.read_csv(hgnc_path, header=1)

In [10]:
# Preview the first five rows of the HGNC table
hgnc_df.head(n=5)

Unnamed: 0,Input,Match type,Approved symbol,Approved name,HGNC ID,Location
0,DMD,Approved symbol,DMD,dystrophin,HGNC:2928,Xp21.2-p21.1
1,TNNT2,Approved symbol,TNNT2,"troponin T2, cardiac type",HGNC:11949,1q32.1
2,TTN,Approved symbol,TTN,titin,HGNC:12403,2q31.2
3,ACTC1,Approved symbol,ACTC1,actin alpha cardiac muscle 1,HGNC:143,15q14
4,MYH6,Approved symbol,MYH6,myosin heavy chain 6,HGNC:7576,14q11.2


In [11]:
# Input symbols that HGNC reports with match type 'Approved symbol'
approved_genes = hgnc_df.loc[hgnc_df['Match type'] == 'Approved symbol', 'Input'].tolist()

# Check that every gene of the summary table is approved. The symbols
# themselves are compared (not just the sizes of the two sets): equal counts
# alone would also be reported for two different gene sets.
summary_genes = set(summary_df.Gene)
missing_genes = summary_genes - set(approved_genes)
if not missing_genes:
    print('All genes are approved! They are', len(summary_genes))
else:
    # list the offending symbols so they can be checked by hand
    print('Not all genes are approved. Check!', sorted(missing_genes, key=str))

All genes are approved! They are 101


In [12]:
# Read the UniProt table from the Excel workbook and preview its first rows
uniprot_xlsx = 'uniprot-yourlist_M20191221E5A08BB0B2D1C45B0C7BC3B55FD2655669E875N+AND+organism--.xlsx'
uni_df = pd.read_excel(uniprot_xlsx)
uni_df.head()

Unnamed: 0,Entry,Protein names,Gene names,Organism,Gene names (primary ),Cross-reference (GeneID),Gene ontology (molecular function)
0,X5DQZ7,Glutathione peroxidase 1 isoform A,GPX1,Homo sapiens (Human),GPX1,,glutathione peroxidase activity [GO:0004602]
1,X5DQM5,Adrenoceptor beta 2 surface isoform A (Fragment),ADRB2,Homo sapiens (Human),ADRB2,,adenylate cyclase binding [GO:0008179]; beta2-...
2,X5DNJ6,Dystrophia myotonica-protein kinase isoform A ...,DMPK,Homo sapiens (Human),DMPK,,ATP binding [GO:0005524]; protein serine/threo...
3,X5D926,Glutathione peroxidase 1 isoform A,GPX1,Homo sapiens (Human),GPX1,,glutathione peroxidase activity [GO:0004602]
4,X5D8Z6,Dystrophia myotonica-protein kinase isoform B ...,DMPK,Homo sapiens (Human),DMPK,,ATP binding [GO:0005524]; protein serine/threo...


In [13]:
# dict: gene symbol -> list of its UniProt entries, in row order.
# setdefault gives a constant-time "create the list on first sight" step
# (instead of rebuilding list(keys()) for every row), and zipping the two
# columns avoids materialising a Series per row as iterrows does.
gene_uniprot = {}
for gene, uniprot_entry in zip(summary_df['Gene'], summary_df['UniProt']):
    gene_uniprot.setdefault(gene, []).append(uniprot_entry)

# flat list of all UniProt entries, grouped by gene
uniprot_list = [entry for entries in gene_uniprot.values() for entry in entries]


In [14]:
# Keep only the UniProt rows whose 'Entry' is one of the entries collected
# from the summary table.
entry_in_summary = uni_df['Entry'].isin(uniprot_list)
final_df = uni_df.loc[entry_in_summary]
final_df.head()

Unnamed: 0,Entry,Protein names,Gene names,Organism,Gene names (primary ),Cross-reference (GeneID),Gene ontology (molecular function)
20,Q9UPQ8,Dolichol kinase (EC 2.7.1.108) (Transmembrane ...,DOLK KIAA1094 TMEM15 UNQ2422/PRO4980,Homo sapiens (Human),DOLK,22845;,dolichol kinase activity [GO:0004168]
23,Q9UP52,Transferrin receptor protein 2 (TfR2),TFR2,Homo sapiens (Human),TFR2,7036;,co-receptor binding [GO:0039706]; transferrin ...
31,Q9UGJ0,5'-AMP-activated protein kinase subunit gamma-...,PRKAG2,Homo sapiens (Human),PRKAG2,51422;,ADP binding [GO:0043531]; AMP-activated protei...
35,Q9NP59,Solute carrier family 40 member 1 (Ferroportin...,SLC40A1 FPN1 IREG1 SLC11A3 MSTP079,Homo sapiens (Human),SLC40A1,30061;,ferrous iron transmembrane transporter activit...
36,Q9NNW7,"Thioredoxin reductase 2, mitochondrial (EC 1.8...",TXNRD2 KIAA1652 TRXR2,Homo sapiens (Human),TXNRD2,10587;,electron transfer activity [GO:0009055]; flavi...


In [15]:
# Check the number of rows: one more than the number of genes is expected,
# because one gene has 2 different entries in the summary df
final_df.shape[0]

102

In [16]:
# Remove the 'Gene names' column: the primary gene name column is sufficient
final_df = final_df.drop(columns=['Gene names'])

In [17]:
# Rename the columns positionally: the i-th current column gets the i-th new name
new_cols = [
    'Uniprot AC',
    'Protein_Name',
    'Organism',
    'Gene_Symbol',
    'Entrez_ID',
    'Function',
]
original_cols = final_df.columns.tolist()
final_df = final_df.rename(columns={old: new for old, new in zip(original_cols, new_cols)})
final_df.head()

Unnamed: 0,Uniprot AC,Protein_Name,Organism,Gene_Symbol,Entrez_ID,Function
20,Q9UPQ8,Dolichol kinase (EC 2.7.1.108) (Transmembrane ...,Homo sapiens (Human),DOLK,22845;,dolichol kinase activity [GO:0004168]
23,Q9UP52,Transferrin receptor protein 2 (TfR2),Homo sapiens (Human),TFR2,7036;,co-receptor binding [GO:0039706]; transferrin ...
31,Q9UGJ0,5'-AMP-activated protein kinase subunit gamma-...,Homo sapiens (Human),PRKAG2,51422;,ADP binding [GO:0043531]; AMP-activated protei...
35,Q9NP59,Solute carrier family 40 member 1 (Ferroportin...,Homo sapiens (Human),SLC40A1,30061;,ferrous iron transmembrane transporter activit...
36,Q9NNW7,"Thioredoxin reductase 2, mitochondrial (EC 1.8...",Homo sapiens (Human),TXNRD2,10587;,electron transfer activity [GO:0009055]; flavi...


In [18]:
# Save the final table to csv. to_csv opens the target in write mode, so an
# existing 'data.csv' is overwritten: deleting it first (and silencing every
# possible exception with a bare except) is not needed.
final_df.to_csv('data.csv')