In [53]:
import pandas as pd
from typing import List, Dict

In [54]:
def normalize_column_names(columns):
    return [column.lower() for column in columns]
    

def add_index(df):
    df.index.rename('id', inplace=True)
    df.index += 1
    return 

## Loading chemical diseases table and filtering it

In [101]:
columns_cd_str = "ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs"
columns_cd = columns_cd_str.split(",")
columns_final_cd = normalize_column_names(columns_cd)

column_cd_dtype = {"chemicalname": str,
                "chemicalid": str,
                "casrn": str,
                "diseasename": str,
                "diseaseid": str,
                "directevidence": str,
                "inferencegenesymbol": str,
                "inferencescore": str,
                "omimids": str,
                "pubmedids": str}
                
df_chem_diseases = pd.read_csv("http://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz", 
                               compression='gzip', header=0, sep='\t', comment='#', names=columns_final_cd, 
                               low_memory=False, dtype=column_cd_dtype)
df_chem_diseases.head(3)



Unnamed: 0,chemicalname,chemicalid,casrn,diseasename,diseaseid,directevidence,inferencegenesymbol,inferencescore,omimids,pubmedids
0,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.09,,26432044
1,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.32,,26656844|27602772
2,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.5,,15902657


In [102]:
df_chem_diseases = df_chem_diseases[df_chem_diseases['casrn'].notna()]
df_chem_diseases.head(3)
                      

Unnamed: 0,chemicalname,chemicalid,casrn,diseasename,diseaseid,directevidence,inferencegenesymbol,inferencescore,omimids,pubmedids
121,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,"Abnormalities, Drug-Induced",MESH:D000014,,EPHX1,5.15,,2336087
122,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,Acute Lung Injury,MESH:D055371,,EPHX1,5.37,,26840748
123,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,Amphetamine-Related Disorders,MESH:D019969,,EPHX1,5.44,,19598248


In [103]:
df_chem_diseases.drop(labels=['chemicalname', 'casrn', 'directevidence', 'inferencegenesymbol', 'omimids', 'pubmedids'],\
                      axis=1, inplace=True)
 

In [104]:
df_chem_diseases

Unnamed: 0,chemicalid,diseasename,diseaseid,inferencescore
121,C004822,"Abnormalities, Drug-Induced",MESH:D000014,5.15
122,C004822,Acute Lung Injury,MESH:D055371,5.37
123,C004822,Amphetamine-Related Disorders,MESH:D019969,5.44
124,C004822,Carcinoma,MESH:D002277,5.06
125,C004822,"Carcinoma, Hepatocellular",MESH:D006528,4.58
...,...,...,...,...
7805137,C015582,"Bone Diseases, Metabolic",MESH:D001851,5.99
7805138,C015582,Cholestasis,MESH:D002779,4.91
7805139,C015582,Liver Cirrhosis,MESH:D008103,4.98
7805140,C015582,"MICROCEPHALY, CONGENITAL CATARACT, AND PSORIAS...",OMIM:616834,7.81


In [105]:
filteredcontains = df_chem_diseases[df_chem_diseases['diseasename'].str.contains("PARKINSON")]
filteredcontains


Unnamed: 0,chemicalid,diseasename,diseaseid,inferencescore
23008,C026486,"PARKINSON DISEASE 1, AUTOSOMAL DOMINANT",OMIM:168601,4.65
23010,C026486,"PARKINSON DISEASE, LATE-ONSET",OMIM:168600,3.53
27075,C025603,"PARKINSON DISEASE 1, AUTOSOMAL DOMINANT",OMIM:168601,6.16
33580,C048792,"PARKINSON DISEASE, LATE-ONSET",OMIM:168600,6.20
41220,D019813,AMYOTROPHIC LATERAL SCLEROSIS-PARKINSONISM/DEM...,OMIM:105500,3.64
...,...,...,...,...
7758912,D015032,"PARKINSON DISEASE, LATE-ONSET",OMIM:168600,6.17
7762845,D019345,"PARKINSON DISEASE, LATE-ONSET",OMIM:168600,4.30
7772489,D015034,"PARKINSON DISEASE, LATE-ONSET",OMIM:168600,3.93
7780631,D019287,"PARKINSON DISEASE 5, AUTOSOMAL DOMINANT, SUSCE...",OMIM:613643,4.84


### Loading chemical gene table and filtering it

In [98]:
columns_cg_str = "ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs"
columns_cg = columns_cg_str.split(",")

column_cg_dtype = {"ChemicalName": str,
                "ChemicalID": str,
                "CasRN": str,
                "GeneSymbol": str,
                "GeneID": str,
                "GeneForms": str,
                "Organism": str,
                "OrganismID": str,
                "Interaction": str,
                "InteractionActions": str,
                "PubMedIDs": str}
df_chem_gene = pd.read_csv("http://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz", compression='gzip', header=0, sep='\t', comment='#', names=columns_cg, low_memory=False, dtype=column_cg_dtype)
df_chem_gene = df_chem_gene.loc[df_chem_gene['Organism'] == 'Homo sapiens']
df_chem_gene.head(3)


Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
0,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358
1,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 results in decreased expression of AR...,decreases^expression,32184358
2,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 results in decreased expression of AR...,decreases^expression,32184358


In [99]:
df_chem_gene = df_chem_gene[df_chem_gene['CasRN'].notna()]
df_chem_gene

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,gene,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^chemical synthesis|affects^metabolic p...,15692831
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,protein,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^chemical synthesis|increases^metabol...,15692831
50,10-(alpha-diethylaminopropionyl)phenothiazine,C013759,63834-04-8,HAMP,57817,mRNA,Homo sapiens,9606,10-(alpha-diethylaminopropionyl)phenothiazine ...,decreases^expression,25633564
52,10-decarbamoylmitomycin C,C067795,26909-37-5,CASP3,836,protein,Homo sapiens,9606,10-decarbamoylmitomycin C results in increased...,increases^activity,17530733
53,10-decarbamoylmitomycin C,C067795,26909-37-5,CASP9,842,protein,Homo sapiens,9606,10-decarbamoylmitomycin C results in increased...,increases^activity,17530733
...,...,...,...,...,...,...,...,...,...,...,...
2320048,Zymosan,D015054,9010-72-4,TNFRSF1A,7132,protein,Homo sapiens,9606,Zymosan results in increased expression of TNF...,increases^expression,10072544
2320060,Zymosan,D015054,9010-72-4,TNFRSF1B,7133,protein,Homo sapiens,9606,Zymosan results in increased expression of TNF...,increases^expression,10072544
2320067,Zymosan,D015054,9010-72-4,XIAP,331,mRNA,Homo sapiens,9606,Zymosan analog results in decreased expression...,decreases^expression,16803582
2320068,zymosterol,C015582,128-33-6,CYP27A1,1593,protein,Homo sapiens,9606,CYP27A1 protein results in increased metabolis...,increases^metabolic processing,14622972


In [100]:
df_chem_gene['IntAct'] = df_chem_gene.InteractionActions.str.split('|')
df_chemical_gene = df_chem_gene.explode('IntAct')
df_chemical_gene.drop(labels=['InteractionActions', 'GeneForms', 'PubMedIDs'], axis=1, inplace=True)
df_chemical_gene.rename(columns={'IntAct': 'InteractionActions'}, inplace=True)
df_chemical_gene.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,Organism,OrganismID,Interaction,InteractionActions
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^chemical synthesis
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^metabolic processing
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^chemical synthesis
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^metabolic processing
50,10-(alpha-diethylaminopropionyl)phenothiazine,C013759,63834-04-8,HAMP,57817,Homo sapiens,9606,10-(alpha-diethylaminopropionyl)phenothiazine ...,decreases^expression


### Function get_cas_id() to get those chemicals that are assosiated with most up or down regulated genes

In [67]:
gene_list =[ 'UCHL1', 'PDPK1']
interaction_type ='increases^expression'

def get_cas_ids(hgnc_symbols:List,interaction_type:str = None) -> Dict:
    
    ##################### filtering needed data #####################
    searched=df_chemical_gene[df_chemical_gene['GeneSymbol'].isin(gene_list)]
    searched = searched[['GeneSymbol', 'ChemicalID', 'CasRN', 'InteractionActions']]
    searched=searched.loc[searched['InteractionActions'] == interaction_type]
    cleaned=searched.drop_duplicates(keep=False)


    ##################### initializing empty dictionary ############# 
    hgnc_symbols=cleaned['GeneSymbol'].unique().tolist()
    chemical_gene_interactions = {}
    for hgnc_symbol in hgnc_symbols:
        chemical_gene_interactions[hgnc_symbol]=[]
   

    #################################################################
    # below I would like to generate the dictionary that you decided manually to create
    for index, row in cleaned.iterrows():
        gene = (row['GeneSymbol'])
        chemicalid = row["ChemicalID"]
        cas_id = row["CasRN"]
        interaction=row['InteractionActions']
        chemical_gene_interactions[gene].append({'chemical_id': chemicalid, 'cas_id': cas_id, 'interaction': interaction})

    return chemical_gene_interactions
    
 
get_cas_ids(gene_list,interaction_type)   

{'UCHL1': [{'chemical_id': 'D000082',
   'cas_id': '103-90-2',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D020106',
   'cas_id': '79-06-1',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D016604',
   'cas_id': '1162-65-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'C000228',
   'cas_id': '313-67-7',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D001151',
   'cas_id': '7440-38-2',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D000077237',
   'cas_id': '1327-53-3',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D001564',
   'cas_id': '50-32-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D003035',
   'cas_id': '7440-48-4',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D003300',
   'cas_id': '7440-50-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D019327',
   'cas_id': '7758-98-7',
   'interaction': 'increases^expression'},
  {'chemical_i

In [110]:
# http://ctdbase.org/detail.go?type=disease&acc=MESH:D010300
disease = 'PARKINSON'

def filter_chemical_disease_association(chemical_ids, disease):
    """Filters out chemical ids that do not correspond to Parkinson's 
        disease from the chmeical_disease_associations table
        Returns
        -------
        filtered_chemical_ids: List
            List of chemical ids that are associated with Parkinson's disease.            
    """
    ################# take previous dataframe and take chemicalid from there ########################
    # search for them in chem diseases dataframe
    # search for those that have parkinson disease    
    filteredcontains = df_chem_diseases[df_chem_diseases['diseasename'].str.contains(disease)]
    diction =get_cas_ids(gene_list,interaction_type)  
    chem_id = set()
    
    for key, value in diction.items():
        for element in diction[key]:
            chem_id.add(element['chemical_id'])

    return list(chem_id)

In [111]:
chemical_ids=[]
filter_chemical_disease_association(chemical_ids, disease)

['D001151',
 'D000077209',
 'D014520',
 'C000228',
 'C002802',
 'D019327',
 'D016572',
 'D019256',
 'D003035',
 'C006253',
 'C017947',
 'D002104',
 'D014810',
 'D000082',
 'D003300',
 'D016604',
 'D001564',
 'D000077237',
 'D020106',
 'C049032',
 'D014635',
 'D008769']

In [112]:
chemical_ids=[]
filteredlist=filter_chemical_disease_association(chemical_ids, disease)
filteredcontains=filteredcontains.loc[filteredcontains['chemicalid'].isin(filteredlist)]
filteredcontains=filteredcontains.drop_duplicates(keep=False)


In [None]:
filteredcontains.drop('diseaseid', axis=1, inplace=True)

In [114]:
filteredcontains=filteredcontains.sort_values(by=['inferencescore'])
filteredcontains

Unnamed: 0,chemicalid,diseasename,inferencescore
1583878,D001564,"PARKINSON DISEASE 1, AUTOSOMAL DOMINANT",3.13
7545474,D014635,"PARKINSON DISEASE 1, AUTOSOMAL DOMINANT",3.18
7656931,D014810,"PARKINSON DISEASE, LATE-ONSET",3.25
871860,D000082,"PARKINSON DISEASE 1, AUTOSOMAL DOMINANT",3.28
1583883,D001564,"PARKINSON DISEASE 5, AUTOSOMAL DOMINANT, SUSCE...",3.33
...,...,...,...
7656926,D014810,"PARKINSON DISEASE 11, AUTOSOMAL DOMINANT, SUSC...",5.03
2403477,D003035,"PARKINSON DISEASE 5, AUTOSOMAL DOMINANT, SUSCE...",5.31
4888204,D008769,"PARKINSON DISEASE 5, AUTOSOMAL DOMINANT, SUSCE...",5.39
7461899,C002802,"PARKINSON DISEASE 5, AUTOSOMAL DOMINANT, SUSCE...",5.74
