In [3]:
import pandas as pd
from typing import List, Dict
from collections import defaultdict





In [4]:
columns_cg_str = "ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs"
columns_cg = columns_cg_str.split(",")

column_cg_dtype = {"ChemicalName": str,
                "ChemicalID": str,
                "CasRN": str,
                "GeneSymbol": str,
                "GeneID": str,
                "GeneForms": str,
                "Organism": str,
                "OrganismID": str,
                "Interaction": str,
                "InteractionActions": str,
                "PubMedIDs": str}
df_chem_gene = pd.read_csv("http://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz", compression='gzip', header=0, sep='\t', comment='#', names=columns_cg, low_memory=False, dtype=column_cg_dtype)
df_chem_gene.head(3)

df_chem_gene = df_chem_gene.loc[df_chem_gene['Organism'] == 'Homo sapiens']


In [5]:
df_chem_gene = df_chem_gene[df_chem_gene['CasRN'].notna()]
df_chem_gene
 

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,gene,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^chemical synthesis|affects^metabolic p...,15692831
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,protein,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^chemical synthesis|increases^metabol...,15692831
50,10-(alpha-diethylaminopropionyl)phenothiazine,C013759,63834-04-8,HAMP,57817,mRNA,Homo sapiens,9606,10-(alpha-diethylaminopropionyl)phenothiazine ...,decreases^expression,25633564
52,10-decarbamoylmitomycin C,C067795,26909-37-5,CASP3,836,protein,Homo sapiens,9606,10-decarbamoylmitomycin C results in increased...,increases^activity,17530733
53,10-decarbamoylmitomycin C,C067795,26909-37-5,CASP9,842,protein,Homo sapiens,9606,10-decarbamoylmitomycin C results in increased...,increases^activity,17530733
...,...,...,...,...,...,...,...,...,...,...,...
2320048,Zymosan,D015054,9010-72-4,TNFRSF1A,7132,protein,Homo sapiens,9606,Zymosan results in increased expression of TNF...,increases^expression,10072544
2320060,Zymosan,D015054,9010-72-4,TNFRSF1B,7133,protein,Homo sapiens,9606,Zymosan results in increased expression of TNF...,increases^expression,10072544
2320067,Zymosan,D015054,9010-72-4,XIAP,331,mRNA,Homo sapiens,9606,Zymosan analog results in decreased expression...,decreases^expression,16803582
2320068,zymosterol,C015582,128-33-6,CYP27A1,1593,protein,Homo sapiens,9606,CYP27A1 protein results in increased metabolis...,increases^metabolic processing,14622972


In [6]:
df_chem_gene['IntAct'] = df_chem_gene.InteractionActions.str.split('|')
df_chemical_gene = df_chem_gene.explode('IntAct')
df_chemical_gene.drop(labels=['InteractionActions', 'GeneForms', 'PubMedIDs'], axis=1, inplace=True)
df_chemical_gene.rename(columns={'IntAct': 'InteractionActions'}, inplace=True)
df_chemical_gene.head()
#df_chemical_gene.rename(columns={'IntAct': 'InteractionActions'}, inplace=True)

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,Organism,OrganismID,Interaction,InteractionActions
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^chemical synthesis
19,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 gene SNP affects the metabolism of carb...,affects^metabolic processing
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^chemical synthesis
20,"10,11-dihydro-10,11-dihydroxy-5H-dibenzazepine...",C004822,35079-97-1,EPHX1,2052,Homo sapiens,9606,[EPHX1 protein results in increased metabolism...,increases^metabolic processing
50,10-(alpha-diethylaminopropionyl)phenothiazine,C013759,63834-04-8,HAMP,57817,Homo sapiens,9606,10-(alpha-diethylaminopropionyl)phenothiazine ...,decreases^expression


In [14]:
 
gene_list =[ 'UCHL1', 'PDPK1']
interaction_type ='increases^expression'

def get_cas_ids(hgnc_symbols:List,interaction_type:str = None) -> Dict:
    
    ##################### filtering needed data #######################
    searched=df_chemical_gene[df_chemical_gene['GeneSymbol'].isin(gene_list)]
    searched = searched[['GeneSymbol', 'ChemicalID', 'CasRN', 'InteractionActions']]
    searched=searched.loc[searched['InteractionActions'] == interaction_type]
    cleaned=searched.drop_duplicates(keep=False)
    cleaned=searched.drop_duplicates(keep=False)

    ##################### initializing empty dictionary ############# 
    hgnc_symbols=cleaned['GeneSymbol'].unique().tolist()
    chemical_gene_interactions = {}
    for hgnc_symbol in hgnc_symbols:
        chemical_gene_interactions[hgnc_symbol]=[]
   

    #################################################################
    # below I would like to generate the dictionary that you decided manually to create
    for index, row in cleaned.iterrows():
        gene = (row['GeneSymbol'])
        chemicalid = row["ChemicalID"]
        cas_id = row["CasRN"]
        interaction=row['InteractionActions']
        chemical_gene_interactions[gene].append({'chemical_id': chemicalid, 'cas_id': cas_id, 'interaction': interaction})

    return chemical_gene_interactions
    
 
get_cas_ids(gene_list,interaction_type)    

{'UCHL1': [{'chemical_id': 'D000082',
   'cas_id': '103-90-2',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D020106',
   'cas_id': '79-06-1',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D016604',
   'cas_id': '1162-65-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'C000228',
   'cas_id': '313-67-7',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D001151',
   'cas_id': '7440-38-2',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D000077237',
   'cas_id': '1327-53-3',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D001564',
   'cas_id': '50-32-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D003035',
   'cas_id': '7440-48-4',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D003300',
   'cas_id': '7440-50-8',
   'interaction': 'increases^expression'},
  {'chemical_id': 'D019327',
   'cas_id': '7758-98-7',
   'interaction': 'increases^expression'},
  {'chemical_i