### Preprocess PharmGKB

In [2]:
import re
import warnings

# Installed before the remaining imports so their import-time warnings stay hidden.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pubchempy as pcp
import requests
from tqdm import tqdm

#### Part 1: Evidence filtering

In [3]:
# Load the PharmGKB clinical-variant annotations (tab-separated).
# on_bad_lines='skip' makes pandas drop rows it cannot parse instead of raising.
pharmgkb_df = pd.read_csv(
    '../datasets/rawData-21-Dec-2022/pharmgkb/clinicalVariants/clinicalVariants.tsv',
    sep='\t',
    on_bad_lines='skip',
)
print(pharmgkb_df)
pharmgkb_df.head()

                                      variant    gene           type  \
0     CYP2C9*1, CYP2C9*2, CYP2C9*3, CYP2C9*13  CYP2C9  Metabolism/PK   
1               CYP2C9*1, CYP2C9*3, CYP2C9*13  CYP2C9  Metabolism/PK   
2                CYP2C9*1, CYP2C9*2, CYP2C9*3  CYP2C9  Metabolism/PK   
3                                  rs17376848    DPYD       Toxicity   
4                                   rs2297595    DPYD       Toxicity   
...                                       ...     ...            ...   
5028   CYP2C8*1, CYP2C8*2, CYP2C8*3, CYP2C8*4  CYP2C8       Toxicity   
5029                                rs1801253   ADRB1       Efficacy   
5030                                rs2230345    GRK5       Efficacy   
5031                                rs1042713   ADRB2       Efficacy   
5032                              HLA-A*02:01   HLA-A       Toxicity   

     level of evidence             chemicals  \
0                   1A             meloxicam   
1                   1A            lorno

Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
0,"CYP2C9*1, CYP2C9*2, CYP2C9*3, CYP2C9*13",CYP2C9,Metabolism/PK,1A,meloxicam,
1,"CYP2C9*1, CYP2C9*3, CYP2C9*13",CYP2C9,Metabolism/PK,1A,lornoxicam,
2,"CYP2C9*1, CYP2C9*2, CYP2C9*3",CYP2C9,Metabolism/PK,1A,siponimod,
3,rs17376848,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms


In [34]:
# Keep annotations that name exactly one chemical: rows without a chemical are
# dropped first, then rows whose chemical string contains ',' or '/'.
chemical_df = pharmgkb_df.dropna(subset=['chemicals'])
for separator in (',', '/'):
    chemical_df = chemical_df.loc[~chemical_df['chemicals'].str.contains(separator)]
chemical_df

Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
0,"CYP2C9*1, CYP2C9*2, CYP2C9*3, CYP2C9*13",CYP2C9,Metabolism/PK,1A,meloxicam,
1,"CYP2C9*1, CYP2C9*3, CYP2C9*13",CYP2C9,Metabolism/PK,1A,lornoxicam,
2,"CYP2C9*1, CYP2C9*2, CYP2C9*3",CYP2C9,Metabolism/PK,1A,siponimod,
3,rs17376848,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms
...,...,...,...,...,...,...
5028,"CYP2C8*1, CYP2C8*2, CYP2C8*3, CYP2C8*4",CYP2C8,Toxicity,4,paclitaxel,"Drug Toxicity,Neutropenia,Peripheral Nervous S..."
5029,rs1801253,ADRB1,Efficacy,4,Beta Blocking Agents,"Cardiomyopathy, Dilated,Heart Failure"
5030,rs2230345,GRK5,Efficacy,4,Beta Blocking Agents,Heart Failure
5031,rs1042713,ADRB2,Efficacy,4,Beta Blocking Agents,"Cardiomyopathy, Dilated,Heart Failure"


In [35]:
# Keep rows whose variant string contains 'rs' (rsID-style names such as
# rs17376848). Despite the variable name, no missense check happens in this
# cell; it only separates rsID rows from the remaining variant names.
has_rsid = chemical_df['variant'].str.contains('rs')
missense_df = chemical_df.loc[has_rsid]
missense_df

Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
3,rs17376848,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms
5,rs1801265,DPYD,Toxicity,1A,capecitabine,Neoplasms
8,rs1801160,DPYD,Toxicity,1A,capecitabine,Neoplasms
9,rs1801159,DPYD,Toxicity,1A,capecitabine,Neoplasms
...,...,...,...,...,...,...
5026,rs1045642,ABCB1,Efficacy,4,docetaxel,"Breast Neoplasms,Neoplasms"
5027,rs1045642,ABCB1,Toxicity,4,paclitaxel,"Drug Toxicity,Neoplasms,Neutropenia,Peripheral..."
5029,rs1801253,ADRB1,Efficacy,4,Beta Blocking Agents,"Cardiomyopathy, Dilated,Heart Failure"
5030,rs2230345,GRK5,Efficacy,4,Beta Blocking Agents,Heart Failure


In [36]:
# Discard every annotation whose 'level of evidence' string contains '4'.
is_level_4 = missense_df['level of evidence'].str.contains('4')
level_df = missense_df.loc[~is_level_4]
level_df

Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
3,rs17376848,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms
5,rs1801265,DPYD,Toxicity,1A,capecitabine,Neoplasms
8,rs1801160,DPYD,Toxicity,1A,capecitabine,Neoplasms
9,rs1801159,DPYD,Toxicity,1A,capecitabine,Neoplasms
...,...,...,...,...,...,...
4760,rs12948059,MAP2K6,Toxicity,3,sorafenib,Drug Toxicity
4761,rs315498,,Toxicity,3,sorafenib,Drug Toxicity
4762,rs9973653,EPAS1,Toxicity,3,sorafenib,Drug Toxicity
4763,rs10737062,CAMK1D,Efficacy,3,losartan,Essential hypertension


In [37]:
# Narrow the evidence rows to the four columns used downstream and collapse
# rows that became identical once the other columns were removed.
columns = ['gene', 'variant', 'chemicals', 'type']
level_df = level_df.reindex(columns=columns).drop_duplicates()
level_df

Unnamed: 0,gene,variant,chemicals,type
3,DPYD,rs17376848,capecitabine,Toxicity
4,DPYD,rs2297595,capecitabine,Toxicity
5,DPYD,rs1801265,capecitabine,Toxicity
8,DPYD,rs1801160,capecitabine,Toxicity
9,DPYD,rs1801159,capecitabine,Toxicity
...,...,...,...,...
4760,MAP2K6,rs12948059,sorafenib,Toxicity
4761,,rs315498,sorafenib,Toxicity
4762,EPAS1,rs9973653,sorafenib,Toxicity
4763,CAMK1D,rs10737062,losartan,Efficacy


#### Part 2: missense filtering

In [16]:
# Load the PharmGKB variant table, keep the three columns needed for parsing,
# and retain only variants that have a gene symbol and whose synonym string
# contains 'NP'.
columns = ['Variant Name', 'Gene Symbols', 'Synonyms']
variant_df = (
    pd.read_csv(
        '../datasets/rawData-21-Dec-2022/pharmgkb/variants.tsv',
        sep='\t',
        on_bad_lines='skip',
    )
    .reindex(columns=columns)
    .dropna(subset=['Gene Symbols'])
)
variant_df = variant_df.loc[variant_df['Synonyms'].str.contains('NP')].reset_index(drop=True)
variant_df

Unnamed: 0,Variant Name,Gene Symbols,Synonyms
0,rs10012,CYP1B1,"rs3172104, NG_008386.2:g.5855C>G, 58311708, NG..."
1,rs10036156,GABRP,"XM_005265874.1:c.19T>C, XP_005265928.1:p.Leu7=..."
2,rs1013940,SLC5A7,"NP_068587.1:p.Ile89Val, NG_042267.1:g.10679A>G..."
3,rs10187694,"UGT1A10,UGT1A13P,UGT1A8","NC_000002.12:g.233636937=, rs10187694, rs17866..."
4,rs1021737,CTH,"NC_000001.11:g.70439117=, NP_001893.2:p.Ser403..."
...,...,...,...
1852,rs1564660997,CYP2C19,"NP_000760.1:p.His78Tyr, NC_000010.10:g.9653487..."
1853,rs1931013246,CYP2D6,"NC_000022.11:g.42126956T>G, NC_000022.11:g.421..."
1854,rs201821708,CYP3A4,"NG_054907.1:g.395=, NG_054907.1:g.395T>C, NP_0..."
1855,rs375805362,CYP2C9,"NG_008385.2:g.9076=, NG_008385.2:g.9076C>T, NP..."


In [21]:
# Three-letter -> one-letter amino-acid codes for the 20 residues handled here.
# The two sequences below are position-aligned.
_three_letter_codes = (
    'Gly Ala Val Leu Ile '
    'Pro Phe Tyr Trp Ser '
    'Thr Cys Met Asn Gln '
    'Asp Glu Lys Arg His'
).split()
_one_letter_codes = 'GAVLI' 'PFYWS' 'TCMNQ' 'DEKRH'
protein_dict = dict(zip(_three_letter_codes, _one_letter_codes))

In [22]:
# Sanity check of the slicing used to split a protein change such as
# 'Arg48Gly' into reference residue, position and replacement residue.
string = 'Arg48Gly'
ref_part, position_part, alt_part = string[:3], string[3:-3], string[-3:]
print(ref_part, position_part, alt_part)

Arg 48 Gly


In [30]:
# Build `variant_table`: one row per variant whose first non-synonymous NP
# synonym is a simple substitution (e.g. NP_000095.2:p.Arg48Gly -> R48G).
#
# Changes compared with the row-by-row version:
#   * rows are collected in a list and turned into a DataFrame once;
#     DataFrame.append was removed in pandas 2.0 and copied the frame per row;
#   * the protein change is matched against <Xxx><digits><Xxx> as a whole, so
#     strings such as 'Arg48delinsGlyAla' are skipped instead of yielding a
#     malformed variant like 'R48delinsGlyA';
#   * a guard fails loudly if `protein_dict` no longer holds three-letter keys
#     (another cell in this notebook rebinds the name to a one-letter table),
#     which would otherwise silently produce an empty table.
if 'Arg' not in protein_dict:
    raise RuntimeError(
        'protein_dict does not hold the three-letter -> one-letter map; '
        're-run the cell that defines it before this one'
    )

missense_pattern = re.compile(r'([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})')

variant_records = []
for gene_symbols, rsid, synonyms in zip(
    variant_df['Gene Symbols'], variant_df['Variant Name'], variant_df['Synonyms']
):
    # Only the first listed gene symbol is kept.
    gene = gene_symbols.split(',')[0]
    np_list = [syn.strip() for syn in synonyms.split(',') if 'NP' in syn]
    # Synonyms containing '=' describe an unchanged residue, i.e. not missense.
    missense_list = [syn for syn in np_list if '=' not in syn]
    if not missense_list:
        continue
    # As before, only the first candidate is considered.
    NPid = missense_list[0]
    match = missense_pattern.fullmatch(NPid.split('p.')[-1])
    if match is None:
        continue
    ref, position, alt = match.groups()
    if ref not in protein_dict or alt not in protein_dict:
        # e.g. 'Ter' or other codes outside the 20 residues in protein_dict
        continue
    variant_records.append({
        'gene': gene,
        'rsid': rsid,
        'variant': protein_dict[ref] + position + protein_dict[alt],
        'NPid': NPid,
    })

variant_table = pd.DataFrame(variant_records, columns=['gene', 'rsid', 'variant', 'NPid'])
variant_table


Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,GABRP,rs10036156,L7V,NP_055026.1:p.Leu7Val
2,SLC5A7,rs1013940,I89V,NP_068587.1:p.Ile89Val
3,UGT1A10,rs10187694,E139K,NP_061948.1:p.Glu139Lys
4,CTH,rs1021737,S403I,NP_001893.2:p.Ser403Ile
...,...,...,...,...
1480,CYP2C19,rs1564657013,S51G,NP_000760.1:p.Ser51Gly
1481,CYP2C19,rs1564660997,H78Y,NP_000760.1:p.His78Tyr
1482,CYP2D6,rs1931013246,K404Q,NP_000097.3:p.Lys404Gln
1483,CYP3A4,rs201821708,Y319C,NP_059488.2:p.Tyr319Cys


In [38]:
# Collect the evidence rsIDs for which no protein-level change was parsed
# into variant_table, and report how many there are.
evidence_variant_list = level_df['variant'].unique()
table_variant_list = variant_table['rsid'].unique()
parsed_rsids = set(table_variant_list)
error_list = [rsid for rsid in evidence_variant_list if rsid not in parsed_rsids]
print(len(error_list))

1677


In [40]:
# Drop the evidence rows whose rsID is listed in error_list.
# Membership is tested with an exact match: the earlier str.contains(item)
# treated every rsID as a substring/regex pattern, so a short ID such as
# 'rs671' also removed every other rsID that merely contains it.
level_df = level_df[~level_df['variant'].isin(error_list)].reset_index(drop=True)
level_df

Unnamed: 0,gene,variant,chemicals,type
0,DPYD,rs2297595,capecitabine,Toxicity
1,DPYD,rs1801265,capecitabine,Toxicity
2,DPYD,rs1801160,capecitabine,Toxicity
3,DPYD,rs1801159,capecitabine,Toxicity
4,DPYD,rs1801158,fluorouracil,Toxicity
...,...,...,...,...
1057,ABCB1,rs1045642,highly active antiretroviral therapy (haart),Efficacy
1058,AHRR,rs2292596,methotrexate,Efficacy
1059,ABCB1,rs1045642,tramadol,Toxicity
1060,SLCO1B1,rs4149056,Glucarpidase,Toxicity


In [42]:
# Restrict variant_table to the rsIDs that still appear in the evidence table.
# Exact membership (isin) replaces str.contains(item), which matched rsIDs as
# substrings/regex patterns and could therefore discard unrelated variants.
evidence_variant_list = level_df['variant'].unique()
table_variant_list = variant_table['rsid'].unique()
evidence_rsids = set(evidence_variant_list)
# Kept so the list of discarded rsIDs can still be inspected after this cell.
error_list = [rsid for rsid in table_variant_list if rsid not in evidence_rsids]
variant_table = variant_table[variant_table['rsid'].isin(evidence_variant_list)]
variant_table = variant_table.reset_index(drop=True)
variant_table

Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,GRK4,rs1024323,A142V,NP_001004057.1:p.Ala142Val
2,CYP2C9,rs1029359343,R307K,NP_000762.2:p.Arg307Lys
3,GLP1R,rs10305420,P7L,NP_002053.3:p.Pro7Leu
4,ZNF568,rs10405238,Y488D,NP_001191767.1:p.Tyr488Asp
...,...,...,...,...
514,ABCB1,rs9282564,N21H,NP_000918.2:p.Asn21His
515,RRP1B,rs9306160,L436P,NP_055871.1:p.Leu436Pro
516,CYP2C9,rs9332239,P489S,NP_000762.2:p.Pro489Ser
517,MYLIP,rs9370867,N342I,NP_037394.2:p.Asn342Ile


In [58]:
# Store the gene column with object dtype.
level_df = level_df.astype({'gene': object})

In [63]:
# Manual curation: evidence rows for rs16947 are assigned to gene CYP2D6.
level_df.loc[level_df['variant'].eq('rs16947'), 'gene'] = 'CYP2D6'

In [64]:
# Build evidence_table from level_df, keeping only the first symbol when the
# gene field lists several comma-separated genes.
#
# Changes compared with the row-by-row version:
#   * a row without a gene symbol used to inherit the gene of the previous row
#     (the except branch only printed and then reused the stale `gene`); such
#     rows are now reported and dropped instead of being mislabelled;
#   * the builtin `type` is no longer shadowed;
#   * no DataFrame.append (removed in pandas 2.0).
evidence_table = level_df[['gene', 'variant', 'chemicals', 'type']].copy()
evidence_table['gene'] = evidence_table['gene'].str.split(',').str[0]

missing_gene = evidence_table['gene'].isna()
if missing_gene.any():
    print(
        'dropping variants without a gene symbol:',
        evidence_table.loc[missing_gene, 'variant'].unique().tolist(),
    )
# Later cells index this table by position, so keep a 0..n-1 index.
evidence_table = evidence_table[~missing_gene].reset_index(drop=True)
evidence_table

Unnamed: 0,gene,variant,chemicals,type
0,DPYD,rs2297595,capecitabine,Toxicity
1,DPYD,rs1801265,capecitabine,Toxicity
2,DPYD,rs1801160,capecitabine,Toxicity
3,DPYD,rs1801159,capecitabine,Toxicity
4,DPYD,rs1801158,fluorouracil,Toxicity
...,...,...,...,...
1057,ABCB1,rs1045642,highly active antiretroviral therapy (haart),Efficacy
1058,AHRR,rs2292596,methotrexate,Efficacy
1059,ABCB1,rs1045642,tramadol,Toxicity
1060,SLCO1B1,rs4149056,Glucarpidase,Toxicity


In [67]:
# Fetch the reviewed human UniProt entry (accession + sequence) for every gene
# in the evidence table and save the result as a CSV.
#
# Changes compared with the previous version:
#   * the HTTP call is inside the try block and failures are caught through
#     requests.exceptions.RequestException; the old `except SSLError` named a
#     symbol that was never imported and could not see the request anyway;
#   * a timeout keeps one stalled request from hanging the whole loop;
#   * the gene is matched against a whole 'GN=<gene>' header token, so e.g.
#     'GN=UGT1A1' no longer matches an entry tagged 'GN=UGT1A10';
#   * the sequence is everything after the header line, instead of slicing four
#     characters off the last header token (which assumed a fixed-width 'SV=N');
#   * rows are collected in a list (DataFrame.append was removed in pandas 2.0).
UNIPROT_QUERY_URL = 'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20'

gene_list = evidence_table['gene'].unique()
gene_records = []
for gene in tqdm(gene_list):
    try:
        response = requests.get(UNIPROT_QUERY_URL + gene, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        print(gene, error)
        continue

    # One list element per FASTA record (each record starts with '>').
    fasta_list = re.split(r'\n(?=>)', response.text)
    gene_tag = 'GN=' + gene
    entry = next(
        (record for record in fasta_list if gene_tag in record.split('\n', 1)[0].split()),
        None,
    )
    if entry is None:
        # No reviewed human entry is tagged with exactly this gene name.
        print(gene)
        continue

    header, _, sequence = entry.partition('\n')
    uniprotac = header.split('|')[1]
    fasta = ''.join(sequence.split())
    gene_records.append({'gene': gene, 'uniprotac': uniprotac, 'fasta': fasta})

gene_table = pd.DataFrame(gene_records, columns=['gene', 'uniprotac', 'fasta'])
print(gene_table)
gene_table.to_csv('../datasets/middlefile/pharmgkb_gene_table.csv', index=None)


 15%|█▍        | 38/262 [01:26<09:26,  2.53s/it]

CD3EAP


 95%|█████████▍| 248/262 [11:30<01:41,  7.23s/it]

CYP2A7P1


100%|██████████| 262/262 [12:00<00:00,  2.75s/it]

       gene uniprotac                                              fasta
0      DPYD    Q12882  MAPVLSKDSADIESILALNPRTQTHATLCSTSAKKLDKKHWKRNPD...
1    CYP4F2    P78329  MSQLSLSWLGLWPVAASPWLLLLLVGASWLLAHVLAWTYAFYDNCR...
2      EGFR    P00533  MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
3    CYP2B6    P20813  MELSVLLFLALLTGLLLLLVQRHPNTHDRLPPGPRPLPLLGNLLQM...
4    NUDT15    Q9NV35  MTASAQPRGRRPGVGVGVVVTSCKHPRCVLLGKRKGSVGAGSFQLP...
..      ...       ...                                                ...
255  MTHFD1    P11586  MAPAEILNGKEISAQIRARLKNQVTQLKEQVPGFTPRLAILQVGNR...
256   ADH1C    P00326  MSTAGKVIKCKAAVLWELKKPFSIEEVEVAPPKAHEVRIKMVAAGI...
257  SRD5A2    P31213  MQVQCQQSPVLAGSATLVALGALALYVAKPSGYGKHTESLKPAATR...
258    AHRR    A9YTQ3  MPRTMIPPGECTYAGRKRRRPLQKQRPAVGAEKSNPSKRHRDRLNA...
259   VEGFB    P49765  MSPLLRRLLLAALLQLAPAQAPVSQPDAPGHQRKVVSWIDVYTRAT...

[260 rows x 3 columns]





In [78]:
# Manually curated variants added to variant_table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): these NPid values carry no ':p.<change>' suffix, unlike the
# parsed rows - confirm that downstream consumers do not rely on it.
manual_rows = pd.DataFrame([
    {'gene': 'UGT1A10', 'rsid': 'rs6759892', 'variant': 'S7A', 'NPid': 'NP_001063.2'},
    {'gene': 'UGT1A4', 'rsid': 'rs6755571', 'variant': 'P24S', 'NPid': 'NP_009051.1'},
    {'gene': 'UGT2B7', 'rsid': 'rs61361928', 'variant': 'L46P', 'NPid': 'NP_001065.2'},
])
variant_table = (
    pd.concat([variant_table, manual_rows], ignore_index=True)
    .drop_duplicates()  # keeps the cell idempotent when it is re-run
    .reset_index(drop=True)
)
variant_table

Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,GRK4,rs1024323,A142V,NP_001004057.1:p.Ala142Val
2,CYP2C9,rs1029359343,R307K,NP_000762.2:p.Arg307Lys
3,GLP1R,rs10305420,P7L,NP_002053.3:p.Pro7Leu
4,ZNF568,rs10405238,Y488D,NP_001191767.1:p.Tyr488Asp
...,...,...,...,...
517,MYLIP,rs9370867,N342I,NP_037394.2:p.Asn342Ile
518,CD68,rs9901675,A350T,NP_001242.2:p.Ala350Thr
519,UGT1A10,rs6759892,S7A,NP_001063.2
520,UGT1A4,rs6755571,P24S,NP_009051.1


In [79]:
# Persist both intermediate tables as CSV files.
for _frame, _csv_path in (
    (evidence_table, '../datasets/middlefile/pharmgkb_evidence_table.csv'),
    (variant_table, '../datasets/middlefile/pharmgkb_variant_table.csv'),
):
    _frame.to_csv(_csv_path, index=None)

In [80]:
# Build evidence_table_new: the evidence rows with the protein-level change
# (looked up by rsID in variant_table) in 'variant', the original rsID in
# 'rsid', and 'chemicals' renamed to 'drugs'.
#
# Changes compared with the row-by-row version:
#   * one vectorised lookup instead of a filter + DataFrame.append per row
#     (DataFrame.append was removed in pandas 2.0);
#   * an rsID missing from variant_table is reported by name instead of
#     surfacing as a bare IndexError from `.values[0]`;
#   * the builtin `type` is no longer shadowed.
# When an rsID occurs more than once in variant_table the first row wins,
# matching the previous `.values[0]`.
rsid_to_variant = variant_table.drop_duplicates(subset='rsid').set_index('rsid')['variant']
protein_change = evidence_table['variant'].map(rsid_to_variant)

unmatched_rsids = evidence_table.loc[protein_change.isna(), 'variant'].unique().tolist()
if unmatched_rsids:
    raise KeyError(f'rsIDs missing from variant_table: {unmatched_rsids}')

evidence_table_new = pd.DataFrame({
    'gene': evidence_table['gene'],
    'variant': protein_change,
    'rsid': evidence_table['variant'],
    'drugs': evidence_table['chemicals'],
    'type': evidence_table['type'],
}).reset_index(drop=True)
print(evidence_table_new)
# Writes to the same path the rsID-level evidence table was saved to earlier.
evidence_table_new.to_csv('../datasets/middlefile/pharmgkb_evidence_table.csv', index=None)

         gene variant        rsid  \
0        DPYD   M166V   rs2297595   
1        DPYD    C29R   rs1801265   
2        DPYD   V732I   rs1801160   
3        DPYD   I543V   rs1801159   
4        DPYD   S534N   rs1801158   
...       ...     ...         ...   
1057    ABCB1  I1145M   rs1045642   
1058     AHRR   P185A   rs2292596   
1059    ABCB1  I1145M   rs1045642   
1060  SLCO1B1   V174A   rs4149056   
1061    VEGFB   D136E  rs12366035   

                                             drugs      type  
0                                     capecitabine  Toxicity  
1                                     capecitabine  Toxicity  
2                                     capecitabine  Toxicity  
3                                     capecitabine  Toxicity  
4                                     fluorouracil  Toxicity  
...                                            ...       ...  
1057  highly active antiretroviral therapy (haart)  Efficacy  
1058                                  methotrexate 

In [71]:
# One-letter amino-acid code -> integer index (0..19), in the order given below.
# NOTE: this rebinds `protein_dict`, which an earlier cell defines as a
# three-letter -> one-letter map; re-running the synonym-parsing cell after
# this one would hit KeyError for every variant.
protein_dict = {residue: index for index, residue in enumerate('CDSQKIPTFNGHLRWAVEYM')}

In [84]:
# fix variant table based on sequence
variant_table.loc[variant_table['rsid']=='rs1042713','variant'] = 'G16R'
variant_table.loc[variant_table['rsid']=='rs71647871','variant'] = 'G143E'
variant_table.loc[variant_table['rsid']=='rs671','variant'] = 'E504K'
variant_table.loc[variant_table['rsid']=='rs396991','variant'] = 'F281I'
variant_table.loc[variant_table['rsid']=='rs20455','variant'] = 'W719R'

In [86]:
# Save the corrected variant table, replacing the earlier CSV at this path.
variant_table.to_csv(
    path_or_buf='../datasets/middlefile/pharmgkb_variant_table.csv',
    index=None,
)

In [None]:
# Report evidence rows whose protein change disagrees with the gene's sequence:
#   'pos_after'  - the replacement residue is not a key of protein_dict
#   'pos_before' - the sequence holds a different residue at that position
#   'IndexError' - the position lies beyond the end of the sequence
gene_table_manu = pd.read_csv('../datasets/middlefile/pharmgkb_gene_table_fixed.csv')
for gene, variant in zip(evidence_table_new['gene'], evidence_table_new['variant']):
    same_gene = gene_table_manu['gene'] == gene
    fasta = gene_table_manu.loc[same_gene, 'fasta'].iloc[0]
    # A variant reads <reference residue><1-based position><replacement residue>.
    pos_before, pos, pos_after = variant[0], int(variant[1:-1]), variant[-1]

    if pos_after not in protein_dict:
        print('pos_after', gene, variant)

    try:
        residue_in_sequence = fasta[pos - 1]
    except IndexError:
        print('IndexError', gene, variant)
    else:
        if residue_in_sequence != pos_before:
            print('pos_before', gene, variant)

In [92]:
# Remove evidence rows whose protein change is inconsistent with the gene's
# sequence in the manually fixed gene table.
#
# Changes compared with the loop-and-refilter version:
#   * a gene missing from the gene table no longer raises IndexError; its rows
#     cannot be validated and are removed like any other inconsistent row;
#   * position 0 no longer wraps around to the last residue (fasta[-1]);
#   * a non-numeric position is treated as inconsistent instead of raising
#     ValueError;
#   * each row is checked once instead of re-filtering the frame per failure.
gene_table_manu = pd.read_csv('../datasets/middlefile/pharmgkb_gene_table_fixed.csv')
evidence_table_new = pd.read_csv('../datasets/middlefile/pharmgkb_evidence_table.csv')


def _variant_matches_sequence(variant, fasta):
    """Return True when `variant` (e.g. 'M166V') is consistent with `fasta`.

    A variant is consistent when its last character is a key of
    `protein_dict`, its middle part is a 1-based position inside `fasta`, and
    the residue found at that position equals the variant's first character.
    Non-string inputs (for example NaN for an unknown gene) return False.
    """
    if not isinstance(variant, str) or not isinstance(fasta, str) or len(variant) < 3:
        return False
    pos_before, pos_text, pos_after = variant[0], variant[1:-1], variant[-1]
    if pos_after not in protein_dict:
        return False
    if not (pos_text.isascii() and pos_text.isdigit()):
        return False
    pos = int(pos_text)
    return 1 <= pos <= len(fasta) and fasta[pos - 1] == pos_before


# First sequence listed per gene, matching the previous `.values[0]` lookup.
fasta_by_gene = (
    gene_table_manu.dropna(subset=['gene'])
    .drop_duplicates(subset='gene')
    .set_index('gene')['fasta']
)
row_fastas = evidence_table_new['gene'].map(fasta_by_gene)
is_consistent = pd.Series(
    [
        _variant_matches_sequence(variant, fasta)
        for variant, fasta in zip(evidence_table_new['variant'], row_fastas)
    ],
    index=evidence_table_new.index,
    dtype=bool,
)
fix_evidence_df = evidence_table_new[is_consistent].reset_index(drop=True)
fix_evidence_df

Unnamed: 0,gene,variant,rsid,drugs,type
0,DPYD,M166V,rs2297595,capecitabine,Toxicity
1,DPYD,C29R,rs1801265,capecitabine,Toxicity
2,DPYD,V732I,rs1801160,capecitabine,Toxicity
3,DPYD,I543V,rs1801159,capecitabine,Toxicity
4,DPYD,S534N,rs1801158,fluorouracil,Toxicity
...,...,...,...,...,...
902,ABCC4,K1116N,rs1751034,tenofovir,Metabolism/PK
903,ABCB1,I1145M,rs1045642,highly active antiretroviral therapy (haart),Efficacy
904,ABCB1,I1145M,rs1045642,tramadol,Toxicity
905,SLCO1B1,V174A,rs4149056,Glucarpidase,Toxicity


#### Part 3: drug filtering

In [95]:
# Distinct drug names that remain after the sequence check, plus their count;
# the filtered evidence table is also written back to the evidence CSV path.
drug_unique_list = fix_evidence_df['drugs'].unique()
fix_evidence_df.to_csv(
    '../datasets/middlefile/pharmgkb_evidence_table.csv',
    index=None,
)
print(len(drug_unique_list))

253


In [96]:
# Drug names for which a PubChem lookup by name returns no compound.
error_list = [
    drug_name
    for drug_name in tqdm(drug_unique_list)
    if len(pcp.get_compounds(drug_name, 'name')) == 0
]
error_list

100%|██████████| 253/253 [05:43<00:00,  1.36s/it]


['hormonal contraceptives for systemic use',
 'hmg coa reductase inhibitors',
 'diuretics',
 'trastuzumab',
 'Measles vaccines',
 'antipsychotics',
 'pitrakinra',
 'rituximab',
 'egfr inhibitors',
 'Tumor necrosis factor alpha (TNF-alpha) inhibitors',
 'eculizumab',
 'Selective serotonin reuptake inhibitors',
 'bevacizumab',
 'corticosteroids',
 'antidepressants',
 'photodynamic therapy',
 'Opium alkaloids and derivatives',
 'Antibiotics',
 'cetuximab',
 'gemtuzumab ozogamicin',
 'Beta Blocking Agents',
 'Antithyroid Preparations',
 'ustekinumab',
 'Drugs Used In Diabetes',
 'antineoplastic agents',
 'etanercept',
 'glucocorticoids',
 'botulinum toxin type a',
 'anthracyclines and related substances',
 'highly active antiretroviral therapy (haart)']