#### Imports

In [43]:
import os
import json
import pandas as pd

#### Open dictionary and select small molecules

In [18]:
# Load the full list of drug records from the JSON dump one directory up.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open(os.path.join('..', 'full_database_dict.json'), 'r', encoding='utf-8') as json_file:
    drug_list = json.load(json_file)

In [42]:
# Peek at the '@type' tag of the first loaded record.
drug_list[0]['@type']

'biotech'

In [26]:
# Keep only the records tagged as small molecules, preserving their order.
small_drug_list = [
    record for record in drug_list
    if record['@type'] == 'small molecule'
]

In [27]:
# Number of records that passed the small-molecule filter.
len(small_drug_list)

12227

In [41]:
# Sanity check: the first filtered record should carry the small-molecule tag.
small_drug_list[0]['@type']

'small molecule'

#### Extract properties to a new dictionary

In [29]:
# Flatten every small-molecule record into `my_dict`, a mapping of
# column name -> list of values holding exactly one entry per drug.

# Nested sections that are not copied as-is: 'calculated-properties' and
# 'atc-codes' are expanded into dedicated columns below, the others are dropped.
NESTED_SECTIONS = (
    'calculated-properties',
    'categories',
    'external-identifiers',
    'products',
    'ahfs-codes',
    'international-brands',
    'atc-codes',
)

# Number of ATC levels that get their own code / label column pair.
ATC_LEVEL_COUNT = 4

# Top-level fields of which only the first nested item is kept.
FIRST_ITEM_EXTRACTORS = {
    'drugbank-id': lambda ids: ids[0]['$'],
    'groups': lambda groups: groups['group'][0],
    'manufacturers': lambda makers: makers['manufacturer'][0]['$'],
}

# --- Column layout ---------------------------------------------------------
# The layout comes from the record at index 1 (the second small molecule,
# not the first). NOTE(review): presumably chosen because it has a populated
# 'calculated-properties' section -- confirm.
template_drug = small_drug_list[1]
top_level_columns = [key for key in template_drug if key not in NESTED_SECTIONS]
property_kinds = [
    prop['kind'] for prop in template_drug['calculated-properties']['property']
]

my_dict = {column: [] for column in top_level_columns}
for kind in property_kinds:
    my_dict[kind] = []
my_dict['atc_code'] = []
for level_idx in range(ATC_LEVEL_COUNT):
    my_dict[f'atc_code_{level_idx}'] = []
    my_dict[f'atc_code_{level_idx}_$'] = []

known_kinds = set(property_kinds)

# --- One row per drug ------------------------------------------------------
for mol in small_drug_list:
    # Start from an all-None row: anything the drug lacks stays None, and every
    # column receives exactly one value per drug (no after-the-fact padding).
    row = dict.fromkeys(my_dict)

    for column in top_level_columns:
        if column not in mol:
            continue
        value = mol[column]
        extractor = FIRST_ITEM_EXTRACTORS.get(column)
        if extractor is not None and value is not None:
            value = extractor(value)
        row[column] = value

    calculated = mol.get('calculated-properties')
    if calculated is not None:
        filled_kinds = set()
        for prop in calculated['property']:
            kind = prop['kind']
            # Kinds absent from the template have no column; when a kind is
            # repeated within one drug, the first value wins.
            if kind in known_kinds and kind not in filled_kinds:
                filled_kinds.add(kind)
                row[kind] = prop['value']

    atc_codes = mol.get('atc-codes')
    if atc_codes is not None:
        # Only the first ATC code of a drug is recorded.
        first_code = atc_codes['atc-code'][0]
        row['atc_code'] = first_code['@code']
        for level_idx, level in enumerate(first_code['level']):
            row[f'atc_code_{level_idx}'] = level['@code']
            row[f'atc_code_{level_idx}_$'] = level['$']

    # Iterating over the row (not over my_dict) makes an unexpected extra key,
    # e.g. an ATC level beyond ATC_LEVEL_COUNT, fail loudly with a KeyError.
    for column, value in row.items():
        my_dict[column].append(value)

# Every column must line up with the input before a DataFrame is built from it.
assert all(len(values) == len(small_drug_list) for values in my_dict.values()), \
    'column lengths diverged from the number of drugs'
                    

In [37]:
# Turn the column -> values mapping into a table (one column per key).
drug_df = pd.DataFrame.from_dict(my_dict)

In [44]:
# List the column names of the flattened table.
drug_df.columns

Index(['@type', '@created', '@updated', 'drugbank-id', 'name', 'description',
       'cas-number', 'unii', 'average-mass', 'monoisotopic-mass', 'state',
       'groups', 'general-references', 'synthesis-reference', 'indication',
       'pharmacodynamics', 'mechanism-of-action', 'toxicity', 'metabolism',
       'absorption', 'half-life', 'protein-binding', 'route-of-elimination',
       'volume-of-distribution', 'clearance', 'salts', 'synonyms', 'mixtures',
       'packagers', 'manufacturers', 'prices', 'affected-organisms', 'dosages',
       'pdb-entries', 'fda-label', 'patents', 'food-interactions',
       'drug-interactions', 'experimental-properties', 'external-links',
       'pathways', 'reactions', 'snp-effects', 'snp-adverse-drug-reactions',
       'targets', 'enzymes', 'carriers', 'transporters', 'logP', 'logS',
       'Water Solubility', 'IUPAC Name', 'Traditional IUPAC Name',
       'Molecular Weight', 'Monoisotopic Weight', 'SMILES',
       'Molecular Formula', 'InChI', 'InCh

In [34]:
# Write the flattened table to a CSV file in the working directory,
# leaving out the row index column.
output_csv = 'drugbank_dataframe.csv'
drug_df.to_csv(output_csv, index=False)