In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write JSON files
# under /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


Complete code

In [None]:
import json
import urllib.request

def search(list_dict, toCHeading):
    """Find the position of the section whose 'TOCHeading' equals ``toCHeading``.

    Args:
        list_dict: Iterable of dicts, each normally carrying a 'TOCHeading' key.
        toCHeading: Heading value to look for.

    Returns:
        The zero-based index of the first matching entry, or None when no
        entry matches.
    """
    for idx, d in enumerate(list_dict):
        # Entries that lack a 'TOCHeading' key are skipped rather than
        # raising KeyError, so one odd section cannot break the lookup.
        if 'TOCHeading' in d and d['TOCHeading'] == toCHeading:
            return idx
    # Make the "not found" result explicit instead of falling off the end.
    return None

def get_property(data, property_name):
    """Read one computed property out of a downloaded compound record.

    The lookup path is ``data['Record']['Section']`` -> the section headed
    'Chemical and Physical Properties' -> its subsection headed
    'Computed Properties' -> the entry headed ``property_name``, from which
    the 'Value' of the first 'Information' item is taken.

    Args:
        data: Parsed JSON record (nested dicts/lists) for one compound.
        property_name: 'TOCHeading' of the property to extract.

    Returns:
        The first element of the value's 'Number' list when present,
        otherwise the 'String' of its first 'StringWithMarkup' item.
        The string "Property not available" is returned when any step of
        the path is missing or the value has neither representation.
    """
    try:
        top_sections = data['Record']['Section']
        idx_1 = search(top_sections, 'Chemical and Physical Properties')
        chem_sections = top_sections[idx_1]['Section']
        idx_2 = search(chem_sections, 'Computed Properties')
        computed_sections = chem_sections[idx_2]['Section']
        idx_3 = search(computed_sections, property_name)
        value = computed_sections[idx_3]['Information'][0]['Value']
        if 'Number' in value:
            return value['Number'][0]
        if 'StringWithMarkup' in value:
            return value['StringWithMarkup'][0]['String']
    except (KeyError, IndexError, TypeError):
        # KeyError: a level of the record is missing; IndexError: an empty
        # list; TypeError: search() found nothing and its None result was
        # used as a list index. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate.
        return "Property not available"
    # The value exists but is in neither known representation; report it the
    # same way instead of silently returning None.
    return "Property not available"
def download_chem(id, timeout=30):
    """Download and parse the PubChem PUG-View JSON record for one compound.

    Args:
        id: Compound identifier (CID) inserted into the request URL.
        timeout: Seconds to wait on the network before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON document on success. On failure, a string of the
        form "Error fetching data for CID: <id>, Error: <details>"; callers
        tell the two apart with ``isinstance(result, str)``.
    """
    url_string = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id}/JSON/?response_type=display"
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url_string, timeout=timeout) as response:
            payload = response.read().decode('utf-8')
        return json.loads(payload)
    except (OSError, ValueError) as e:
        # OSError covers urllib's URLError/HTTPError and socket timeouts;
        # ValueError covers malformed JSON and undecodable bytes. All are
        # reported through the same error-string channel.
        return f"Error fetching data for CID: {id}, Error: {str(e)}"

def get_properties(cid):
    """Download a compound record and collect its computed properties.

    Args:
        cid: Compound identifier passed to ``download_chem``.

    Returns:
        The error string produced by ``download_chem`` when the download
        failed; otherwise a dict mapping output field names to the values
        returned by ``get_property``.
    """
    record = download_chem(cid)
    # A string result is download_chem's error message; hand it back as-is.
    if isinstance(record, str):
        return record

    # (output key, heading looked up in the record), in output order.
    wanted = (
        ("complexity", 'Complexity'),
        ("molecular_weight", 'Molecular Weight'),
        ("hydrogen_bond_donor_count", 'Hydrogen Bond Donor Count'),
        ("hydrogen_bond_acceptor_count", 'Hydrogen Bond Acceptor Count'),
        ("topological_polar_surface_area", 'Topological Polar Surface Area'),
        ("xlogp", 'XLogP3'),
    )
    return {field: get_property(record, heading) for field, heading in wanted}

# Small sample of compound IDs (CIDs, as strings) used to try out the lookup
missing_cids = ['294', '1151', '2026']

# Fetch and print the result for each CID in the list. get_properties returns
# either a dict of properties or an error string; both are printed unchanged.
for cid in missing_cids:
    properties = get_properties(cid)
    print(f"Properties for CID {cid}: {properties}")


Properties for CID 294: {'complexity': 382, 'molecular_weight': '342.30', 'hydrogen_bond_donor_count': 8, 'hydrogen_bond_acceptor_count': 11, 'topological_polar_surface_area': 190, 'xlogp': -4.7}
Properties for CID 1151: {'complexity': 853, 'molecular_weight': '482.17', 'hydrogen_bond_donor_count': 6, 'hydrogen_bond_acceptor_count': 14, 'topological_polar_surface_area': 239, 'xlogp': -5}
Properties for CID 2026: {'complexity': 886, 'molecular_weight': '576.6', 'hydrogen_bond_donor_count': 7, 'hydrogen_bond_acceptor_count': 11, 'topological_polar_surface_area': 210, 'xlogp': 0.2}


Fetching the properties from PubChem and saving them in the specified format

In [None]:
import json
import urllib.request
import time

def search(list_dict, toCHeading):
    """Locate the first entry whose 'TOCHeading' equals ``toCHeading``.

    Args:
        list_dict: Iterable of dicts, each normally carrying a 'TOCHeading' key.
        toCHeading: Heading value to look for.

    Returns:
        The zero-based index of the first match, or None if nothing matches.
    """
    # Entries without a 'TOCHeading' key are ignored instead of raising
    # KeyError, so a single irregular section cannot abort a long crawl.
    matches = (
        position
        for position, entry in enumerate(list_dict)
        if 'TOCHeading' in entry and entry['TOCHeading'] == toCHeading
    )
    return next(matches, None)

def get_property(data, property_name):
    """Read one computed property out of a downloaded compound record.

    The lookup path is ``data['Record']['Section']`` -> the section headed
    'Chemical and Physical Properties' -> its subsection headed
    'Computed Properties' -> the entry headed ``property_name``, from which
    the 'Value' of the first 'Information' item is taken.

    Args:
        data: Parsed JSON record (nested dicts/lists) for one compound.
        property_name: 'TOCHeading' of the property to extract.

    Returns:
        The first element of the value's 'Number' list when present,
        otherwise the 'String' of its first 'StringWithMarkup' item.
        When the lookup cannot be completed, one of the placeholder strings
        "Section not available", "Subsection not available",
        "Property not available" or "Format not recognized" is returned,
        depending on the step that failed.
    """
    try:
        top_sections = data['Record']['Section']
    except (KeyError, TypeError):
        # The record has no top-level section list at all.
        return "Section not available"

    idx_1 = search(top_sections, 'Chemical and Physical Properties')
    if idx_1 is None:
        return "Section not available"

    # A section without nested 'Section' entries is treated as empty.
    chem_sections = top_sections[idx_1].get('Section') or []
    idx_2 = search(chem_sections, 'Computed Properties')
    if idx_2 is None:
        return "Subsection not available"

    computed_sections = chem_sections[idx_2].get('Section') or []
    idx_3 = search(computed_sections, property_name)
    if idx_3 is None:
        return "Property not available"

    try:
        property_info = computed_sections[idx_3]['Information'][0]['Value']
        if 'Number' in property_info:
            return property_info['Number'][0]
        if 'StringWithMarkup' in property_info:
            return property_info['StringWithMarkup'][0]['String']
    except (KeyError, IndexError, TypeError):
        # The entry exists but its value is missing, empty, or oddly shaped;
        # report it as unrecognised rather than raising mid-crawl.
        return "Format not recognized"
    return "Format not recognized"

def download_chem(id, timeout=30):
    """Download and parse the PubChem PUG-View JSON record for one compound.

    Args:
        id: Compound identifier (CID) inserted into the request URL.
        timeout: Seconds to wait on the network before giving up, so one
            stalled request cannot block the rest of the crawl.

    Returns:
        The decoded JSON document on success. On failure, a string of the
        form "Error fetching data for CID: <id>, Error: <details>"; callers
        tell the two apart with ``isinstance(result, str)``.
    """
    url_string = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id}/JSON/?response_type=display"
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url_string, timeout=timeout) as response:
            payload = response.read().decode('utf-8')
        return json.loads(payload)
    except (OSError, ValueError) as e:
        # OSError covers urllib's URLError/HTTPError and socket timeouts;
        # ValueError covers malformed JSON and undecodable bytes. All are
        # reported through the same error-string channel.
        return f"Error fetching data for CID: {id}, Error: {str(e)}"

def get_properties(cid):
    """Download a compound record and build its property entry.

    Args:
        cid: Compound identifier passed to ``download_chem``.

    Returns:
        A dict that always contains "cid". When the download failed it also
        holds the error text under "error"; otherwise it holds one key per
        extracted property, with values as returned by ``get_property``.
    """
    record = download_chem(cid)
    # A string result is download_chem's error message, not a record.
    if isinstance(record, str):
        return {"cid": cid, "error": record}

    entry = {"cid": cid}
    # (output key, heading looked up in the record), in output order.
    for field, heading in (
        ("cmpdname", 'IUPAC Name'),
        ("mw", 'Molecular Weight'),
        ("complexity", 'Complexity'),
        ("hbonddonor", 'Hydrogen Bond Donor Count'),
        ("hbondacc", 'Hydrogen Bond Acceptor Count'),
        ("polararea", 'Topological Polar Surface Area'),
        ("xlogp", 'XLogP3'),
    ):
        entry[field] = get_property(record, heading)
    return entry

# Predefined list of missing CIDs (compound IDs kept as strings); each one is
# passed to get_properties by the loop below.
missing_cids= ['294', '1151', '2026', '2459', '2608', '2998', '4243', '5595', '19958', '32731', '36207', '40073', '56068', '60730', '62074', '66273', '66370', '72421', '86370', '125782', '151164', '171283', '198736', '199402', '208909', '213027', '228400', '426058', '439341', '441190', '441362', '452254', '453618', '519316', '656684', '1549144', '2723733', '3037933', '3047796', '3246540', '3465281', '3829279', '4469989', '4655349', '5287971', '5293655', '5311491', '5353563', '5375083', '5459124', '5484736', '5486549', '5489436', '5702059', '5702060', '5702224', '6035169', '6093416', '6333901', '6335620', '6419722', '6473878', '6708740', '6917855', '6918047', '6918627', '6971047', '6991966', '7079600', '7098673', '9573169', '9692361', '9808655', '9832798', '9882672', '9886567', '9888108', '9936012', '9977421', '9995070', '10019578', '10068406', '10106002', '10423777', '10569111']

# Rate limiting setup
rate_limit = 1  # Number of seconds to wait between API calls

# Fetch and collect properties for each CID in the list.
# Each element is the dict returned by get_properties, which carries an
# "error" key instead of property fields when the download failed.
properties_list = []
for cid in missing_cids:
    properties = get_properties(cid)
    properties_list.append(properties)
    print(f"Processed CID {cid}")
    time.sleep(rate_limit)  # Pause to respect potential API rate limits

# Write to JSON file in the specified Google Drive folder.
# NOTE(review): the file is written only after the whole loop finishes, so an
# exception part-way through the crawl leaves nothing saved.
path = '/content/drive/MyDrive/FINAL_missing_cids_properties.json'
with open(path, 'w') as f:
    json.dump(properties_list, f, indent=4)

print(f"JSON file with missing CIDs properties has been created at {path}.")


Processed CID 294
Processed CID 1151
Processed CID 2026
Processed CID 2459
Processed CID 2608
Processed CID 2998
Processed CID 4243
Processed CID 5595
Processed CID 19958
Processed CID 32731
Processed CID 36207
Processed CID 40073
Processed CID 56068
Processed CID 60730
Processed CID 62074
Processed CID 66273
Processed CID 66370
Processed CID 72421
Processed CID 86370
Processed CID 125782
Processed CID 151164
Processed CID 171283
Processed CID 198736
Processed CID 199402
Processed CID 208909
Processed CID 213027
Processed CID 228400
Processed CID 426058
Processed CID 439341
Processed CID 441190
Processed CID 441362
Processed CID 452254
Processed CID 453618
Processed CID 519316
Processed CID 656684
Processed CID 1549144
Processed CID 2723733
Processed CID 3037933
Processed CID 3047796
Processed CID 3246540
Processed CID 3465281
Processed CID 3829279
Processed CID 4469989
Processed CID 4655349
Processed CID 5287971
Processed CID 5293655
Processed CID 5311491
Processed CID 5353563
Process

Merging the properties fetched for the missing CIDs into the existing compound property list

In [None]:
import json

def load_json_file(filepath):
    """Read ``filepath`` as text and return the JSON value it contains."""
    with open(filepath) as source:
        parsed = json.load(source)
    return parsed

def save_json_file(data, filepath):
    """Serialize ``data`` as JSON into ``filepath``, indented by four spaces.

    Any existing file at ``filepath`` is overwritten.
    """
    with open(filepath, mode='w') as sink:
        json.dump(obj=data, fp=sink, indent=4)

# File paths
# new_data_path is the file written by the crawl cell above.
new_data_path = '/content/drive/MyDrive/FINAL_missing_cids_properties.json'
existing_data_path = '/content/drive/MyDrive/NEW_augmented_124k_PubChem_compound_list.json'
merged_data_path = '/content/drive/MyDrive/FINAL_Merged_PubChem_compound_list.json'  # Path for the merged file

# Load data from both JSON files
new_data = load_json_file(new_data_path)
existing_data = load_json_file(existing_data_path)

# Merge the data
# NOTE(review): `+` assumes existing_data is a JSON list like new_data — confirm.
# No de-duplication or filtering happens here, so entries that only carry an
# "error" key from a failed download are merged in as well.
merged_data = existing_data + new_data  # Appends new data to the existing list

# Save the merged data to a new file
save_json_file(merged_data, merged_data_path)

print(f"Merged data has been saved to {merged_data_path}.")


Merged data has been saved to /content/drive/MyDrive/FINAL_Merged_PubChem_compound_list.json.
