In [2]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_descriptors(smiles_string):
    """Compute every descriptor in ``Descriptors.descList`` for one molecule.

    Args:
        smiles_string: SMILES representation of the molecule.

    Returns:
        dict mapping each descriptor name to its computed value, in the
        order of ``Descriptors.descList``. A descriptor whose function raises
        is stored as ``None``. If the SMILES string cannot be parsed, every
        descriptor is ``None``.
    """
    # Convert the SMILES string to a molecule object
    molecule = Chem.MolFromSmiles(smiles_string)

    # MolFromSmiles signals a parse failure by returning None; short-circuit
    # instead of raising and catching one exception per descriptor.
    if molecule is None:
        return {descriptor_name: None for descriptor_name, _ in Descriptors.descList}

    # Calculate all available descriptors
    descriptors = {}
    for descriptor_name, descriptor_fn in Descriptors.descList:
        try:
            descriptors[descriptor_name] = descriptor_fn(molecule)
        except Exception:
            # Best effort: one failing descriptor must not lose the others.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so long runs can still be interrupted.
            descriptors[descriptor_name] = None

    return descriptors

In [3]:
# Example input: one SMILES string used to try out calculate_descriptors.
# NOTE: the resulting module-level `descriptors` dict is also read by
# create_db_and_table (for its column names), so this cell must be run
# before that function is called.
smiles_string = "CC(=O)OC1=CC=CC=C1C(=O)O"
descriptors = calculate_descriptors(smiles_string)

In [4]:
# Column layout of the metadata table: the two identifier columns followed by
# one column per descriptor name (iterating a dict yields its keys in order).
table_columns = ["mol_molytica_id", "mol_canonical_smiles", *descriptors]

In [5]:
# Display the descriptor values computed for the example molecule.
descriptors.values()

dict_values([10.611948223733938, 10.611948223733938, 0.01601851851851821, -1.1140277777777776, 0.5501217966938848, 9.307692307692308, 180.15899999999996, 172.09499999999997, 180.042258736, 68, 0, 0.33900378687731025, -0.4775395271554559, 0.4775395271554559, 0.33900378687731025, 1.3076923076923077, 1.9230769230769231, 2.4615384615384617, 16.53611122125433, 10.182282381035343, 2.104306980957856, -2.0311320919470135, 2.1698345568128055, -2.063000061964297, 5.913065796110142, -0.13141434244030561, 2.0325369160995868, 3.0435273546341013, 343.2228677267164, 9.844934982691242, 6.9813595436500515, 6.9813595436500515, 6.109060905280622, 3.6174536478673316, 3.6174536478673316, 2.3949556783206725, 2.3949556783206725, 1.3711546649445037, 1.3711546649445037, 0.8871712192374142, 0.8871712192374142, -1.8399999999999999, 729.6807528797516, 9.249605734767023, 3.7092512583454584, 2.297415032519928, 74.75705264447721, 9.843390348640755, 11.3129633249809, 0.0, 0.0, 0.0, 11.938610575903699, 4.7945371840718

In [6]:
# Number of descriptors computed for the example molecule.
len(descriptors)

210

In [7]:
def add_mol_desc_to_db(smiles, mol_ids, batch_descriptors, target_output_path):
    """Placeholder that does nothing and returns ``None``.

    A later cell in this notebook defines a function with the same name and
    signature that performs the actual insertion; once that cell has run, it
    replaces this stub.
    """

In [8]:
from concurrent.futures import ProcessPoolExecutor
import json, os
from tqdm import tqdm
import sqlite3

def create_db_and_table(path="data/curated_chembl/SMILES_metadata.db", descriptor_names=None):
    """Create the SQLite database at ``path`` with an empty ``mol_metadata`` table.

    The table has the columns ``mol_molytica_id INTEGER`` and
    ``mol_canonical_smiles TEXT`` followed by one ``REAL`` column per
    descriptor name, in iteration order.

    Args:
        path: Location of the SQLite database file.
        descriptor_names: Iterable of names for the descriptor columns. When
            omitted, the keys of the module-level ``descriptors`` dict are
            used (the previous, implicit behavior), so that dict must exist.

    Raises:
        sqlite3.OperationalError: if the database cannot be opened or the
            ``mol_metadata`` table already exists.
    """
    if descriptor_names is None:
        # Backward-compatible fallback to the notebook-level example result.
        descriptor_names = descriptors.keys()

    # Build the column list up front so that an empty descriptor list still
    # yields valid SQL (no dangling comma). Identifiers are double-quoted
    # (embedded quotes doubled) so unusual descriptor names cannot break or
    # alter the statement.
    column_defs = ["mol_molytica_id INTEGER", "mol_canonical_smiles TEXT"]
    column_defs.extend(
        '"{}" REAL'.format(str(name).replace('"', '""')) for name in descriptor_names
    )
    sql_command = "CREATE TABLE mol_metadata (\n    {}\n);".format(",\n    ".join(column_defs))

    conn = sqlite3.connect(path)
    try:
        conn.execute(sql_command)
        conn.commit()
    finally:
        # Always release the file handle, even when table creation fails.
        conn.close()


def add_mol_desc_to_db(smiles, mol_ids, batch_descriptors, target_output_path):
    """Insert one batch of molecules and their descriptors into ``mol_metadata``.

    Rows are written positionally, so the descriptor values must be in the
    same order as the descriptor columns of the table.

    Args:
        smiles: SMILES strings, one per molecule.
        mol_ids: Molecule ids, aligned with ``smiles``.
        batch_descriptors: One entry per molecule, aligned with ``smiles``.
            Each entry is either a dict of descriptor name -> value (its
            values are inserted, in dict order) or a plain sequence of values.
        target_output_path: Directory containing ``SMILES_metadata.db``.

    Returns:
        None. An empty batch is a no-op and does not open the database.
    """
    # Prepare the batch of data. Unpacking a dict directly would yield its
    # KEYS (the descriptor names), so take the values explicitly.
    rows = []
    for mol_id, smile, descriptor in zip(mol_ids, smiles, batch_descriptors):
        values = descriptor.values() if isinstance(descriptor, dict) else descriptor
        rows.append((mol_id, smile, *values))

    if not rows:
        return

    # One placeholder per column: mol_id, smile, then every descriptor value.
    placeholders = ', '.join('?' * len(rows[0]))
    sql_command = f"INSERT INTO mol_metadata VALUES ({placeholders})"

    db_path = os.path.join(target_output_path, "SMILES_metadata.db")
    conn = sqlite3.connect(db_path)
    try:
        conn.executemany(sql_command, rows)
        conn.commit()
    finally:
        # Close the connection even if the insert fails.
        conn.close()


def create_SMILES_metadata(target_output_path="data/curated_chembl/"):
    """Build ``SMILES_metadata.db`` with descriptors for every mapped molecule.

    Reads the id -> SMILES mapping from
    ``data/curated_chembl/molecule_id_mappings/id_to_smiles.json``, computes
    descriptors in parallel worker processes, and inserts them in batches into
    the ``mol_metadata`` table of ``<target_output_path>/SMILES_metadata.db``.

    Args:
        target_output_path: Directory in which the database is created.

    Returns:
        None. If the database file already exists, nothing is done.

    Notes:
        If anything fails (including Ctrl-C), the partially written database
        is removed before the error is re-raised; otherwise the "already
        exists" check above would make every later run skip the work.
    """
    db_path = os.path.join(target_output_path, "SMILES_metadata.db")
    if os.path.exists(db_path):
        print("SMILES metadata already exists. Skipping creation.")
        return

    # Load the input before creating the database so a missing or invalid
    # JSON file does not leave an empty database behind.
    with open(os.path.join("data", "curated_chembl", "molecule_id_mappings", "id_to_smiles.json"), 'r') as f:
        id_to_smiles = json.load(f)

    # Batch by position rather than by numeric id range: every entry is
    # processed exactly once regardless of how the ids are numbered, and the
    # mapping is not rescanned for each batch.
    entries = list(id_to_smiles.items())
    batch_size = 10000

    # cpu_count() may be None, and 90% of a single core truncates to 0;
    # ProcessPoolExecutor requires at least one worker.
    num_workers = max(1, int((os.cpu_count() or 1) * 0.9))
    # Send SMILES to the workers in chunks to cut per-item IPC overhead.
    chunksize = max(1, batch_size // (num_workers * 4))

    try:
        # Create the table in the same file that the batches are written to.
        create_db_and_table(db_path)

        # One pool for the whole run instead of spawning processes per batch.
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            # range() stops at the last non-empty batch, so no empty trailing
            # batch is produced when the count is a multiple of batch_size.
            for batch_start in tqdm(range(0, len(entries), batch_size), desc="Creating SMILES metadata"):
                batch = entries[batch_start:batch_start + batch_size]
                mol_ids = [mol_id for mol_id, _ in batch]
                smiles = [smile for _, smile in batch]

                batch_descriptors = list(
                    executor.map(calculate_descriptors, smiles, chunksize=chunksize)
                )

                add_mol_desc_to_db(smiles, mol_ids, batch_descriptors, target_output_path)
    except BaseException:
        # The file did not exist when this call started, so anything at
        # db_path now is an incomplete result of this run.
        try:
            if os.path.exists(db_path):
                os.remove(db_path)
        except OSError:
            pass  # Do not mask the original error with a cleanup failure.
        raise


In [9]:
def main():
    """Entry point: build the SMILES metadata database with default arguments."""
    create_SMILES_metadata()

# Run only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    main()

Creating SMILES metadata:   0%|          | 0/359 [00:04<?, ?it/s]


KeyboardInterrupt: 