In [None]:
"""
In this notebook, the (almost) raw FAERS dataset is first processed. Then, the Disproportionality Analysis is performed.
Before beginning, download the provided files (link in README.md) and unpack them in PMD/data/raw/step_00
"""

In [1]:
import sys
sys.path.insert(0, '..')

import warnings
from joblib import Parallel, delayed

from sklearn.model_selection import GroupKFold

from src.data.io import read_df
from src.data.butina import butina_cluster
from src.data.descriptors import dataframe_2_morgan
from src.dataset.dpa import *
from src.dataset.preprocessing import *

# Number of concurrent joblib workers; passed as `n_jobs` to every Parallel(...) call below.
n_jobs = 16

  from .autonotebook import tqdm as notebook_tqdm


### Step 1

In [None]:
"""
In step 1 the following operations are performed:
    - redundant columns removed,
    - strings for drugs, indications, dates, weights, and ages are evaluated,
    - drugs are combined with their indications (if present),
    - correct datatypes are assigned
"""

In [None]:
# Output directory for the step-1 chunks (no error if it already exists).
os.makedirs('../data/intermediate/step_01', exist_ok=True)

def process_step_1(file):
    """
    Parallel wrapper for step 1.

    Reads one raw parquet chunk, applies `step_1` to it and dumps the result to
    '../data/intermediate/step_01/chunk_<id>.joblib', where <id> is the part of
    the file name between its last underscore and the first dot after it.
    Returns the input path unchanged.
    """
    file_name = file.rsplit("/", 1)[-1]
    chunk_id = file_name.rsplit("_", 1)[-1].partition(".")[0]

    processed = step_1(df=pd.read_parquet(file))
    joblib.dump(processed, f'../data/intermediate/step_01/chunk_{chunk_id}.joblib')

    return file

# Collect the raw chunks in a deterministic (sorted) order.
files = sorted(glob.glob('../data/raw/*.parquet'))

# Fail fast: with no input files the Parallel call below would silently do nothing
# and the problem would only surface later, when a step-1 chunk is loaded.
# NOTE(review): the notebook intro says to unpack the data in data/raw/step_00, while
# this pattern looks directly in data/raw - confirm which location is intended.
if not files:
    raise FileNotFoundError("No raw chunks found matching '../data/raw/*.parquet'")

# One task per chunk; joblib raises a timeout error if a task takes longer than 1200 s.
results = Parallel(n_jobs=n_jobs, verbose=10, timeout=1200)(
    delayed(process_step_1)(file) for file in files
)

In [3]:
# Inspect chunk 1 of the step-1 output (displayed as the cell result).
joblib.load('../data/intermediate/step_01/chunk_1.joblib')

Unnamed: 0,primaryid,event_dt,fda_dt,sex,age_months,age_group,weight_kg,reactions,drug_indi
0,153086732,2018-08-14,2018-08-22,M,960.0,Elderly,71.188004,"[blood pressure decreased, heart rate irregular]","[(1, PS, revlimid, plasma cell myeloma), (2, S..."
1,153088091,,2018-08-22,M,,,,[tooth loss],"[(1, PS, humira, psoriasis), (2, SS, humira, p..."
2,153088391,,2018-08-22,F,420.0,Adult,,[drug effect incomplete],"[(1, PS, otezla, psoriasis)]"
3,153090461,,2018-08-23,F,780.0,Elderly,,[injection site discolouration],"[(1, PS, enbrel, rheumatoid arthritis)]"
4,153090671,2018-08-15,2018-08-23,,,,,"[myocardial strain, pleural effusion]","[(1, PS, eligard, prostate cancer)]"
...,...,...,...,...,...,...,...,...,...
276365,152884242,,2018-08-17,F,648.0,Adult,,"[influenza, multiple sclerosis relapse, stress]","[(1, PS, tysabri, multiple sclerosis)]"
276366,152884321,,2018-08-17,F,,,,[alopecia],"[(1, PS, tecfidera, multiple sclerosis), (2, C..."
276367,152886135,,2018-08-17,M,,,90.0,"[arthralgia, circumstance or information capab...","[(1, PS, benlysta, systemic lupus erythematosu..."
276368,152886161,,2018-08-17,M,468.0,Adult,,"[haematoma, neoplasm skin, pain in extremity]","[(1, PS, lamotrigine., product used for unknow..."


### Step 2

In [None]:
"""
In step 2 the following operations are performed:
    - mapping of reactions from LLT to PTs,
    - counting of drug types,
    - filtering of the dataframe based on set max_ps, max_drug, and max_reactions
"""

In [19]:
for out_dir in ('../data/intermediate/step_02a', '../data/intermediate/step_02b'):
    os.makedirs(out_dir, exist_ok=True)

"""
The llt_2_pt.tsv file contains mapping from LLT descriptions used in the FAERS database to MedDRA Preferred Terms (PT), which we are not allowed to make public due to license restrictions. To make running the notebook possible, we make available llt_2_pt_encoded.tsv file, which encodes the PT as just indices. See README.md in PMD/data/mappings for a link to a website where the license can be obtained.
"""
use_encoded = True

# Pick the mapping table and the name of its term column in one place.
mapping_file, terms_col = (
    ('../data/mappings/llt_2_pt_encoded.tsv', 'PT_enc')
    if use_encoded
    else ('../data/mappings/llt_2_pt.tsv', 'PT')
)
llt_2_pt_table = read_df(mapping_file)
pt_values = llt_2_pt_table[terms_col].tolist()

# Sorted tuple of the distinct terms; a term's position in it is its index.
pts = tuple(sorted(set(pt_values)))

# LLT -> term lookup (replaces the table under the name used by later cells).
llt_2_pt = dict(zip(llt_2_pt_table['LLT'].tolist(), pt_values))

# Term <-> index lookups, persisted for the later steps.
pt_2_idx = {pt: position for position, pt in enumerate(pts)}
idx_2_pt = dict(enumerate(pts))

joblib.dump(pt_2_idx, '../data/mappings/pt_2_idx.joblib')
joblib.dump(idx_2_pt, '../data/mappings/idx_2_pt.joblib')

['../data/mappings/idx_2_pt.joblib']

In [20]:
def process_step_2(file, output_dir, llt_2_pt, max_ps, max_drug, filter_type, max_reactions=20):
    """
    Parallel wrapper for step 2.

    Loads one joblib chunk, applies `step_2` to it and dumps the result under the
    same file name into `output_dir`.

    Parameters
    ----------
    file : str
        Path of the joblib chunk to load ('/'-separated).
    output_dir : str
        Directory that receives the processed chunk.
    llt_2_pt, max_ps, max_drug, filter_type
        Forwarded unchanged to `step_2`.
    max_reactions : int, default 20
        Forwarded to `step_2`. It used to be hard-coded; the default keeps the
        previous value, so existing calls behave as before.

    Returns
    -------
    str
        The input path `file`.
    """
    out_path = f'{output_dir}/{file.split("/")[-1]}'

    df = joblib.load(file)
    df = step_2(
        df=df,
        llt_2_pt=llt_2_pt,
        max_ps=max_ps,
        max_drug=max_drug,
        max_reactions=max_reactions,
        filter_type=filter_type
    )

    joblib.dump(df, out_path)
    return file

files = sorted(glob.glob('../data/intermediate/step_01/chunk*.joblib'))

# (output_dir, max_ps, max_drug, filter_type) of the two filtering variants;
# every step-1 chunk is processed once per variant.
variants = (
    ('../data/intermediate/step_02a', 1, None, 'primary'),
    ('../data/intermediate/step_02b', None, 10, 'primsec'),
)
tasks = [(chunk, *variant) for variant in variants for chunk in files]

results = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_2)(
        file=chunk,
        output_dir=target_dir,
        llt_2_pt=llt_2_pt,
        max_ps=ps_limit,
        max_drug=drug_limit,
        filter_type=mode,
    )
    for chunk, target_dir, ps_limit, drug_limit, mode in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:  1.0min
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:  2.0min
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:  3.1min
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:  4.5min
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:  6.0min
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:  7.5min
[Parallel(n_jobs=16)]: Done  81 tasks      | elapsed:  9.1min
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed: 11.0min
[Parallel(n_jobs=16)]: Done 110 out of 128 | elapsed: 12.5min remaining:  2.1min
[Parallel(n_jobs=16)]: Done 123 out of 128 | elapsed: 13.8min remaining:   33.7s
[Parallel(n_jobs=16)]: Done 128 out of 128 | elapsed: 14.0min finished


In [21]:
# Inspect chunk 1 of the step-2 'a' output (displayed as the cell result).
joblib.load('../data/intermediate/step_02a/chunk_1.joblib')

Unnamed: 0,primaryid,event_dt,fda_dt,sex,age_months,age_group,weight_kg,reactions,role_code,drug_name,indication
0,153086732,2018-08-14,2018-08-22,M,960.0,Elderly,71.188004,"[PT_02918, PT_08975]",PS,revlimid,plasma cell myeloma
1,153088091,,2018-08-22,M,,,,[PT_19770],PS,humira,psoriasis
2,153088391,,2018-08-22,F,420.0,Adult,,[PT_19443],PS,otezla,psoriasis
3,153090461,,2018-08-23,F,780.0,Elderly,,[PT_10515],PS,enbrel,rheumatoid arthritis
4,153090671,2018-08-15,2018-08-23,,,,,"[PT_13209, PT_15697]",PS,eligard,prostate cancer
...,...,...,...,...,...,...,...,...,...,...,...
272472,152884242,,2018-08-17,F,648.0,Adult,,"[PT_10387, PT_13031, PT_18947]",PS,tysabri,multiple sclerosis
272473,152884321,,2018-08-17,F,,,,[PT_00760],PS,tecfidera,multiple sclerosis
272474,152886135,,2018-08-17,M,,,90.0,"[PT_01713, PT_04530, PT_05314, PT_05798, PT_06...",PS,benlysta,systemic lupus erythematosus
272475,152886161,,2018-08-17,M,468.0,Adult,,"[PT_08774, PT_13532, PT_14594]",PS,lamotrigine.,product used for unknown indication


### Step 3

In [None]:
"""
In step 3 the following operations are performed:
    - mapping of demographic values to numpy arrays
    - mapping of FAERS drug descriptions to active ingredients
    - removal of rows with missing reactions or actives
"""

In [22]:
# Output directories of step 3 and of the weight mappings.
for out_dir in (
    '../data/intermediate/step_03a',
    '../data/intermediate/step_03b',
    '../data/mappings/wq_primary',
    '../data/mappings/wq_secondary',
):
    os.makedirs(out_dir, exist_ok=True)

# mapping from drug descriptions to active ingredients
token_map = read_df('../data/mappings/token_mapping.tsv')

def process_step_3(file, output_dir, tok_map, wbk, wsu, waesk, waesu):
    """
    Parallel wrapper for step 3.

    Loads one joblib chunk, applies `step_3` with the token map (`tok_map`) and the
    four weight mappings (`wbk`, `wsu`, `waesk`, `waesu`), and dumps the result
    under the same file name into `output_dir`. Returns the input path.
    """
    chunk_name = file.rsplit("/", 1)[-1]

    processed = step_3(
        df=joblib.load(file),
        token_map=tok_map,
        wq_both_known=wbk,
        wq_sex_unknown=wsu,
        wq_age_estim_sex_known=waesk,
        wq_age_estim_sex_unknown=waesu,
    )
    joblib.dump(processed, f'{output_dir}/{chunk_name}')

    return file

In [None]:
"""
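The required files (wq_primary/ and wq_secondary/) are already provided in data/mappings, so the code below is disabled; it is kept to show how they can be regenerated: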

both_known, sex_unknown, age_estim_sex_known, age_estim_sex_unknown = prepare_weight_mapping(f'../data/intermediate/step_02a')

joblib.dump(both_known, f'../data/mappings/wq_primary/wq_both_known.joblib')
joblib.dump(sex_unknown, f'../data/mappings/wq_primary/wq_sex_unknown.joblib')
joblib.dump(age_estim_sex_known, f'../data/mappings/wq_primary/wq_age_estim_sex_known.joblib')
joblib.dump(age_estim_sex_unknown, f'../data/mappings/wq_primary/wq_age_estim_sex_unknown.joblib')

both_known, sex_unknown, age_estim_sex_known, age_estim_sex_unknown = prepare_weight_mapping('../data/intermediate/step_02b')

joblib.dump(both_known, f'../data/mappings/wq_secondary/wq_both_known.joblib')
joblib.dump(sex_unknown, f'../data/mappings/wq_secondary/wq_sex_unknown.joblib')
joblib.dump(age_estim_sex_known, f'../data/mappings/wq_secondary/wq_age_estim_sex_known.joblib')
joblib.dump(age_estim_sex_unknown, f'../data/mappings/wq_secondary/wq_age_estim_sex_unknown.joblib')
"""

In [23]:
# Weight mappings for the 'a' variant, loaded from wq_primary.
(
    both_known,
    sex_unknown,
    age_estim_sex_known,
    age_estim_sex_unknown,
) = (
    joblib.load(mapping_path)
    for mapping_path in (
        '../data/mappings/wq_primary/wq_both_known.joblib',
        '../data/mappings/wq_primary/wq_sex_unknown.joblib',
        '../data/mappings/wq_primary/wq_age_estim_sex_known.joblib',
        '../data/mappings/wq_primary/wq_age_estim_sex_unknown.joblib',
    )
)

files_02a = sorted(glob.glob('../data/intermediate/step_02a/chunk*.joblib'))
tasks = [(chunk, '../data/intermediate/step_03a') for chunk in files_02a]

results = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_3)(
        chunk, target_dir, token_map,
        both_known, sex_unknown, age_estim_sex_known, age_estim_sex_unknown,
    )
    for chunk, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   18.2s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   26.0s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:   41.7s
[Parallel(n_jobs=16)]: Done  40 out of  64 | elapsed:   51.3s remaining:   30.8s
[Parallel(n_jobs=16)]: Done  47 out of  64 | elapsed:  1.0min remaining:   22.1s
[Parallel(n_jobs=16)]: Done  54 out of  64 | elapsed:  1.1min remaining:   12.6s
[Parallel(n_jobs=16)]: Done  61 out of  64 | elapsed:  1.2min remaining:    3.6s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:  1.3min finished


In [24]:
# Weight mappings for the 'b' variant, loaded from wq_secondary
# (rebinds the names used for the 'a' variant).
(
    both_known,
    sex_unknown,
    age_estim_sex_known,
    age_estim_sex_unknown,
) = (
    joblib.load(mapping_path)
    for mapping_path in (
        '../data/mappings/wq_secondary/wq_both_known.joblib',
        '../data/mappings/wq_secondary/wq_sex_unknown.joblib',
        '../data/mappings/wq_secondary/wq_age_estim_sex_known.joblib',
        '../data/mappings/wq_secondary/wq_age_estim_sex_unknown.joblib',
    )
)

files_02b = sorted(glob.glob('../data/intermediate/step_02b/chunk*.joblib'))
tasks = [(chunk, '../data/intermediate/step_03b') for chunk in files_02b]

results = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_3)(
        chunk, target_dir, token_map,
        both_known, sex_unknown, age_estim_sex_known, age_estim_sex_unknown,
    )
    for chunk, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   34.4s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   42.2s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:  1.2min
[Parallel(n_jobs=16)]: Done  40 out of  64 | elapsed:  1.4min remaining:   50.8s
[Parallel(n_jobs=16)]: Done  47 out of  64 | elapsed:  1.7min remaining:   36.7s
[Parallel(n_jobs=16)]: Done  54 out of  64 | elapsed:  1.8min remaining:   20.2s
[Parallel(n_jobs=16)]: Done  61 out of  64 | elapsed:  2.0min remaining:    5.9s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:  2.0min finished


In [25]:
# Inspect chunk 1 of the step-3 'a' output (displayed as the cell result).
joblib.load('../data/intermediate/step_03a/chunk_1.joblib')

Unnamed: 0,primaryid,sex,age_group,reactions,role_code,drug_name,indication,weight,active
0,153086732,M,Elderly,"[PT_02918, PT_08975]",PS,revlimid,plasma cell myeloma,Low,lenalidomide
1,153088091,M,Unknown,[PT_19770],PS,humira,psoriasis,Unknown,adalimumab
2,153088391,F,Adult,[PT_19443],PS,otezla,psoriasis,Unknown,apremilast
3,153090461,F,Elderly,[PT_10515],PS,enbrel,rheumatoid arthritis,Unknown,etanercept
4,153090671,Unknown,Unknown,"[PT_13209, PT_15697]",PS,eligard,prostate cancer,Unknown,leuprolide
...,...,...,...,...,...,...,...,...,...
272472,152884242,F,Adult,"[PT_10387, PT_13031, PT_18947]",PS,tysabri,multiple sclerosis,Unknown,natalizumab
272473,152884321,F,Unknown,[PT_00760],PS,tecfidera,multiple sclerosis,Unknown,dimethyl fumarate
272474,152886135,M,Unknown,"[PT_01713, PT_04530, PT_05314, PT_05798, PT_06...",PS,benlysta,systemic lupus erythematosus,High,belimumab
272475,152886161,M,Adult,"[PT_08774, PT_13532, PT_14594]",PS,lamotrigine.,product used for unknown indication,Unknown,lamotrigine


### Step 4

In [None]:
"""
In step 4 the following operations are performed:
    - mapping of active ingredients to SMILES strings
    - check for completeness of mapping
"""

In [26]:
for out_dir in ('../data/intermediate/step_04a', '../data/intermediate/step_04b'):
    os.makedirs(out_dir, exist_ok=True)

# mapping from active ingredient names to SMILES strings
smiles_mapping = read_df('../data/mappings/smiles_mapping.tsv')

# Sorted tuple of the distinct SMILES; a SMILES string's position in it is its index.
unique_smiles = set(smiles_mapping['SMILES'])
smiles = tuple(sorted(unique_smiles))

smi_2_idx = {smi: position for position, smi in enumerate(smiles)}
idx_2_smi = dict(enumerate(smiles))

joblib.dump(smi_2_idx, '../data/mappings/smi_2_idx.joblib')
joblib.dump(idx_2_smi, '../data/mappings/idx_2_smi.joblib')

['../data/mappings/idx_2_smi.joblib']

In [27]:
def process_step_4(file, output_dir):
    """
    Parallel wrapper for step 4.

    Loads one joblib chunk, applies `step_4` with the notebook-level
    `smiles_mapping` table (column 'SMILES') and dumps the result under the same
    file name into `output_dir`. Returns the input path.
    """
    chunk_name = file.rsplit("/", 1)[-1]

    processed = step_4(
        df=joblib.load(file),
        smiles_mapping=smiles_mapping,
        smiles_col='SMILES',
    )
    joblib.dump(processed, f'{output_dir}/{chunk_name}')

    return file

files_03a = sorted(glob.glob('../data/intermediate/step_03a/chunk*.joblib'))
files_03b = sorted(glob.glob('../data/intermediate/step_03b/chunk*.joblib'))

# (input chunk, output directory) pairs: all 'a' chunks first, then all 'b' chunks.
tasks = [(chunk, '../data/intermediate/step_04a') for chunk in files_03a]
tasks += [(chunk, '../data/intermediate/step_04b') for chunk in files_03b]

results = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_4)(chunk, target_dir)
    for chunk, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    2.7s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    4.0s
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    5.1s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    6.5s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:    8.2s
[Parallel(n_jobs=16)]: Done  81 tasks      | elapsed:   10.4s
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:   12.3s
[Parallel(n_jobs=16)]: Done 110 out of 128 | elapsed:   14.9s remaining:    2.4s
[Parallel(n_jobs=16)]: Done 123 out of 128 | elapsed:   16.4s remaining:    0.7s
[Parallel(n_jobs=16)]: Done 128 out of 128 | elapsed:   16.7s finished


In [28]:
# Inspect chunk 1 of the step-4 'a' output (displayed as the cell result).
joblib.load('../data/intermediate/step_04a/chunk_1.joblib')

Unnamed: 0,primaryid,Sex,Age,reactions,role_code,Weight,active,SMILES
0,153086732,M,Elderly,"[PT_02918, PT_08975]",PS,Low,lenalidomide,[Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O]
1,153088391,F,Adult,[PT_19443],PS,Unknown,apremilast,[CCOc1cc(C(CS(C)(=O)=O)N2C(=O)c3cccc(NC(C)=O)c...
2,153090811,F,Elderly,"[PT_05719, PT_07114, PT_07677, PT_13993, PT_15...",PS,Unknown,dalfampridine,[Nc1ccncc1]
3,153091271,F,Unknown,[PT_06089],PS,Unknown,apremilast,[CCOc1cc(C(CS(C)(=O)=O)N2C(=O)c3cccc(NC(C)=O)c...
4,153091611,M,Adult,[PT_06089],PS,Unknown,apremilast,[CCOc1cc(C(CS(C)(=O)=O)N2C(=O)c3cccc(NC(C)=O)c...
...,...,...,...,...,...,...,...,...
169165,152880041,M,Unknown,"[PT_06381, PT_10671]",PS,Unknown,alprazolam,[Cc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1)=NC2]
169166,152881111,M,Adult,[PT_00997],PS,Unknown,amoxicillin,[CC1(C)SC2C(NC(=O)C(N)c3ccc(O)cc3)C(=O)N2C1C(=...
169167,152881282,F,Adult,"[PT_03393, PT_06330, PT_14121]",PS,Unknown,voriconazole,[CC(c1ncncc1F)C(O)(Cn1cncn1)c1ccc(F)cc1F]
169168,152884321,F,Unknown,[PT_00760],PS,Unknown,dimethyl fumarate,[COC(=O)C=CC(=O)OC]


### Step 5

In [None]:
"""
In step 5 the following operations are performed:
- encoding of reactions and SMILES strings to facilitate the subsequent DPA analysis
"""

In [29]:
for out_dir in ('../data/intermediate/step_05a', '../data/intermediate/step_05b'):
    os.makedirs(out_dir, exist_ok=True)

# Lookup tables written earlier in this notebook (SMILES in Step 4, PT terms in Step 2).
smi_2_idx = joblib.load('../data/mappings/smi_2_idx.joblib')
pt_2_idx = joblib.load('../data/mappings/pt_2_idx.joblib')

In [30]:
def process_step_5(file, output_dir):
    """
    Parallel wrapper for step 5.

    Loads one joblib chunk, applies `step_5` with the notebook-level lookups
    `smi_2_idx` (SMILES -> index) and `pt_2_idx` (MedDRA PT -> index), and dumps
    the result under the same file name into `output_dir`. Returns the input path.
    """
    chunk_name = file.rsplit("/", 1)[-1]

    encoded = step_5(
        df=joblib.load(file),
        smi_2_idx=smi_2_idx,
        pt_2_idx=pt_2_idx,
    )
    joblib.dump(encoded, f'{output_dir}/{chunk_name}')

    return file

files_04a = sorted(glob.glob('../data/intermediate/step_04a/chunk*.joblib'))
files_04b = sorted(glob.glob('../data/intermediate/step_04b/chunk*.joblib'))

# (input chunk, output directory) pairs: all 'a' chunks first, then all 'b' chunks.
tasks = [(chunk, '../data/intermediate/step_05a') for chunk in files_04a]
tasks += [(chunk, '../data/intermediate/step_05b') for chunk in files_04b]

results = Parallel(n_jobs=n_jobs, verbose=1)(
    delayed(process_step_5)(chunk, target_dir)
    for chunk, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done 128 out of 128 | elapsed:   36.5s finished


In [31]:
# Inspect chunk 1 of the step-5 'a' output (displayed as the cell result).
joblib.load('../data/intermediate/step_05a/chunk_1.joblib')

Unnamed: 0,Sex,Age,Weight,reac_enc,smi_enc
0,M,Elderly,Low,"[2917, 8974]",[2497]
1,F,Adult,Unknown,[19442],[1221]
2,F,Elderly,Unknown,"[5718, 7113, 7676, 13992, 15250, 16218]",[2508]
3,F,Unknown,Unknown,[6088],[1221]
4,M,Adult,Unknown,[6088],[1221]
...,...,...,...,...,...
169165,M,Unknown,Unknown,"[6380, 10670]",[2160]
169166,M,Adult,Unknown,[996],[588]
169167,F,Adult,Unknown,"[3392, 6329, 14120]",[558]
169168,F,Unknown,Unknown,[759],[1601]


### Step 6

In [None]:
"""
In step 6 the following operations are performed:
    - data stratification by various combinations of demographic factors
    - counting of individual entries (i.e. instead of having 100 rows with the same data, there will be a single row with its count)

This step requires around 32GB of RAM.
"""

In [32]:
# Output directories for the step-6 results of both variants (no error if they exist).
os.makedirs(f'../data/intermediate/step_06a', exist_ok=True)
os.makedirs(f'../data/intermediate/step_06b', exist_ok=True)

def process_step_6_1(file):
    """
    Parallel wrapper for step 6.1: load one joblib file and return `step_6_1` applied to it.
    """
    chunk = joblib.load(file)
    result = step_6_1(df=chunk)
    return result

def process_dataset(input_pattern, output_dir):
    """
    Run step 6 for all files matching `input_pattern`.

    Every matching file goes through `process_step_6_1` in parallel; the partial
    frames are concatenated and passed to `step_6_2`. The result is then split by
    the value of its `strat` column and each part is dumped (with a fresh index)
    to '<output_dir>/strat_<value>.joblib'.

    Returns `output_dir`.
    """
    chunk_files = sorted(glob.glob(input_pattern))

    partial_dfs = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
        delayed(process_step_6_1)(chunk_file) for chunk_file in chunk_files
    )

    combined = pd.concat(partial_dfs, ignore_index=True)
    processed_df = step_6_2(combined)

    for strat in processed_df.strat.unique():
        strat_mask = processed_df.strat == strat
        joblib.dump(
            processed_df[strat_mask].reset_index(drop=True),
            f'{output_dir}/strat_{strat}.joblib',
        )

    return output_dir

# Run step 6 for both variants, 'a' first; the return value (the output directory) is not needed.
for pattern, target_dir in (
    ('../data/intermediate/step_05a/chunk*.joblib', '../data/intermediate/step_06a'),
    ('../data/intermediate/step_05b/chunk*.joblib', '../data/intermediate/step_06b'),
):
    _ = process_dataset(pattern, target_dir)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    6.0s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    8.8s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:   12.7s
[Parallel(n_jobs=16)]: Done  40 out of  64 | elapsed:   16.4s remaining:    9.8s
[Parallel(n_jobs=16)]: Done  47 out of  64 | elapsed:   19.3s remaining:    7.0s
[Parallel(n_jobs=16)]: Done  54 out of  64 | elapsed:   22.1s remaining:    4.1s
[Parallel(n_jobs=16)]: Done  61 out of  64 | elapsed:   23.3s remaining:    1.1s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:   23.7s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    9.8s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   14.8s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:   23.0s
[Parallel(n_jobs=16)]: Done  40 out of  64 | elapsed:   29.1s remaining:   17.4

In [33]:
# Inspect the 'saw' stratification of the step-6 'a' output (displayed as the cell result).
joblib.load('../data/intermediate/step_06a/strat_saw.joblib')

Unnamed: 0,smi_enc,reac_enc,count,strat,Sex,Age,Weight
0,1,207,1,saw,F,Adolescent,Average
1,1,1587,1,saw,F,Adolescent,Average
2,1,3459,1,saw,F,Adolescent,Average
3,1,4831,1,saw,F,Adolescent,Average
4,1,6165,1,saw,F,Adolescent,Average
...,...,...,...,...,...,...,...
5131518,2971,21537,42,saw,Unknown,Unknown,Unknown
5131519,2971,21553,2,saw,Unknown,Unknown,Unknown
5131520,2971,21590,6,saw,Unknown,Unknown,Unknown
5131521,2971,21633,1,saw,Unknown,Unknown,Unknown


### Step 7

In [None]:
"""
In step 7 cardiotoxicity labels are assigned to reactions.

We're allowed to include the cardiotoxicity_terms.tsv as the number of terms is lower than 1000 and can be considered an example
as per MedDRA license.
"""

In [34]:
# Table of cardiotoxicity terms; the following cell reads the same file again.
pt_sets = read_df('../data/mappings/cardiotoxicity_terms.tsv')

In [36]:
# Map grouped PT terms to sets of idxs
os.makedirs('../data/intermediate/step_07a', exist_ok=True)
os.makedirs('../data/intermediate/step_07b', exist_ok=True)

groups = ['Cvas', 'Card', 'Cred']
pt_sets = read_df('../data/mappings/cardiotoxicity_terms.tsv')

# group name -> list of terms whose flag in that group's column equals 1.
# `terms_col` is defined in the Step 2 cell ('PT_enc' or 'PT'), so that cell must
# have been run in this kernel before this one.
sets_dc = {
    group: pt_sets[pt_sets[group] == 1][terms_col].tolist()
    for group in groups
}

joblib.dump(sets_dc, '../data/mappings/meddra_sets_pt_dict.joblib')

pt_2_idx = joblib.load('../data/mappings/pt_2_idx.joblib')

# group name -> sorted list of term indices
idx_dc = {}

# The loop variable is deliberately not called `pts`, so the tuple of all terms
# created in Step 2 under that name is no longer overwritten.
for name, group_terms in sets_dc.items():
    # dict.get() would slip None into the list for an unmapped term, which either
    # breaks sorted() with an opaque TypeError or stores None as an index.
    unknown_terms = [term for term in group_terms if term not in pt_2_idx]
    if unknown_terms:
        raise KeyError(
            f"{len(unknown_terms)} term(s) of group '{name}' are missing from pt_2_idx, "
            f"e.g. {unknown_terms[:5]}"
        )
    idx_dc[name] = sorted(pt_2_idx[term] for term in group_terms)

joblib.dump(idx_dc, '../data/mappings/meddra_sets_idx_dict.joblib')

['../data/mappings/meddra_sets_idx_dict.joblib']

In [37]:
# Group name -> term indices, as dumped by the previous cell; read by process_step_7 as a global.
idx_dc = joblib.load('../data/mappings/meddra_sets_idx_dict.joblib')

def process_step_7(file, output_base_dir):
    """
    Parallel wrapper for step 7.

    Loads one stratification file ('.../strat_<type>.joblib'), applies `step_7`
    with the notebook-level `idx_dc`, and dumps every returned sub-frame to
    '<output_base_dir>/<pt_set>/strat_<strat_type>.joblib', creating the
    per-set directory when needed.

    Parameters
    ----------
    file : str
        '/'-separated path of the stratification file to load.
    output_base_dir : str
        Directory below which one sub-directory per PT set is written.

    Returns
    -------
    list of tuple
        One (pt_set, strat_type) pair per sub-frame returned by `step_7`.
    """
    # 'strat_saw.joblib' -> 'saw'. The extension is removed with splitext because
    # str.rstrip('.joblib') strips a *set of characters* and would also eat trailing
    # letters of the stratification name (e.g. 'all' -> 'a').
    file_name = file.split('/')[-1]
    strat_type = os.path.splitext(file_name)[0].split('_')[-1]

    df = joblib.load(file)
    result_dfs = step_7(
        df=df,
        idx_dc=idx_dc,
        strat_type=strat_type
    )

    results = []

    # step_7 yields (sub_df, pt_set, strat_type) triples; the stratification type
    # used for the file name is the one reported by step_7.
    for sub_df, pt_set, sub_strat_type in result_dfs:

        pt_set_dir = f'{output_base_dir}/{pt_set}'
        os.makedirs(pt_set_dir, exist_ok=True)

        out_path = f'{pt_set_dir}/strat_{sub_strat_type}.joblib'

        joblib.dump(sub_df, out_path)
        results.append((pt_set, sub_strat_type))

    return results

files_06a = sorted(glob.glob('../data/intermediate/step_06a/strat*.joblib'))
files_06b = sorted(glob.glob('../data/intermediate/step_06b/strat*.joblib'))

# (stratification file, output base directory) pairs: 'a' files first, then 'b' files.
tasks = [(strat_file, '../data/intermediate/step_07a') for strat_file in files_06a]
tasks += [(strat_file, '../data/intermediate/step_07b') for strat_file in files_06b]

_ = Parallel(n_jobs=n_jobs, verbose=1, timeout=600)(
    delayed(process_step_7)(strat_file, target_dir)
    for strat_file, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:   19.2s remaining:  2.2min
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:  1.1min finished


In [38]:
# Inspect the 'saw' stratification of the 'Card' set from step 7 (displayed as the cell result).
joblib.load('../data/intermediate/step_07a/Card/strat_saw.joblib')

Unnamed: 0,Sex,Age,Weight,smi_enc,Card,Card_ct
0,F,Adolescent,Average,1,0,18
1,F,Adolescent,Average,5,0,94
2,F,Adolescent,Average,7,0,374
3,F,Adolescent,Average,8,0,6753
4,F,Adolescent,Average,8,1,19
...,...,...,...,...,...,...
64010,Unknown,Unknown,Unknown,2962,1,2
64011,Unknown,Unknown,Unknown,2964,0,594
64012,Unknown,Unknown,Unknown,2964,1,3
64013,Unknown,Unknown,Unknown,2971,0,3774


### Step 8

In [None]:
"""
In step 8 DPA contingency matrices are calculated for each data stratification level
"""

In [32]:
# Output directories for the step-8 results of both variants (no error if they exist).
os.makedirs(f'../data/intermediate/step_08a', exist_ok=True)
os.makedirs(f'../data/intermediate/step_08b', exist_ok=True)

In [36]:
def process_step_8(file, output_base_dir):
    """
    Parallel wrapper for step 8.

    Loads one file laid out as '.../<pt_set>/strat_<strat_type>.joblib', applies
    `step_8` and dumps the result to
    '<output_base_dir>/<pt_set>/strat_<strat_type>.joblib'.

    Parameters
    ----------
    file : str
        Path of the file to load; the PT set is the name of its parent directory
        and the stratification type is the part of the file name after the last
        underscore, without the extension.
    output_base_dir : str
        Directory below which one sub-directory per PT set is written.

    Returns
    -------
    tuple
        (pt_set, strat_type) as parsed from `file`.
    """
    # Parse the path from its tail instead of unpacking exactly six '/'-separated
    # parts, so the nesting depth of the input directory does not matter.
    parent_dir, file_name = os.path.split(file)
    pt_set = os.path.basename(parent_dir)
    # splitext instead of rstrip('.joblib'): rstrip removes a *set of characters*
    # and would also eat trailing letters of the name (e.g. 'strat_all' -> 'strat_a').
    strat_type = os.path.splitext(file_name)[0].split('_')[-1]

    df = joblib.load(file)
    df = step_8(
        df=df,
        smiles_col='smi_enc',
        reaction_col=pt_set,          # the label column carries the PT set's name
        count_col=f'{pt_set}_ct',
        pt_idx=1,
        strat_type=strat_type
    )

    os.makedirs(f'{output_base_dir}/{pt_set}', exist_ok=True)

    out_path = f'{output_base_dir}/{pt_set}/strat_{strat_type}.joblib'
    joblib.dump(df, out_path)

    return pt_set, strat_type

files_07a = sorted(glob.glob('../data/intermediate/step_07a/*/strat*.joblib'))
files_07b = sorted(glob.glob('../data/intermediate/step_07b/*/strat*.joblib'))

# (step-7 file, output base directory) pairs: 'a' files first, then 'b' files.
tasks = [(strat_file, '../data/intermediate/step_08a') for strat_file in files_07a]
tasks += [(strat_file, '../data/intermediate/step_08b') for strat_file in files_07b]

_ = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_8)(strat_file, target_dir)
    for strat_file, target_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   13.6s
[Parallel(n_jobs=16)]: Done  22 out of  48 | elapsed:   38.0s remaining:   44.9s
[Parallel(n_jobs=16)]: Done  27 out of  48 | elapsed:   47.2s remaining:   36.7s
[Parallel(n_jobs=16)]: Done  32 out of  48 | elapsed:  1.0min remaining:   30.1s
[Parallel(n_jobs=16)]: Done  37 out of  48 | elapsed:  1.2min remaining:   21.4s
[Parallel(n_jobs=16)]: Done  42 out of  48 | elapsed:  1.5min remaining:   12.7s
[Parallel(n_jobs=16)]: Done  48 out of  48 | elapsed:  2.0min finished


In [37]:
# Inspect the 'saw' stratification of the 'Card' set from step 8 (displayed as the cell result).
joblib.load('../data/intermediate/step_08a/Card/strat_saw.joblib')

Unnamed: 0,smi_enc,Card,a,b,c,d,Sex,Age,Weight
0,1,1,0,18,1919,61673,F,Adolescent,Average
1,5,1,0,94,1919,61597,F,Adolescent,Average
2,7,1,0,374,1919,61317,F,Adolescent,Average
3,8,1,19,6753,1900,54938,F,Adolescent,Average
4,9,1,5,147,1914,61544,F,Adolescent,Average
...,...,...,...,...,...,...,...,...,...
41849,2951,1,1,57,128894,3534987,Unknown,Unknown,Unknown
41850,2961,1,0,13,128895,3535031,Unknown,Unknown,Unknown
41851,2962,1,2,206,128893,3534838,Unknown,Unknown,Unknown
41852,2964,1,3,594,128892,3534450,Unknown,Unknown,Unknown


### Step 9

In [None]:
"""
In step 9 the following operations are performed:
    - entries with insufficient amount of data to perform DPA are removed
    - the DPA metrics and confidence scores are calculated
    - the cardiotoxicity risk labels are assigned
    - risk labels are binarized
"""

In [8]:
# Division by zero does not raise in NumPy array arithmetic: it emits a RuntimeWarning
# and yields nan/inf. process_step_9 below silences RuntimeWarning for that reason
# (presumably the divisions happen inside step_9 - confirm).
os.makedirs('../data/intermediate/step_09a', exist_ok=True)
os.makedirs('../data/intermediate/step_09b', exist_ok=True)

def process_step_9(file, output_base_dir):
    """Run step 9 on one stratified step-8 file and store the result.

    Parameters
    ----------
    file : str
        Input path of the form ``.../<pt_set>/strat_<strat_type>.joblib``.
    output_base_dir : str
        Directory under which ``<pt_set>/strat_<strat_type>.joblib`` is written.

    Returns
    -------
    tuple of (str, str)
        ``(pt_set, strat_type)`` as parsed from ``file``.
    """
    # Take both identifiers from the path's own structure instead of relying
    # on an exact number of '/'-separated components.
    pt_set = os.path.basename(os.path.dirname(file))
    # splitext removes exactly the extension. str.rstrip('.joblib') strips a
    # *set of characters* and would also eat trailing letters of any name
    # ending in '.', 'j', 'o', 'b', 'l' or 'i'.
    strat_type = os.path.splitext(os.path.basename(file))[0].split('_')[-1]

    # Scope the RuntimeWarning filter to this call so it does not stay active
    # for whatever runs next in the same process.
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=RuntimeWarning)

        df = joblib.load(file)
        df = step_9(
            df=df,
            min_records=3,   # minimum number of drug-adverse reaction records required to keep an entry
            sign_level=0.01, # significance level for Confidence Interval calculation
            shrink=0.5,      # shrinkage factor for Information Component
            dist_power=0.5,  # distance-to-threshold power for sigmoid transformation of raw confidence scores
            saturation=2.5   # plateau value for sigmoid transformation of raw confidence scores
        )

    os.makedirs(f'{output_base_dir}/{pt_set}', exist_ok=True)

    out_path = f'{output_base_dir}/{pt_set}/strat_{strat_type}.joblib'
    joblib.dump(df, out_path)

    return pt_set, strat_type

# Each step-8 variant (a/b) is routed to the step-9 directory of the same variant.
step_09_routes = (
    ('../data/intermediate/step_08a/*/strat*.joblib', '../data/intermediate/step_09a'),
    ('../data/intermediate/step_08b/*/strat*.joblib', '../data/intermediate/step_09b'),
)

# One (input file, output base directory) pair per stratification file.
tasks = [
    (file, output_dir)
    for pattern, output_dir in step_09_routes
    for file in sorted(glob.glob(pattern))
]

_ = Parallel(n_jobs=n_jobs, verbose=10, timeout=600)(
    delayed(process_step_9)(file, output_dir)
    for file, output_dir in tasks
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    3.6s
[Parallel(n_jobs=16)]: Done  22 out of  48 | elapsed:   10.5s remaining:   12.5s
[Parallel(n_jobs=16)]: Done  27 out of  48 | elapsed:   12.9s remaining:   10.0s
[Parallel(n_jobs=16)]: Done  32 out of  48 | elapsed:   16.9s remaining:    8.4s
[Parallel(n_jobs=16)]: Done  37 out of  48 | elapsed:   18.9s remaining:    5.6s
[Parallel(n_jobs=16)]: Done  42 out of  48 | elapsed:   20.0s remaining:    2.9s
[Parallel(n_jobs=16)]: Done  48 out of  48 | elapsed:   30.0s finished


In [37]:
# Sanity check: show one step-9 output (step_09a, 'Card' directory, 's' stratification file).
joblib.load('../data/intermediate/step_09a/Card/strat_s.joblib')

Unnamed: 0,smi_enc,Card,a,b,c,d,Sex,prr,prr_lower,prr_upper,...,prr_weight,prr_bin,ror_tox,ror_conf,ror_weight,ror_bin,ic_tox,ic_conf,ic_weight,ic_bin
0,1,1,278,26153,525068,18380883,F,5.292,4.53798,6.1713,...,0.50805,1.0,Low,0.65038,0.25846,0.0,High,3.46231,0.97994,1.0
1,5,1,15,494,525331,18406542,F,1.06202,0.55154,2.04497,...,0.12964,1.0,Moderate,0.16328,0.12896,1.0,Moderate,0.15028,0.12641,1.0
2,6,1,0,12,525346,18407024,F,,,,...,,,Undefined,,,,Undefined,,,
3,7,1,33,14188,525313,18392848,F,0.1966,0.12562,0.30768,...,0.99843,0.0,Minimal,5.03491,0.99871,0.0,Minimal,1.1768,0.46788,0.0
4,8,1,1411,214476,523935,18192560,F,18.319139,17.10734,19.61677,...,0.67211,1.0,High,2.70719,0.92834,1.0,High,10.01117,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,2951,1,2,134,139128,3826700,Unknown,0.41919,0.06874,2.55621,...,,,Undefined,,,,Undefined,,,
4454,2961,1,0,13,139130,3826821,Unknown,,,,...,,,Undefined,,,,Undefined,,,
4455,2962,1,2,207,139128,3826627,Unknown,0.27277,0.04452,1.67119,...,,,Undefined,,,,Undefined,,,
4456,2964,1,4,600,139126,3826234,Unknown,0.18875,0.05229,0.68136,...,0.57912,0.0,Minimal,1.46396,0.59293,0.0,Minimal,0.40005,0.18332,0.0


### Step 10

In [None]:
"""
In step 10 the results from DPA are aggregated and datatypes assigned. Generally, a lot of data manipulation.
"""

In [2]:
# Output roots for both step-10 variants.
for step_dir in ('../data/intermediate/step_10a', '../data/intermediate/step_10b'):
    os.makedirs(step_dir, exist_ok=True)

In [3]:
drop_unknown = True
dpa_metrics = ['prr', 'ror', 'ic']

# The 'a' and 'b' variants go through exactly the same aggregation; only the
# step_09*/step_10* directories differ, so a single loop handles both.
for variant in ('a', 'b'):
    input_root = f'../data/intermediate/step_09{variant}'
    output_root = f'../data/intermediate/step_10{variant}'

    for dpa_metric in dpa_metrics:

        for directory in sorted(glob.glob(f'{input_root}/*')):
            pt_type = os.path.basename(directory)

            frames = []

            # Same 'strat_*.joblib' pattern for both variants (these are the
            # file names process_step_9 writes).
            for file in sorted(glob.glob(f'{input_root}/{pt_type}/strat_*.joblib')):
                # splitext drops exactly the extension; str.rstrip('.joblib')
                # strips a character set and can eat trailing letters.
                strat_type = os.path.splitext(os.path.basename(file))[0].split('_')[-1]

                df = joblib.load(file)
                df = step_10(
                    df=df,  # data
                    strat_type=strat_type,  # type of stratification
                    pt_type=pt_type,
                    drop_unknown=drop_unknown,
                    dpa_metric=dpa_metric
                )
                frames.append(df)

            # One table per (pt_type, dpa_metric), covering all stratifications.
            combined = pd.concat(frames, ignore_index=True)
            combined = combined.sort_values(by=['Sex', 'Age', 'Weight', 'smi_enc']).reset_index(drop=True)

            os.makedirs(f'{output_root}/{pt_type}', exist_ok=True)
            joblib.dump(combined, f'{output_root}/{pt_type}/carbide_{dpa_metric}.joblib')

In [9]:
# Sanity check: show the aggregated step-10 table for the 'Card' directory and the 'ic' metric.
joblib.load('../data/intermediate/step_10a/Card/carbide_ic.joblib')

Unnamed: 0,Strat,Sex,Age,Weight,smi_enc,ic,ic_lower,ic_upper,ic_tox,ic_bin,ic_conf,ic_weight,ic_smooth,a,b,c,d
0,saw,Male,Children,Low,84.0,-1.08191,-2.07523,-0.31675,Minimal,0.0,0.59147,0.23911,0.38044,18,990,1554,38424
1,saw,Male,Children,Low,106.0,-0.71735,-3.54029,0.81712,Low,0.0,0.19437,0.13522,0.43239,3,134,1569,39280
2,saw,Male,Children,Low,114.0,1.01196,-0.85457,2.20970,Moderate,1.0,0.32828,0.16518,0.58259,6,65,1566,39349
3,saw,Male,Children,Low,186.0,-0.66964,-1.47965,-0.01814,Minimal,0.0,0.55987,0.22915,0.38542,26,1060,1546,38354
4,saw,Male,Children,Low,189.0,0.65200,-0.92521,1.72313,Moderate,1.0,0.30488,0.15959,0.57980,8,120,1564,39294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28171,n,Unknown,Unknown,Unknown,2947.0,0.39630,0.07554,0.68885,High,1.0,1.02627,0.40293,0.70146,147,7036,1153214,34598144
28172,n,Unknown,Unknown,Unknown,2951.0,1.40507,1.02621,1.74520,High,1.0,1.64841,0.66826,0.83413,107,1136,1153254,34604044
28173,n,Unknown,Unknown,Unknown,2962.0,-1.53024,-2.35794,-0.86733,Minimal,0.0,0.82983,0.32332,0.33834,25,2243,1153336,34602937
28174,n,Unknown,Unknown,Unknown,2964.0,-1.28236,-2.01787,-0.67996,Minimal,0.0,0.84634,0.3297,0.33515,31,2329,1153330,34602851


### Step 11

In [None]:
"""
In step 11 the following operations are performed:
- decoding of SMILES indices
- encoding of demographic factors as numpy arrays (yes, again)
- concatenation of demographic arrays
- concatenation of demographic factors
- packing of values from DPA analysis
- column renaming, removal, and casting
"""

In [51]:
# Output roots for both step-11 variants.
for step_dir in ('../data/intermediate/step_11a', '../data/intermediate/step_11b'):
    os.makedirs(step_dir, exist_ok=True)

# Index -> SMILES lookup, handed to step_11 as `idx_2_smi`.
idx_2_smi = joblib.load('../data/mappings/idx_2_smi.pkl')

In [52]:
# The 'a' and 'b' variants are processed identically; only the
# step_10*/step_11* directories differ.
for variant in ('a', 'b'):
    input_files = sorted(glob.glob(f'../data/intermediate/step_10{variant}/*/carbide*.joblib'))
    output_root = f'../data/intermediate/step_11{variant}'

    # tqdm takes the total from the list length, so the progress bar stays
    # correct when the number of input files changes.
    for file in tqdm(input_files):
        pt_type = os.path.basename(os.path.dirname(file))
        # Parse the metric from the file name only. splitext drops exactly the
        # extension, whereas str.rstrip('.joblib') strips a character set.
        dpa_type = os.path.splitext(os.path.basename(file))[0].split('_')[-1]

        df = joblib.load(file)
        df = step_11(
            df=df,  # data
            smiles_col='smi_enc',  # column with smiles encoded as ints
            idx_2_smi=idx_2_smi,  # idx to smiles mapping
            dpa_metric=dpa_type
        )

        os.makedirs(f'{output_root}/{pt_type}', exist_ok=True)
        out_path = f'{output_root}/{pt_type}/carbide_{dpa_type}.joblib'
        joblib.dump(df, out_path)

100%|██████████| 9/9 [00:09<00:00,  1.05s/it]
100%|██████████| 9/9 [00:14<00:00,  1.58s/it]


In [54]:
# Sanity check: show the step-11 table for the 'Card' directory and the 'ic' metric.
joblib.load('../data/intermediate/step_11a/Card/carbide_ic.joblib')

Unnamed: 0,SMILES,Signature,Risk,Label,Label_confidence,Label_weight,Label_regression,DPA_confusion,DPA_values,DemoFP,Stratification
0,C=CC(N)CCC(=O)O,"(Male, Children, Low)",Minimal,0,0.59147,0.23911,0.38044,"[18, 990, 1554, 38424]","[-1.08191, -2.07523, -0.31675]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw
1,C=CCN1CCC23c4c5ccc(O)c4OC2C(=O)CCC3(O)C1C5,"(Male, Children, Low)",Low,0,0.19437,0.13522,0.43239,"[3, 134, 1569, 39280]","[-0.71735, -3.54029, 0.81712]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw
2,C=CC[N+]1(C2CC3C4CCC5CC(O)C(N6CCOCC6)CC5(C)C4C...,"(Male, Children, Low)",Moderate,1,0.32828,0.16518,0.58259,"[6, 65, 1566, 39349]","[1.01196, -0.85457, 2.2097]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw
3,CC(=O)Nc1ccc(O)cc1,"(Male, Children, Low)",Minimal,0,0.55987,0.22915,0.38542,"[26, 1060, 1546, 38354]","[-0.66964, -1.47965, -0.01814]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw
4,CC(=O)Nc1cccc(-n2c(=O)n(C3CC3)c(=O)c3c(Nc4ccc(...,"(Male, Children, Low)",Moderate,1,0.30488,0.15959,0.5798,"[8, 120, 1564, 39294]","[0.652, -0.92521, 1.72313]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw
...,...,...,...,...,...,...,...,...,...,...,...
28171,S=c1nc[nH]c2nc[nH]c12,"(Unknown, Unknown, Unknown)",High,1,1.02627,0.40293,0.70146,"[147, 7036, 1153214, 34598144]","[0.3963, 0.07554, 0.68885]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n
28172,c1cc(CN2CCCNCCNCCCNCC2)ccc1CN1CCCNCCNCCCNCC1,"(Unknown, Unknown, Unknown)",High,1,1.64841,0.66826,0.83413,"[107, 1136, 1153254, 34604044]","[1.40507, 1.02621, 1.7452]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n
28173,c1ccc2c(CC3=NCCN3)cccc2c1,"(Unknown, Unknown, Unknown)",Minimal,0,0.82983,0.32332,0.33834,"[25, 2243, 1153336, 34602937]","[-1.53024, -2.35794, -0.86733]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n
28174,c1ccc2c(c1)CCCC2C1=NCCN1,"(Unknown, Unknown, Unknown)",Minimal,0,0.84634,0.3297,0.33515,"[31, 2329, 1153330, 34602851]","[-1.28236, -2.01787, -0.67996]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n


### Step 12

In [None]:
"""
In step 12, unique SMILES are clustered and assigned to folds.
Final datasets are prepared.
"""

In [63]:
# Prepare the folds mapping

# Not referenced in this cell; kept so the notebook namespace is unchanged.
variants = ['Cvas', 'Card', 'Cred']

# Collect every distinct SMILES that occurs in any step-11 table (both variants).
smis = set()
for file in tqdm(sorted(glob.glob('../data/intermediate/step_11*/*/carbide*.joblib'))):
    # List form of `subset` is accepted by every pandas version.
    df = joblib.load(file).dropna(subset=['SMILES'])
    # In-place update avoids rebuilding the whole set on every file.
    smis.update(df['SMILES'].tolist())

# Sorted so the row order (and therefore the split) is reproducible.
smis = sorted(smis)
df = pd.DataFrame({'SMILES': smis})

df = dataframe_2_morgan(df, radius=2, nbits=4096)
df = butina_cluster(df, threshold=0.75)

# GroupKFold keeps all members of one cluster in the same fold.
gkf = GroupKFold(n_splits=5)

x_values = np.vstack(df.Morgan.to_numpy())
y_values = np.zeros(shape=(len(df),))
groups = df.Cluster_ID.to_numpy()

# gkf.split yields *positional* indices, so they are written into a plain
# array; label-based df.loc would only be correct while df has a 0..n-1 index.
folds = np.full(len(df), -1, dtype='int64')
for fold, (_, test_idx) in enumerate(gkf.split(x_values, y_values, groups)):
    folds[test_idx] = fold

assert (folds >= 0).all(), 'every SMILES must be assigned to a fold'
df['Fold'] = folds

df = df.drop(columns='Morgan')

df.to_csv('../data/mappings/data_splits.tsv', sep='\t', header=True, index=False)

100%|██████████| 18/18 [00:03<00:00,  5.65it/s]


In [64]:
# Show the SMILES / Cluster_ID / Fold mapping built in the previous cell.
df

Unnamed: 0,SMILES,Cluster_ID,Fold
0,Brc1c(NC2=NCCN2)ccc2nccnc12,24,1
1,C#CC(O)(C=CCl)CC,233,4
2,C#CC1(O)C=CC2C3CCC4=CC(=O)CCC4C3CCC21CC,72,1
3,C#CC1(O)CCC2C3C(C)CC4=C(CCC(=O)C4)C3CCC21C,72,1
4,C#CC1(O)CCC2C3CCC4=CC(=NO)CCC4C3CCC21CC,72,1
...,...,...,...
1983,c1ccc2c(c1)Nc1ccccc1S2,150,0
1984,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,2,2
1985,c1ccc2cccc-2cc1,90,1
1986,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,62,4


In [69]:
os.makedirs('../data/carbide/primary', exist_ok=True)
os.makedirs('../data/carbide/secondary', exist_ok=True)
fold_df = read_df('../data/mappings/data_splits.tsv')

# Which final dataset each step-11 variant feeds.
dataset_types = {'step_11a': 'primary', 'step_11b': 'secondary'}

# tqdm takes the total from the list length, so no hard-coded file count is needed.
for file in tqdm(sorted(glob.glob('../data/intermediate/step_11*/*/carbide*.joblib'))):
    pt_dir = os.path.dirname(file)
    step_type = os.path.basename(os.path.dirname(pt_dir))
    pt_type = os.path.basename(pt_dir)
    # splitext drops exactly the extension; str.rstrip('.joblib') strips a
    # character set and can eat trailing letters of the metric name.
    dpa_type = os.path.splitext(os.path.basename(file))[0].split('_')[-1]

    # Fail loudly on an unexpected 'step_11*' directory instead of passing
    # dataset_type=None on and writing to '../data/carbide/None/...'.
    if step_type not in dataset_types:
        raise ValueError(f'Unexpected step directory {step_type!r} for file {file!r}')
    dataset_type = dataset_types[step_type]

    df = joblib.load(file)

    df = step_12(
        df=df,
        fold_df=fold_df,
        smiles_col='SMILES',
        dataset_type=dataset_type,
        dpa_type=dpa_type,
        pt_set=pt_type)

    os.makedirs(f'../data/carbide/{dataset_type}/{pt_type}', exist_ok=True)
    out_path = f'../data/carbide/{dataset_type}/{pt_type}/carbide_{dpa_type}.joblib'
    joblib.dump(df, out_path)

100%|██████████| 18/18 [00:08<00:00,  2.16it/s]


In [70]:
# Sanity check: show the final primary dataset for the 'Card' directory and the 'ic' metric.
joblib.load('../data/carbide/primary/Card/carbide_ic.joblib')

Unnamed: 0,SMILES,Signature,Risk,Label,Label_confidence,Label_weight,Label_regression,DPA_confusion,DPA_values,DemoFP,Stratification,Cluster_ID,Fold
0,C=CC(N)CCC(=O)O,"(Male, Children, Low)",Minimal,0,0.59147,0.23911,0.38044,"[18, 990, 1554, 38424]","[-1.08191, -2.07523, -0.31675]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw,0,0
1,C=CCN1CCC23c4c5ccc(O)c4OC2C(=O)CCC3(O)C1C5,"(Male, Children, Low)",Low,0,0.19437,0.13522,0.43239,"[3, 134, 1569, 39280]","[-0.71735, -3.54029, 0.81712]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw,11,0
2,C=CC[N+]1(C2CC3C4CCC5CC(O)C(N6CCOCC6)CC5(C)C4C...,"(Male, Children, Low)",Moderate,1,0.32828,0.16518,0.58259,"[6, 65, 1566, 39349]","[1.01196, -0.85457, 2.2097]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw,74,1
3,CC(=O)Nc1ccc(O)cc1,"(Male, Children, Low)",Minimal,0,0.55987,0.22915,0.38542,"[26, 1060, 1546, 38354]","[-0.66964, -1.47965, -0.01814]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw,28,3
4,CC(=O)Nc1cccc(-n2c(=O)n(C3CC3)c(=O)c3c(Nc4ccc(...,"(Male, Children, Low)",Moderate,1,0.30488,0.15959,0.5798,"[8, 120, 1564, 39294]","[0.652, -0.92521, 1.72313]","[1, 0, 1, 0, 0, 0, 1, 0, 0]",saw,158,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28171,S=c1nc[nH]c2nc[nH]c12,"(Unknown, Unknown, Unknown)",High,1,1.02627,0.40293,0.70146,"[147, 7036, 1153214, 34598144]","[0.3963, 0.07554, 0.68885]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n,29,2
28172,c1cc(CN2CCCNCCNCCCNCC2)ccc1CN1CCCNCCNCCCNCC1,"(Unknown, Unknown, Unknown)",High,1,1.64841,0.66826,0.83413,"[107, 1136, 1153254, 34604044]","[1.40507, 1.02621, 1.7452]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n,52,4
28173,c1ccc2c(CC3=NCCN3)cccc2c1,"(Unknown, Unknown, Unknown)",Minimal,0,0.82983,0.32332,0.33834,"[25, 2243, 1153336, 34602937]","[-1.53024, -2.35794, -0.86733]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n,24,1
28174,c1ccc2c(c1)CCCC2C1=NCCN1,"(Unknown, Unknown, Unknown)",Minimal,0,0.84634,0.3297,0.33515,"[31, 2329, 1153330, 34602851]","[-1.28236, -2.01787, -0.67996]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",n,24,1


### Descriptors

In [5]:
from copy import deepcopy
from src.data.descriptors import *
"""
* Calculating CDDD embeddings requires setting up a separate environment. See src/cddd for setup scripts,
or visit https://github.com/jrwnter/cddd.
* Calculating ChemBERTa embeddings requires torch and transformers libraries
"""

# JSON file handed to dataframe_2_cddd as `path_source`.
cddd_path = '../../cddd/files/CDDD_paths.json'
# File handed to dataframe_2_klek as `source`.
klek_path = '../src/files/klek_mols.joblib' # List[mol]

In [3]:
# Only the SMILES column of the fold mapping is needed; every descriptor table below is computed from it.
smi_df = read_df('../data/mappings/data_splits.tsv')[['SMILES']]

In [6]:
# Each dataframe_2_* call gets its own deep copy of smi_df, so no call can see
# columns added by another one.
chemberta = dataframe_2_chemberta(deepcopy(smi_df))
maccs = dataframe_2_maccs(deepcopy(smi_df))
klek = dataframe_2_klek(deepcopy(smi_df), source=klek_path)
rdkit = dataframe_2_rdkit(deepcopy(smi_df))
morgan = dataframe_2_morgan(deepcopy(smi_df), radius=2, nbits=1024)
# Use the notebook-wide worker count (n_jobs, set in the first cell) instead
# of a second hard-coded 16.
cddd = dataframe_2_cddd(deepcopy(smi_df), path_source=cddd_path, n_cpus=n_jobs)

Consider installing the package zmq to utilize the InferenceServer class


2025-12-03 22:09:56.891258: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA


In [7]:
# Join all descriptor tables on SMILES, one after another, starting from MACCS.
desc = maccs
for descriptor_table in (klek, rdkit, morgan, cddd, chemberta):
    desc = desc.merge(descriptor_table, on='SMILES')

joblib.dump(desc, '../data/carbide/descriptors.joblib')

['../data/carbide/descriptors.joblib']