In [1]:
# Move one directory up; per the output below, the working directory becomes
# .../ads-code (presumably the repository root — confirm).
# NOTE(review): the relative `cd` is not idempotent — re-running this cell moves
# up another level, so restart the kernel before running it again.
%cd ".."
%pwd

/home/martijn_sturm/repos/ads-code


'/home/martijn_sturm/repos/ads-code'

In [2]:
import polars
from src import read, columns

In [3]:
# Load the "molecule_dictionary" table through the project's `read` helper.
mol_info = read.read_csv("molecule_dictionary")

In [4]:
# Total number of rows (molecules) in the molecule dictionary.
n = mol_info.height
n

2354965

In [5]:
# List the available columns to see which fields can identify approved compounds.
mol_info.columns

['molregno',
 'pref_name',
 'chembl_id',
 'max_phase',
 'therapeutic_flag',
 'dosed_ingredient',
 'structure_type',
 'chebi_par_id',
 'molecule_type',
 'first_approval',
 'oral',
 'parenteral',
 'topical',
 'first_in_class',
 'chirality',
 'prodrug',
 'inorganic_flag',
 'usan_year',
 'availability_type',
 'usan_stem',
 'polymer_flag',
 'usan_substem',
 'usan_stem_definition',
 'indication_class',
 'withdrawn_flag']

## Approved compounds

In ChEMBL, approved compounds have a `max_phase` value of 4. The cells below, however, select approved compounds as those with a non-null `first_approval` year.

In [6]:
# Molecules without a `first_approval` value are counted as not approved.
first_approval_nulls = mol_info.select("first_approval").null_count()
n_not_approved = first_approval_nulls["first_approval"][0]
n_not_approved

2351748

In [7]:
# Every row that does have a `first_approval` value counts as approved.
n_approved = n - n_not_approved
n_approved

3217

In [8]:
# Keep only the molecules that have a recorded first-approval year.
has_approval_year = polars.col("first_approval").is_not_null()
approved_molecules = mol_info.filter(has_approval_year)
approved_molecules.height

3217

We write the primary keys (`molregno`) of the approved molecules to a text file so that they can be reused later.

In [9]:
# Persist the `molregno` keys of the approved molecules, one key per line.
approved_mol_primary_keys = approved_molecules.get_column("molregno").to_list()
# An explicit encoding keeps the written file independent of the platform's
# locale default; a generator avoids building a throwaway list of lines.
with open("data/approved_mol_primary_keys.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{mol}\n" for mol in approved_mol_primary_keys)

Distribution of approved compounds per year

In [10]:
# Count approvals per year, then sort the resulting table by year.
# `value_counts` without `sort=True` makes no promise about row order, so
# sorting the input series beforehand does not reliably order the output;
# sorting the result on the year column does.
approved_molecules.get_column("first_approval").value_counts().sort("first_approval")

first_approval,counts
i64,u32
1939,9
1940,2
1941,3
1942,3
1943,5
1944,1
1945,6
1946,13
1947,6
1948,15


## Create approved compound properties dataset

In [11]:
# Load the "compound_properties" table and keep only the join key (`molregno`)
# plus the property columns listed in `columns.properties_table`.
# (The former bare `columns.properties_table` statement was a no-op: it was not
# the last expression of the cell, so it was never displayed.)
properties = read.read_csv("compound_properties").select(["molregno", *columns.properties_table])
type(properties)

polars.dataframe.frame.DataFrame

In [12]:
# Attach the property columns to each approved molecule. The left join keeps
# every approved molecule even when no properties row matches its `molregno`.
approved_years = approved_molecules.select(["molregno", "first_approval"])
approved_properties = approved_years.join(properties, on="molregno", how="left")
approved_properties.columns

['molregno',
 'first_approval',
 'mw_freebase',
 'full_mwt',
 'mw_monoisotopic',
 'alogp',
 'cx_logp',
 'psa',
 'hba',
 'hbd',
 'hba_lipinski',
 'hbd_lipinski',
 'aromatic_rings',
 'rtb',
 'heavy_atoms',
 'num_lipinski_ro5_violations']

In [13]:
# Row count after the join, for comparison with the number of approved molecules.
approved_properties.height

3217

In [14]:
# Preview the first rows of the joined table.
approved_properties.head()

molregno,first_approval,mw_freebase,full_mwt,mw_monoisotopic,alogp,cx_logp,psa,hba,hbd,hba_lipinski,hbd_lipinski,aromatic_rings,rtb,heavy_atoms,num_lipinski_ro5_violations
i64,i64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
97,1976,383.41,383.41,383.1594,1.78,1.65,106.95,8,1,9,2,3,4,28,0
115,1984,162.24,162.24,162.1157,1.85,1.16,16.13,2,0,2,0,1,1,12,0
146,1990,361.37,361.37,361.1438,1.54,0.51,75.01,6,1,7,1,2,2,26,0
147,1964,232.24,232.24,232.0848,1.42,0.79,72.19,4,1,5,1,2,2,17,0
173,1965,357.79,357.79,357.0768,3.93,3.53,68.53,4,1,5,1,3,4,25,0


In [18]:
# Number of missing values per column; rows with missing properties are
# dropped later before the analysis.
approved_properties.null_count()

molregno,first_approval,mw_freebase,full_mwt,mw_monoisotopic,alogp,cx_logp,psa,hba,hbd,hba_lipinski,hbd_lipinski,aromatic_rings,rtb,heavy_atoms,num_lipinski_ro5_violations
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,391,391,391,707,707,707,707,707,707,707,707,707,707,707


In [15]:
# Save the joined table (including rows with missing properties) as Parquet.
# The path is relative to the current working directory.
approved_properties.write_parquet("data/approved_properties.parquet")

## Analysis

In [19]:
# Analysis frame: drop the identifier column, then discard every row that
# still contains a missing value.
without_key = approved_properties.drop("molregno")
df = without_key.drop_nulls()

In [21]:
# The approval year is the target; every other column is treated as a feature.
# `feature_cols` is a set, so it carries no column order.
target_col = "first_approval"
feature_cols = set(df.columns).difference({target_col})



TypeError: corrcoef() got an unexpected keyword argument 'method'