# Data curation

In [1]:
# to do
# 0. combine train and test set - done
# 1. canonicalize smiles from Molecule - done
# 2. standardise molecules - done
# 3. extract MorganFP - done
# 4. calculate rdkit descriptors - done
# 5. export pandas dataframe as .csv, split train, test back - done

In [2]:
# 2 csvs: Train and test - done
# Columns: smiles, (example_...), (ecfp_...), (rdkit_desc_...) - done
# Export csv to Temp

In [3]:
# Remove columns Index, Id and molecule
# reducing copy DataFrames

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import cohen_kappa_score
from numpy import linspace

In [6]:
# suppress Convergence and user Warnings

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

## Parameters

#### specifying data set features

In [8]:
# True: complete data set with MorganFP and rdkit descriptors
# False: complete data set only with MorganFP

AllData = True
#AllData = False

In [12]:
# True: add example data to the data set

use_example = True
#use_example = False

#### test run

In [9]:
# True: first 10 rows of the data
# False: everything

test_run = False
#test_run = True


#### removing columns with unique values

In [None]:
# True: deletes all columns with unique values
# False: keeps the data set without removing any columns

delun = False
#delun = True

#### specifying radius of morgan finger prints

In [10]:
mfp_r = 1

#### modelling test run

In [11]:
#test_run2 = True
test_run2 = False

#### save prediction as csv file

In [13]:
# True: save data sets as csv files

save = True
save = False

#### specifying usage of StandardScaler during modeling

In [14]:
StdSca = True
#StdSca = False

## Function collection

#### Renaming function

In [15]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

#### Molecule standardizer

In [16]:
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

#### MorganFP generator

In [17]:
def MFP_generator(mol):
    #mfp_r = 3
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=mfp_r, nBits=4096))

#### Descriptor calculator

In [18]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

#### Descriptor name generator

In [19]:
def desc_name_generator():
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][0])
    return desc_list

## 0. Reading data files from csv

#### reading training data set

In [20]:
df_train = pd.read_csv("Data/train.csv")

In [21]:
df_train["sol_category"].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

#### reading example predictor data set

In [23]:
example = pd.read_csv("Data/example_predictors.csv")
example.drop(columns='smiles', inplace=True)

df_example = rename(example, 'example_')

df_example.rename(columns={df_example.columns[0] : 'Id'}, inplace=True)

#### reading testing set

In [25]:
df_test = pd.read_csv("Data/test.csv")

# dummy value for the solubility category
df_test["sol_category"] = 5

## 1. Combining train and test set

In [27]:
# partition (top 10)
if test_run:
    df_comb = pd.DataFrame(df_train.append(df_test))
    df_combined = df_comb.head(10)

# full data set
else:
    df_combined = pd.DataFrame(df_train.append(df_test))

# resetting index
df_combined = df_combined.reset_index()

  df_combined = pd.DataFrame(df_train.append(df_test))


## 2. Canonicalizing smiles from molecules

In [29]:
# creating molecules from smiles
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# creating canonical smiles from molecules
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

## 3. Standardising molecules

In [30]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

## 4. Extraction of MorganFP and rdkit descriptors

In [32]:
df_combined_copy = df_combined.copy(deep=True)

#### generating MorganFP

In [33]:
df_combined_copy["FP"] = df_combined_copy["molecule"].apply(MFP_generator)
df_combined_copy

Unnamed: 0,index,Id,smiles,sol_category,molecule,FP
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x13a75f190>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x13a75f040>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x13a75f200>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x13a75f270>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x13a75f120>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
101013,30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,5,<rdkit.Chem.rdchem.Mol object at 0x13b254dd0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101014,30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,5,<rdkit.Chem.rdchem.Mol object at 0x13b254e40>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101015,30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,5,<rdkit.Chem.rdchem.Mol object at 0x13b254eb0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101016,30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,5,<rdkit.Chem.rdchem.Mol object at 0x13b254f20>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### reformating morgan finger print bits into seperate columns

In [34]:
# generate a Numpy array of sample size and fps
fp_arr = np.stack(df_combined_copy["FP"])

#convert to df
df_fp = pd.DataFrame(fp_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101014,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
101015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Descriptors

In [35]:
if AllData:
    df_combined_copy["desc"] = df_combined_copy["molecule"].apply(desc_generator)
    df_combined_copy

In [36]:
df_desc = pd.DataFrame()
if AllData:
    # generate a Numpy array of sample size and fps
    desc_arr = np.stack(df_combined_copy["desc"])

    #convert to df
    df_desc = pd.DataFrame(desc_arr)

#### resetting index and labeling, optional : caculating descriptors

In [37]:
# adding MorganFP
df_fp = df_fp.reset_index(drop=True)
df_fp = rename(df_fp, 'ecfp_')

# adding rdkit descriptors
if AllData:
    df_desc = df_desc.reset_index(drop=True)
    df_desc.columns=desc_name_generator()
    df_desc = rename(df_desc, 'rdkit_desc_')
df_desc

Unnamed: 0,rdkit_desc_MaxEStateIndex,rdkit_desc_MinEStateIndex,rdkit_desc_MaxAbsEStateIndex,rdkit_desc_MinAbsEStateIndex,rdkit_desc_qed,rdkit_desc_MolWt,rdkit_desc_HeavyAtomMolWt,rdkit_desc_ExactMolWt,rdkit_desc_NumValenceElectrons,rdkit_desc_NumRadicalElectrons,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,13.751480,-0.805113,13.751480,0.046085,0.732517,399.405,380.253,399.161915,150.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,12.524029,0.234919,12.524029,0.234919,0.807124,323.506,294.274,323.203134,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,12.317465,-0.223009,12.317465,0.019061,0.899102,291.376,274.240,291.104148,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,12.357761,-0.382235,12.357761,0.149177,0.698158,364.942,339.742,364.137612,130.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.704152,-0.151369,12.704152,0.131009,0.750601,336.395,316.235,336.158626,128.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,8.792413,0.730413,8.792413,0.730413,0.835277,300.450,272.226,300.231397,120.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101014,12.599120,-0.572386,12.599120,0.115993,0.879151,317.389,294.205,317.173942,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101015,12.929550,-0.388142,12.929550,0.150145,0.715329,390.443,368.267,390.169191,148.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101016,12.916304,-0.513227,12.916304,0.025342,0.684949,362.429,340.253,362.163043,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 5. Combining data sets

In [38]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, df_fp], axis=1)

# Adding example descriptors
if use_example:
    #df_combined = pd.concat([df_combined, df_example], axis=1)
    df_combined = pd.merge(df_combined, df_example, on='Id', how='inner')

# complete data set by adding rdkit descriptors
if AllData:
    df_AllData = pd.concat([df_combined, df_desc], axis=1)
    df_AllData.drop(columns=['index','molecule'], inplace=True)


#### delete columns with unique values

In [39]:
if delun:
    if AllData:
        len_un = len(df_AllData.columns)
        df_AllData = df_AllData.loc[:,df_AllData.nunique()!=1]
        num_drop_unique = len_un - len(df_AllData.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')
    else:
        len_un = len(df_combined.columns)
        df_combined = df_combined.loc[:,df_AllData.nunique()!=1]
        num_drop_unique = len_un - len(df_combined.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')

"if AllData:\n    len_un = len(df_AllData.columns)\n    df_AllData = df_AllData.loc[:,df_AllData.nunique()!=1]\n    num_drop_unique = len_un - len(df_AllData.columns)\n    print(f'number of dropped collumns with identical values: {num_drop_unique}')\nelse:\n    len_un = len(df_combined.columns)\n    df_combined = df_combined.loc[:,df_AllData.nunique()!=1]\n    num_drop_unique = len_un - len(df_combined.columns)\n    print(f'number of dropped collumns with identical values: {num_drop_unique}')"

## 6. Separating train and test data sets

In [42]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

#### seperating test set with dummy value 5

In [43]:
# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Printing data files

In [44]:
print('df_train_set')
df_train_set

df_train_set


Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70706,EOS37839,O=C(NCCCc1nc(=O)[nH][nH]1)[C@H]1CCC(F)(F)C1,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70707,EOS2088,Cc1ccc(C(=O)NC2CCCC2)cc1S(=O)(=O)N1CCOCC1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70708,EOS10587,COCCN1CCC(CN(C)S(=O)(=O)c2cccc(C(F)(F)F)c2)C1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70709,EOS40533,O=C(Nc1ccc(F)cc1)NC1CCN(C(=O)Cc2cnn(-c3ccccc3)...,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
print('df_test_set')
df_test_set

df_test_set


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
70711,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70712,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70713,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70714,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70715,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101014,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101015,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101016,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Model

In [46]:
df = df_train_set
df['sol_category'] = pd.to_numeric(df['sol_category'], downcast='integer')

#### checking the composition of the data set

In [47]:
df['sol_category'].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

#### test run split randomly chooses 500 compounds when enables

In [48]:
if test_run2:
    df, _, __, ___ = train_test_split(df, df['sol_category'], train_size=500, stratify=df['sol_category'])

#### defining features X

In [49]:
X = df.copy(deep=True)
X.drop(columns=['Id', 'smiles', 'sol_category'], inplace=True)
#X

Unnamed: 0,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,ecfp_8,ecfp_9,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70706,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70707,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70708,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70709,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### defining category list y

In [50]:
y = df.iloc[:, 2]

#### adding features to list, allows to model multiple feature data set in succession

In [51]:
#all_dataset = list()
#all_dataset.append(X)

#### defining parameters for the xgboost model

In [52]:
# setup parameters for xgboost
params = {}
#params['booster'] = 'gbtree' #['gbtree', 'gblinear', 'dart']
#params['objective'] = ['binary:logistic']
#params["eval_metric"] = ["error"]
params['eta'] = 0.001 #, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5] # boosting learning rate
params['gamma'] = 0.5 #linspace(0.000000001, 1.0, num=11) # min loss red recuired for further partition on leaf node
params['max_depth'] = 7 #np.arange(1, 11, 2) # max tree dept for base learners
params['n_estimators'] = 100 #np.arange(50, 550, 50) maybe 250 ??
params['min_child_weight'] = 1 # min sum of instance weight in a child
params['max_delta_step'] = 0 # max delta step allowed for each tree's weight estimate
params['subsample']= 0.5 #[0.5, 1] # subsample ratio of training instance
params['colsample_bytree'] = 1 # subsample ratio of columns when cunstructing each tree
#params['silent'] = [1]
#params['seed'] = [0] # = random_state ???
params['base_score'] = 0.5 # initial prediction score, global bias
#params['random_state'] = [0] # = seed ???
#params['scale_pos_weight'] = ratio
params['n_jobs'] = 5

#### applying model on feature data set

In [53]:

# applying StandardScaler
if StdSca:
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
    X = scaler.fit_transform(X)

# splitting data set
StratifiedKFold(n_splits=5)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1,  shuffle=True, stratify=y)#, test_size=0.2, train_size=0.8)

# defining class weights
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
    )

# defining model
model = XGBClassifier(**params)

# fitting model on training data set
model.fit(X_train, y_train, sample_weight=classes_weights)

# model validation
valPredictions = model.predict(X_val)

# calculating quadratically weighted kappa score
sk_quad_kappa = cohen_kappa_score(y_val, valPredictions, weights='quadratic')
    
 
# printing results
print(f'quadratically weighted kappa score: {sk_quad_kappa}')


quadratic kappa score of data set X: 0.08204952741968219


In [54]:
# r=3 : 0.0711755199241082 

#### de

In [55]:
df_test_set.reset_index(drop=True)
sub_template = pd.read_csv('Data/submission_template_rdm.csv')
#(sub_template['Id'] == df_test_set['Id']).value_counts()

In [56]:
df_test_set.drop(columns=['Id', 'smiles'], inplace=True)

# apply StandardScaler
if StdSca:
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(df_test_set)
    df_test_set = scaler.fit_transform(df_test_set)

# prediction
testPredictions = model.predict(df_test_set)
sub_template['pred'] = testPredictions

In [57]:
set(testPredictions)

{0, 1, 2}

In [58]:
sub_template['pred'].value_counts()

2    25574
1     2966
0     1767
Name: pred, dtype: int64

In [59]:
save = False

In [60]:
if save:
    sub_template.to_csv('Submissions/submission_20_12_C-lab.csv', index=False)
else:
    if test_run or test_run2:
        print('test run')
    else:
        print('unsaved run')

test run
