In [11]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
from sklearn.feature_selection import RFE,SelectFromModel,SelectKBest,chi2
from tqdm import tqdm

from sklearn.metrics import median_absolute_error as MAE
  
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, PandasTools, Descriptors


from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.evaluate import PredefinedHoldoutSplit
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif


In [1]:

#Script for generating features with rdkit.chem Descriptors, Morgan Fingerprints, circular finger prints and preprocessing the raw data

# Loading Train Dataset
train_data = pd.read_csv("train_II.csv")


# Loading Test Dataset
test_data = pd.read_csv("test_II.csv")

# New train dataframe after splitting the 'Id' field

splitted_train_data = train_data["Id"].str.split(";", n = -1, expand = True) 
train_data["c_id"]= splitted_train_data[0] 
train_data["assay_id"]= splitted_train_data[1] 
train_data.head(n=10)

train_data["assay_id"] = train_data["assay_id"].astype(int)


#  New test dataframe after splitting the 'Id' field 
splitted_test_data = test_data["x"].str.split(";", n = -1, expand = True)
test_data["c_id"]= splitted_test_data[0] 
test_data["assay_id"]= splitted_test_data[1] 
test_data.head(n=10)

test_data["assay_id"] = test_data["assay_id"].astype(int)


# Rename x to Id to match with train data

test_data = test_data.rename(columns={'x': 'Id'})

def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
#     return Mol_descriptors,desc_names 
    df_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names)
    return df_descriptors



train_descriptors = RDkit_descriptors(train_data['c_id'])
test_descriptors = RDkit_descriptors(test_data['c_id'])

train_data = pd.concat([train_data, train_descriptors], axis=1)
test_data = pd.concat([test_data, test_descriptors], axis=1)


def morgan_fpts(data):
    Morgan_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i) 
        if mol is not None:
            fpts = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
            mfpts = np.array(fpts)
            Morgan_fpts.append(mfpts)
    return np.array(Morgan_fpts)


Morgan_fpts = morgan_fpts(train_data['c_id'])
Morgan_fpts2 = morgan_fpts(test_data['c_id'])

Morgan_fingerprints = pd.DataFrame(Morgan_fpts,columns=['Col_{}'.format(i) for i in range(Morgan_fpts.shape[1])])

Morgan_fingerprints2 = pd.DataFrame(Morgan_fpts2,columns=['Col_{}'.format(i) for i in range(Morgan_fpts2.shape[1])])

train_data = pd.concat([train_data, Morgan_fingerprints], axis=1)

test_data = pd.concat([test_data, Morgan_fingerprints2], axis=1)


from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import AllChem

def CircularFingerprint_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles]
    fp_size = 1024  # Fingerprint size
    fp_radius = 2   # Fingerprint radius
    fps = []
    for mol in mols:
        if mol:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, fp_radius, nBits=fp_size)
            fps.append(fp)
        else:
            fps.append(None)
    arr = []
    for fp in fps:
        if fp is None:
            arr.append(np.zeros(fp_size))
        else:
            arr.append(np.array(fp))
    df_fps = pd.DataFrame(arr, columns=[f'bit_{i}' for i in range(fp_size)])
    return df_fps

    
train_circular_fp = CircularFingerprint_descriptors(train_data['c_id'])
train_data = pd.concat([train_data, pd.DataFrame(train_circular_fp)], axis=1)

test_circular_fp = CircularFingerprint_descriptors(test_data['c_id'])
test_data = pd.concat([test_data, pd.DataFrame(test_circular_fp)], axis=1)


# Storing features generated from RdKIt, morgan finger prints and circular fingerprints as features1
train_data.to_csv('train_data_features1.csv', index=False)
test_data.to_csv('test_data_features1.csv', index=False)

[11:07:59] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:08:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:08:10] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:08:11] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:08:16] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:08:19] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:53:40] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:54:12] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:54:45] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:54:55] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:55:28] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:55:46] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:56:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:56:25] Explicit valence for atom #

In [2]:
train_data

Unnamed: 0,Id,Expected,c_id,assay_id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,...,bit_1014,bit_1015,bit_1016,bit_1017,bit_1018,bit_1019,bit_1020,bit_1021,bit_1022,bit_1023
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,1644,8.874650,-2.987783,8.874650,0.765809,0.794714,317.599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCCCCCCCC(=O)C;2451,2,CCCCCCCCC(=O)C,2451,11.875100,-4.547647,11.875100,2.480493,0.516641,156.269,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1384,8.736945,-6.030543,8.736945,0.000000,0.251327,362.086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,16,10.885281,-3.359276,10.885281,0.170399,0.487998,255.665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[Na+].[I-];1856,2,[Na+].[I-],1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N;33,2,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N,33,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75379,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C;1632,1,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C,1632,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75380,C1=CC=C2C(=C1)NC(=S)S2;1373,1,C1=CC=C2C(=C1)NC(=S)S2,1373,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75381,CCCCC(CC)C=O;2,2,CCCCC(CC)C=O,2,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
test_data

Unnamed: 0,Id,c_id,assay_id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,bit_1014,bit_1015,bit_1016,bit_1017,bit_1018,bit_1019,bit_1020,bit_1021,bit_1022,bit_1023
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,...,0,0,0,0,0,0,0,0,0,0
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,...,0,0,0,0,0,1,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38,13.677486,-5.541970,13.677486,1.263580,0.391778,408.376,389.224,...,0,0,0,1,0,0,0,0,0,0
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34,12.722294,-3.388339,12.722294,0.917255,0.635519,320.344,304.216,...,0,0,0,0,0,0,0,0,0,0
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640,13.186679,-5.902467,13.186679,0.638923,0.705120,381.370,366.250,...,0,0,0,0,0,0,0,0,0,0
10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28,11.701188,-5.111667,11.701188,0.820825,0.540194,220.976,213.920,...,0,0,0,0,0,0,0,0,0,0


In [4]:
test_data.to_csv('test_data_features1.csv', index=False)

In [5]:
#Script to generate features using datal mol and sanitising the data

In [6]:
# Loading Train Dataset
train_data_mol = pd.read_csv("train_II.csv")


# Loading Test Dataset
test_data_mol = pd.read_csv("test_II.csv")

In [7]:
# New train dataframe after splitting the 'Id' field

splitted_train_data = train_data_mol["Id"].str.split(";", n = -1, expand = True) 
train_data_mol["c_id"]= splitted_train_data[0] 
train_data_mol["assay_id"]= splitted_train_data[1] 
train_data_mol.head(n=10)

train_data_mol["assay_id"] = train_data_mol["assay_id"].astype(int)


#  New test dataframe after splitting the 'Id' field 
splitted_test_data = test_data_mol["x"].str.split(";", n = -1, expand = True)
test_data_mol["c_id"]= splitted_test_data[0] 
test_data_mol["assay_id"]= splitted_test_data[1] 
test_data_mol.head(n=10)

test_data_mol["assay_id"] = test_data_mol["assay_id"].astype(int)

In [8]:
# Rename x to Id to match with train data

test_data_mol = test_data_mol.rename(columns={'x': 'Id'})

In [10]:

def _sanitize(row):
    try:
        mol = dm.to_mol(row['c_id'], ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(
            mol,
            disconnect_metals=False,
            normalize=True,
            reionize=True,
            uncharge=False,
            stereo=True,
        )
        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["selfies"] = dm.to_selfies(mol)
        row["inchi"] = dm.to_inchi(mol)
        row["inchikey"] = dm.to_inchikey(mol)
    except ValueError as e:
        print(f"Error processing molecule {row['c_id']}: {str(e)}")
        row["standard_smiles"] = None
        row["selfies"] = None
        row["inchi"] = None
        row["inchikey"] = None

    return row

In [16]:
test_data_mol = test_data_mol.apply(_sanitize, axis=1)
test_data_mol

train_data_mol = train_data_mol.apply(_sanitize, axis=1)
train_data_mol

In [17]:
#Finding values that Si+ atom and recording the faulty c_id

def nans(df): return df[df.isnull().any(axis=1)]

nans(train_data_mol)

Unnamed: 0,Id,Expected,c_id,assay_id,standard_smiles,selfies,inchi,inchikey
10135,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];1850,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],1850,,,,
26306,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];1855,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],1855,,,,
42332,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];2,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],2,,,,
47225,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];1857,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],1857,,,,
62942,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];1856,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],1856,,,,
72002,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+];1852,2,F[Si-2](F)(F)(F)(F)F.[Na+].[Na+],1852,,,,


In [26]:
train_data_mol["mol"] = train_data_mol["c_id"].apply(dm.to_mol)

train_data_mol["mol"].fillna(method='bfill', inplace=True)
mols = train_data_mol["mol"].tolist()

data_descriptors = dm.descriptors.batch_compute_many_descriptors(mols)
data_descriptors.info()

train_data_mol = pd.concat([train_data_mol, data_descriptors], axis=1)

train_data_mol.to_csv('train_data_datamol.csv', index = False)

[14:04:29] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:04:33] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:04:37] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:04:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:04:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:04:45] Explicit valence for atom # 1 Si, 8, is greater than permitted


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75383 entries, 0 to 75382
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mw                       75383 non-null  float64
 1   fsp3                     75383 non-null  float64
 2   n_lipinski_hba           75383 non-null  int64  
 3   n_lipinski_hbd           75383 non-null  int64  
 4   n_rings                  75383 non-null  int64  
 5   n_hetero_atoms           75383 non-null  int64  
 6   n_heavy_atoms            75383 non-null  int64  
 7   n_rotatable_bonds        75383 non-null  int64  
 8   n_radical_electrons      75383 non-null  int64  
 9   tpsa                     75383 non-null  float64
 10  qed                      75383 non-null  float64
 11  clogp                    75383 non-null  float64
 12  sas                      75383 non-null  float64
 13  n_aliphatic_carbocycles  75383 non-null  int64  
 14  n_aliphatic_heterocyle

In [27]:
test_data_mol["mol"] = test_data_mol["c_id"].apply(dm.to_mol)

mols_test = test_data_mol["mol"].tolist()

data_descriptors_test = dm.descriptors.batch_compute_many_descriptors(mols_test)

data_descriptors_test = pd.concat([test_data_mol, data_descriptors_test], axis=1)

test_data_mol = data_descriptors_test

test_data_mol.to_csv('test_data_datamol.csv', index = False)

In [28]:
#Merging all the features generated 

train_data = pd.read_csv("train_data_features1.csv")

test_data = pd.read_csv("test_data_features1.csv")

train_data_mol = pd.read_csv("train_data_datamol.csv")

test_data_mol = pd.read_csv("test_data_datamol.csv")

# 'n_lipinski_hbd', 'tpsa', 'n_aliphatic_carbocycles' - Feature selection was applied on train_data_mol separately to find that these three were the best features from the group
train_data['n_lipinski_hbd'] = train_data_mol['n_lipinski_hbd']
train_data['tpsa'] = train_data_mol['tpsa']
train_data['n_aliphatic_carbocycles'] = train_data_mol['n_aliphatic_carbocycles']

# 'n_lipinski_hbd', 'tpsa', 'n_aliphatic_carbocycles'
test_data['n_lipinski_hbd'] = test_data_mol['n_lipinski_hbd']
test_data['tpsa'] = test_data_mol['tpsa']
test_data['n_aliphatic_carbocycles'] = test_data_mol['n_aliphatic_carbocycles']



In [29]:
#Normalising the feautes generated

train_data_features = train_data.iloc[:, 4:]

test_data_features = test_data.iloc[:, 3:]


# scaler = MinMaxScaler()
scaler = MinMaxScaler()
train_data[train_data_features.columns] = scaler.fit_transform(train_data[train_data_features.columns])
test_data[test_data_features.columns] = scaler.transform(test_data[test_data_features.columns])
# test_data[Morgan_fingerprints2.columns] = scaler.fit_transform(test_data[Morgan_fingerprints2.columns])

train_data[train_data.columns[4:]] = np.log(train_data[train_data.columns[4:]] + 1)

test_data[test_data.columns[3:]] = np.log(test_data[test_data.columns[3:]] + 1)

train_data.to_csv('train_data_final&Scaled.csv', index=False)

test_data.to_csv('test_data_final&Scaled.csv', index=False)

In [19]:
# Loading pre stored Train Dataset
train_data = pd.read_csv("train_data_final&Scaled.csv")


# Loading pre stored Test Dataset
test_data = pd.read_csv("test_data_final&Scaled.csv")

In [37]:
selected_features = list(train_data.iloc[:, 3:].columns)

In [39]:
train_data[selected_features]

Unnamed: 0,assay_id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,bit_1017,bit_1018,bit_1019,bit_1020,bit_1021,bit_1022,bit_1023,n_lipinski_hbd,tpsa,n_aliphatic_carbocycles
0,1644,0.360552,0.467471,0.360552,0.144813,0.609565,0.158778,0.158568,0.158049,0.137530,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.076961,0.050699,0.0
1,2451,0.457990,0.387003,0.457990,0.408607,0.431695,0.072824,0.064566,0.072810,0.088879,...,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.021704,0.0
2,1384,0.355844,0.303991,0.355844,0.000000,0.226221,0.181236,0.162347,0.180976,0.202417,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,16,0.426884,0.448886,0.426884,0.034085,0.411439,0.126648,0.125964,0.126406,0.123465,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.039221,0.102137,0.0
4,1856,0.000000,0.605571,0.000000,0.000000,0.214673,0.069271,0.072507,0.069320,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,33,0.468808,0.343511,0.468808,0.052590,0.560143,0.113156,0.112040,0.113118,0.112069,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.113329,0.119202,0.0
75379,1632,0.331984,0.351692,0.331984,0.022285,0.565123,0.156810,0.153342,0.156545,0.143101,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.072382,0.0
75380,1373,0.316542,0.591129,0.316542,0.026383,0.485991,0.078919,0.079555,0.078824,0.065139,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.039221,0.020093,0.0
75381,2,0.447831,0.397429,0.447831,0.408886,0.433684,0.057093,0.050574,0.057082,0.071127,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.021704,0.0


In [42]:
# create feature and target variables
X = train_data[selected_features]
y = train_data['Expected']

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.18, random_state=42)

# Impute missing values with mean and convert back to a pandas dataframe
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

selected_features_SelectK = list(X_train_imputed.columns[X_train_imputed.columns.isin(X_train_imputed.columns[SelectKBest(mutual_info_classif, k=200).fit(X_train_imputed, y_train).get_support()])])

# print the selected features
# print('Selected features:', list(X_train_imputed.columns[X_train_imputed.columns.isin(X_train_imputed.columns[SelectKBest(mutual_info_classif, k=250).fit(X_train_imputed, y_train).get_support()])]))


In [6]:
selected_features_SelectK 

['bit_0',
 'bit_1',
 'bit_38',
 'bit_114',
 'bit_269',
 'bit_398',
 'bit_410',
 'bit_577',
 'bit_590',
 'bit_592',
 'bit_717',
 'bit_814',
 'bit_850',
 'bit_935',
 'bit_962',
 'assay_id',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPLOW',
 'Chi0n',
 'Chi1v',
 'Chi2v',
 'Chi3v',
 'Kappa1',
 'Kappa2',
 'LabuteASA',
 'SMR_VSA1',
 'SlogP_VSA5',
 'VSA_EState5',
 'VSA_EState6',
 'MolLogP',
 'MolMR',
 'fr_nitro_arom',
 'fr_methoxy',
 'fr_Al_COO',
 'fr_Al_OH_noTert',
 'fr_ArN',
 'fr_Ar_COO',
 'fr_Ar_N',
 'fr_COO',
 'fr_COO2',
 'fr_HOCCN',
 'fr_NH2',
 'fr_N_O',
 'fr_Ndealkylation2',
 'fr_Nhpyrrole',
 'fr_allylic_oxid',
 'fr_aryl_methyl',
 'fr_epoxide',
 'fr_ester',
 'fr_ether',
 'fr_furan',
 'fr_guanido',
 'fr_hdrzone',
 'fr_imidazole',
 'fr_isothiocyan',
 'fr_ketone',
 'fr_ketone_Topliss',
 'fr_morpholine',
 'fr_nitro_arom_nonortho',
 'fr_oxime',
 'fr_phos_ester',
 'fr_piperdine',
 'fr_prisulf

In [7]:
#Feature selection with sequential selector



# create feature and target variables
X = train_data[selected_features_SelectK]
y = train_data['Expected']

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.21, random_state=1)

# Setting up SFS for DecisionTree
clf = DecisionTreeClassifier()

sfs1 = sfs(clf,
           k_features=30,
           forward=True,
           floating=False,
           verbose=2,
           scoring='f1_macro',
           cv=5, 
           n_jobs=-1)

# Impute missing values with mean and convert back to a pandas dataframe
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

sfs1.fit(X_train_imputed, y_train)

print('Best accuracy score: %.2f' % sfs1.k_score_)   # k_score_ shows the best score
print('Best subset (indices):', sfs1.k_feature_idx_) # k_feature_idx_ shows the index of features
print('Best subset (corresponding names):', sfs1.k_feature_names_) # k_feature_names_ shows the feature names

feature_cols = pd.DataFrame(sfs1.subsets_).transpose()
print(feature_cols)

selected_features_sequential = feature_cols['feature_names'][30]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 114 out of 114 | elapsed:    6.1s finished

[2023-04-08 16:40:54] Features: 1/30 -- score: 0.6400716013151[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 113 out of 113 | elapsed:    3.9s finished

[2023-04-08 16:40:59] Features: 2/30 -- score: 0.6915060266512223[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:    8.7s finished

[2023-04-08 16:41:07] Features: 3/30 -- score: 0.715086389206736[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 111 out o

Best accuracy score: 0.73
Best subset (indices): (1, 12, 14, 15, 17, 19, 20, 21, 23, 24, 37, 38, 46, 53, 54, 61, 67, 74, 77, 86, 89, 92, 94, 95, 99, 101, 102, 103, 105, 107)
Best subset (corresponding names): ('bit_1', 'bit_850', 'bit_962', 'assay_id', 'MolWt', 'ExactMolWt', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_CHGLO', 'BCUT2D_LOGPLOW', 'MolMR', 'fr_nitro_arom', 'fr_COO2', 'fr_aryl_methyl', 'fr_epoxide', 'fr_isothiocyan', 'fr_phos_ester', 'fr_thiophene', 'Col_136', 'Col_517', 'Col_615', 'Col_863', 'Col_909', 'Col_1030', 'Col_1194', 'Col_1322', 'Col_1342', 'Col_1360', 'Col_1470', 'Col_1527')
                                          feature_idx  \
1                                               (15,)   
2                                            (15, 37)   
3                                        (15, 20, 37)   
4                                    (15, 20, 24, 37)   
5                                (15, 19, 20, 24, 37)   
6                            (14, 15, 19, 2


[2023-04-08 16:50:40] Features: 30/30 -- score: 0.7328420667621474

In [20]:
#Analysing the output features from SelectKBest and sequential selector(which were run more then 50 times with different parameters) to select the final 36 features below that gave the best results

selected_features = ['bit_114',
 'bit_717',
 'assay_id',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPLOW',
 'Chi0n',
 'Chi1v',
 'Chi2v',
 'Chi3v',
 'Kappa1',
 'Kappa2',
 'LabuteASA',
 'SMR_VSA1',
 'SlogP_VSA5',
 'VSA_EState5',
 'VSA_EState6',
 'MolLogP',
 'MolMR',
 'fr_nitro_arom',
 'fr_Al_COO',
 'fr_COO2',
 'fr_NH2',
 'Col_516','n_lipinski_hbd', 'tpsa', 'n_aliphatic_carbocycles']

In [21]:
# Impute missing values with mean and convert back to a pandas dataframe
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(train_data[selected_features]), columns=train_data[selected_features].columns)

# Impute missing values with mean and convert back to a pandas dataframe
imputer = SimpleImputer(strategy='mean')
X_test_imputed = pd.DataFrame(imputer.fit_transform(test_data[selected_features]), columns=test_data[selected_features].columns)


In [22]:
#defining base estimators
clf1 = xgb.XGBClassifier(random_state=1,booster="gbtree",learning_rate=0.25,n_estimators=1200,max_depth=5, min_child_weight=5, n_jobs=-1)
clf2 = lgb.LGBMClassifier(boosting_type= 'goss',learning_rate=0.1,n_estimators=1000,max_depth=5,num_leaves=100,max_bin = 5000, n_jobs=-1)
# clf3 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.1, max_depth=5, random_state=1)
# clf4 = MLPClassifier(hidden_layer_sizes=(100,50), activation='relu', solver='adam', alpha=0.0001, learning_rate_init=0.001, max_iter=200, random_state=1)

clf5 = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1, criterion='entropy', max_depth=35,class_weight='balanced')
                            , n_estimators=1500, random_state=1, n_jobs=-1)

# estimators = [('XGB', clf1), ('LGBM', clf2), ('GB', clf3), ('NN', clf4)]

estimators = [('XGB', clf1), ('BGM', clf5)]
#loop iteration to tune weight
# for i in range(1,3):
#      for j in range(1,3):

vc = VotingClassifier(estimators=estimators,voting='hard',weights=[1,1])
f1 = cross_val_score(vc,X_train_imputed[selected_features], train_data['Expected'],cv=5,scoring='f1_macro')
# acc = cross_val_score(vc,X_train_imputed[selected_features], train_data['Expected'],cv=5)
print("F1 Macro score for weights (1,1): ",np.round(np.mean(f1),5))
# print("Accuracy score for weights (1,1,1): ",np.round(np.mean(acc),5))

vc = vc.fit(X_train_imputed[selected_features], train_data['Expected'])
# val_predictions = vc.predict(X_test)
y_pred = vc.predict(X_test_imputed[selected_features])

# 79949 80038 80192 80156 80068 80132 80129 80253 803(2100)

F1 Macro score for weights (1,1):  0.79983


In [23]:
op = test_data[['Id']]
op['Predicted'] = y_pred
op["Predicted"] = op["Predicted"].astype(int)
op

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  op['Predicted'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  op["Predicted"] = op["Predicted"].astype(int)


Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,2
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,2
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,1
...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,2
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,2
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,2
10992,COP(=O)(OC)OC=C(Cl)Cl;28,2


In [24]:
op['Predicted'].value_counts()
# 9645;1349 9567 1427 9673 1321 9602 1392 9606 1388 9422 1572 8421 1573 9005 1989 9433 1561 904701974 9550 1444 9579 9551

2    9424
1    1570
Name: Predicted, dtype: int64

In [26]:
op.to_csv('toxic_ouput_FinalSubmission.csv', index = False)