In [63]:
import pandas as pd
import numpy as np 
from rdkit.Chem import AllChem, PandasTools
import matplotlib.pyplot as plt
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [64]:
# Load the descriptor tables for the training and test molecules from CSV files
# that sit next to the notebook.
train_csv_path = "./train_data_descriptors.csv"
test_csv_path = "./test_data_descriptors.csv"

train_descriptors = pd.read_csv(train_csv_path)
test_descriptors = pd.read_csv(test_csv_path)

In [65]:
# Encoder for the target column: fitted on the training labels further down and
# reused later to map predicted class indices back to the original label values.
le = LabelEncoder()

In [66]:
# Replace missing values with the per-column mean of each frame.
#
# The frames are derived from the CSVs loaded above (`train_descriptors`,
# `test_descriptors`) so this cell works on a fresh kernel; previously it read
# `train_dataset` / `test_dataset` before anything had defined them.
# NOTE(review): assumes train_data_descriptors.csv already carries the "label"
# column that the following cells read — confirm.
#
# `numeric_only=True` keeps the mean computation from raising if a non-numeric
# column is present; such columns are simply left un-imputed.
train_column_means = train_descriptors.mean(numeric_only=True)
test_column_means = test_descriptors.mean(numeric_only=True)

train_dataset = train_descriptors.fillna(train_column_means)
test_dataset = test_descriptors.fillna(test_column_means)

In [67]:
# Split the training frame into the feature matrix (every column except "label")
# and the target vector (the "label" column).
TARGET_COLUMN = "label"
y = train_dataset[TARGET_COLUMN]
X = train_dataset.drop(columns=[TARGET_COLUMN])

In [68]:
# Learn the set of label values, then replace each label by its integer class index.
y = le.fit(y).transform(y)

In [69]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [44]:
from sklearn.feature_selection import VarianceThreshold

# Variance cut-off for feature selection. The selector is fitted on the training
# split only, so the held-out rows do not influence which columns are kept.
# NOTE(review): descriptors are unscaled, so this absolute cut-off depends on each
# column's units — confirm 62 is intentional.
MIN_FEATURE_VARIANCE = 62

var_thres = VarianceThreshold(threshold=MIN_FEATURE_VARIANCE)
var_thres.fit(X_train)

VarianceThreshold(threshold=62)

In [45]:
# Boolean mask aligned with X_train's columns: True marks a column that the
# fitted variance filter keeps, False one that it rejects.
var_thres.get_support()

array([False, False, False, False, False,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,

In [46]:
# Number of columns that pass the variance filter (True entries in the support mask).
int(var_thres.get_support().sum())

63

In [47]:
# Names of the columns rejected by the variance filter, i.e. those whose
# training-set variance falls below the configured threshold (not necessarily
# strictly constant, despite the variable name, which later cells rely on).
#
# The support mask is computed once and inverted, instead of rebuilding the
# kept-column index for every column inside a list comprehension.
support_mask = var_thres.get_support()
constant_columns = X_train.columns[~support_mask].tolist()

print(len(constant_columns))

146


In [48]:
# `DataFrame.drop` returns a new frame and leaves X_train untouched, so the
# filtered result is bound to a name instead of being displayed and thrown away.
# NOTE: the model cell further down still fits on the unfiltered X_train; it has
# to be pointed at this frame for the variance filter to take effect.
X_train_selected = X_train.drop(columns=constant_columns)
X_train_selected

Unnamed: 0,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,BCUT2D_MWHI,BertzCT,Chi0,Chi0n,Chi0v,Ipc,...,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState5,VSA_EState6,HeavyAtomCount,MolMR,Assay_id
43222,257.721,241.593,257.093104,94.0,35.453001,394.057863,11.966255,10.005923,10.822420,3.496995e+03,...,4.736863,4.973459,0.000000,16.993029,2.375328,0.465515,6.996505,17.0,70.6127,1376
29120,413.543,386.327,413.177313,154.0,32.133501,1060.521458,20.802754,17.079842,17.896339,3.730670e+06,...,4.736863,7.105160,1.562706,26.547813,7.474727,0.662934,9.482864,29.0,121.3344,1686
73865,555.571,538.435,555.036575,186.0,26.919799,1552.212078,25.827698,19.626561,23.566164,1.249487e+08,...,9.473726,80.276775,0.000000,12.206956,12.556315,-1.188600,12.490147,37.0,120.5228,1376
6071,293.407,266.191,293.199094,118.0,16.474029,432.043439,15.501789,13.152615,13.152615,4.540769e+04,...,4.736863,5.041357,0.000000,11.734896,12.400514,0.620243,5.094540,21.0,84.4985,2452
44430,329.831,309.671,329.129490,120.0,35.453001,877.654397,15.267220,13.296346,14.112842,2.098514e+05,...,0.000000,1.945259,0.000000,20.454075,1.196713,0.233941,8.196660,23.0,93.3892,2444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,197.448,194.424,195.924948,54.0,35.499015,209.164996,7.723615,4.735808,7.003594,1.564958e+02,...,39.909347,0.000000,16.586605,0.000000,9.785093,-0.059198,2.687500,10.0,43.1368,28
45891,428.576,400.352,428.143964,156.0,32.239638,891.046276,22.076986,17.192283,18.825276,3.551712e+05,...,10.286417,25.868056,1.176204,16.048235,10.974408,0.017137,3.860960,28.0,115.1768,1385
42613,102.137,92.057,102.079313,42.0,16.289936,51.651484,5.698671,4.716889,4.716889,3.439946e+01,...,0.000000,0.000000,0.000000,9.643056,4.152778,0.000000,0.000000,7.0,28.7160,2453
43567,225.295,210.175,225.126597,86.0,15.101427,559.143822,12.250712,10.013229,10.013229,8.193699e+03,...,5.733667,0.000000,0.000000,0.000000,8.439684,0.000000,13.547251,17.0,70.9464,2443


In [49]:
# `DataFrame.drop` returns a new frame and leaves test_dataset untouched, so the
# filtered result is bound to a name instead of being displayed and thrown away.
# NOTE: the prediction cell further down still uses the unfiltered test_dataset.
test_dataset_selected = test_dataset.drop(columns=constant_columns)
test_dataset_selected

Unnamed: 0,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,BCUT2D_MWHI,BertzCT,Chi0,Chi0n,Chi0v,Ipc,...,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState5,VSA_EState6,HeavyAtomCount,MolMR,Assay_id
0,164.248,148.120,164.120115,66.0,16.255483,281.545854,9.353371,8.179264,8.179264,3.195721e+02,...,0.000000,0.000000,0.000000,0.000000,9.626968,0.405093,5.824954,12.0,51.5438,1682
1,431.452,414.316,431.056940,152.0,32.234922,1064.759045,20.949383,15.032383,16.665376,1.027850e+06,...,9.473726,60.658255,0.000000,22.815037,1.305518,-0.519776,2.474888,28.0,96.4400,1656
2,696.264,655.944,695.250845,254.0,35.495694,1963.361333,33.793953,27.132362,28.704788,6.880444e+10,...,16.584918,58.485835,6.272360,22.175262,-0.146035,-0.711310,18.366259,48.0,181.7450,36
3,201.244,197.212,200.949810,56.0,27.065780,293.213919,7.776021,6.065805,10.005408,1.064563e+02,...,0.000000,27.731250,0.000000,10.386875,0.000000,-0.760417,0.000000,11.0,32.1278,1850
4,418.574,380.270,418.271924,168.0,16.550330,706.327544,22.051677,19.096350,19.096350,3.230270e+06,...,9.473726,11.622232,0.000000,24.565087,9.917687,0.775870,0.000000,30.0,115.4548,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,408.376,389.224,408.129692,154.0,19.413185,923.937484,21.440947,15.987223,15.987223,2.069503e+06,...,14.412040,43.258535,0.000000,21.946205,7.579780,-0.691715,11.544856,29.0,100.4480,38
10990,320.344,304.216,320.104859,120.0,16.365723,806.067143,17.104084,12.755443,12.755443,3.149126e+05,...,0.000000,0.000000,0.000000,11.617073,28.579835,-1.017492,20.233169,24.0,90.5599,34
10991,381.370,366.250,381.074304,138.0,32.233272,946.593587,19.319626,14.008780,14.825276,4.333140e+05,...,4.736863,35.960712,0.000000,34.748815,2.158403,-0.806890,4.115621,26.0,88.2377,1640
10992,220.976,213.920,219.945901,66.0,35.539089,180.531080,8.905777,5.913486,8.319771,2.002582e+02,...,23.201880,24.107060,10.334157,0.000000,0.000000,0.000000,0.000000,11.0,42.4985,28


In [50]:
# `DataFrame.drop` returns a new frame and leaves X_test untouched, so the
# filtered result is bound to a name instead of being displayed and thrown away.
# NOTE: the validation-prediction cell further down still uses the unfiltered X_test.
X_test_selected = X_test.drop(columns=constant_columns)
X_test_selected

Unnamed: 0,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,BCUT2D_MWHI,BertzCT,Chi0,Chi0n,Chi0v,Ipc,...,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState5,VSA_EState6,HeavyAtomCount,MolMR,Assay_id
44598,401.887,373.663,401.160515,152.0,35.495664,629.689415,19.819626,16.249739,17.005668,1.513884e+06,...,30.548392,23.230112,5.417383,23.305104,2.110781,-0.948643,0.000000,27.0,98.6617,32
20228,148.205,136.109,148.088815,58.0,16.465249,233.554619,8.104448,6.872350,6.872350,3.801758e+02,...,4.736863,5.023216,0.000000,0.000000,0.000000,0.901160,7.960648,11.0,47.7020,1852
21828,323.310,309.198,323.038116,110.0,32.450277,660.434042,15.294682,11.338928,13.049852,5.408086e+04,...,20.854350,11.511251,5.566729,10.189131,11.453596,0.459478,15.174631,21.0,85.7114,1383
65579,216.668,203.564,216.066555,78.0,35.495691,465.171015,11.093858,8.641675,9.397604,7.529673e+02,...,11.600940,1.113426,5.757361,25.706474,0.069005,0.000000,0.000000,14.0,56.0657,1691
6196,128.986,122.938,127.979570,38.0,35.496829,27.974168,4.991564,3.194706,4.706564,1.980027e+01,...,28.308407,0.000000,10.235340,0.000000,8.402778,0.451389,0.000000,6.0,27.4468,2452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47417,230.909,227.885,228.901960,60.0,35.582556,255.004350,8.483128,5.191122,8.214838,2.282125e+02,...,46.403760,-1.495139,22.184599,3.814043,0.313727,0.000000,4.881844,11.0,48.4840,21
10711,269.772,249.612,269.118257,100.0,35.495663,384.530507,13.543606,11.202153,11.958082,1.065122e+04,...,16.337803,5.126806,5.676669,13.582280,0.000000,-0.167588,6.105805,18.0,75.2800,33
65567,149.190,134.070,149.105193,62.0,16.259165,55.748606,7.819991,6.031495,6.031495,1.535970e+02,...,15.319582,0.000000,0.000000,1.791667,25.455000,0.000000,0.000000,10.0,37.6974,39
39654,174.159,168.111,174.042927,64.0,16.130775,413.622920,9.681798,6.942975,6.942975,8.933614e+02,...,0.000000,0.000000,0.000000,26.773801,0.000000,0.000000,4.876524,13.0,46.7420,37


In [116]:
# Build and train the XGBoost classifier on the training split.
from xgboost import XGBClassifier

# 3570 boosting rounds at a learning rate of 0.1; all other settings are left at
# the library defaults.
xgb_params = {"n_estimators": 3570, "learning_rate": 0.1}

clf = XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=3570, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [117]:
# Predict encoded class indices for the rows of test_dataset; these are mapped
# back to the original label values in a later cell.
predict=clf.predict(test_dataset)

In [118]:
# Predict on the held-out validation split so the model can be scored against y_test.
predict2 = clf.predict(X_test)

In [113]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the validation split: the unweighted mean of the
# per-class F1 scores.
f1_score(y_test, predict2, average="macro")

0.7958569992010811

In [114]:
# Map the predicted class indices back to the original label values using the
# encoder that was fitted on the training labels, then display the result.
predict3 = le.inverse_transform(predict)
predict3

array([2, 2, 2, ..., 2, 2, 2])

In [115]:
# Print how many test rows were assigned to label 1 and to label 2, in that order.
for label_value in (1, 2):
    print(np.count_nonzero(predict3 == label_value))

1368
9626


In [19]:
# Load the sample submission as a template; its "Predicted" column is replaced
# with the model's predictions before the file is written out.
submission_file = pd.read_csv("./sample_submission.csv")

In [20]:
# Display the template to check its layout before overwriting the predictions.
submission_file

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,1
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,1
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,2
...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,1
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,2
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,2


In [21]:
# Put the decoded predictions into the template's "Predicted" column and write
# the frame to disk without the pandas row index.
output_path = "rdkit_submission_44.csv"
submission_file = submission_file.assign(Predicted=predict3)
submission_file.to_csv(output_path, index=False)