In [1]:
import pandas as pd
import numpy as np 
from rdkit.Chem import AllChem, PandasTools
import matplotlib.pyplot as plt
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score



In [2]:
train = pd.read_csv("./train_with_descriptors.csv")
test = pd.read_csv("./test_with_descriptors.csv")

In [3]:
# Dropping all null values
train = train.fillna(train.mean())
# Replacing null values with mean based on techniques in titanic dataset 
test = test.fillna(test.mean())

In [4]:
train

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Assay_ID,Expected
0,9.316200,-1.533785,9.316200,0.150485,0.794714,317.599,306.511,315.982463,100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1644,2
1,10.532611,0.333788,10.532611,0.333788,0.516641,156.269,136.109,156.151415,66.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2451,2
2,2.433032,0.000000,2.433032,0.000000,0.251327,362.086,313.702,361.347528,148.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,1384,2
3,10.355080,-0.613825,10.355080,0.282361,0.487998,255.665,245.585,255.052302,90.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16,2
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1856,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,11.460021,-3.868472,11.460021,0.053611,0.712426,230.245,220.165,230.036128,82.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33,2
75379,5.928972,-2.841623,5.928972,0.082346,0.720533,313.747,296.611,313.041677,104.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1632,1
75380,4.975926,0.848333,4.975926,0.848333,0.596343,167.258,162.218,166.986341,50.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1373,1
75381,10.241948,0.324028,10.241948,0.324028,0.519485,128.215,112.087,128.120115,54.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2


In [6]:
X = train.drop("Expected",axis=1)
y = train["Expected"]

In [7]:
#splitting the datasets for training and testing process
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.9, random_state=5)

In [8]:
#size for the sets
print('size of X_train:', X_train.shape)
print('size of X_test:', X_test.shape)
print('size of y_train:', y_train.shape)
print('size of y_test:', y_test.shape)

size of X_train: (7538, 209)
size of X_test: (67845, 209)
size of y_train: (7538,)
size of y_test: (67845,)


In [9]:
#histogram boosting gradient classifer
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_classifier = HistGradientBoostingClassifier()
hgb_classifier.fit(X_train,y_train)
y_pred_hgb = hgb_classifier.predict(X_test)

In [10]:

cm_hgb = confusion_matrix(y_test, y_pred_hgb)
print(cm_hgb)


[[ 2965  6624]
 [ 1606 56650]]


In [11]:
accuracy_score(y_test, y_pred_hgb)
roc_auc_score(y_test, y_pred_hgb)

0.6408202461027379

In [12]:
acc_hgb = cross_val_score(estimator = hgb_classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy of hgb: {:.2f} %".format(acc_hgb.mean()*100))
print("SD of hgb: {:.2f} %".format(acc_hgb.std()*100))


Accuracy of hgb: 87.12 %
SD of hgb: 0.92 %


In [13]:
from sklearn.model_selection import GridSearchCV
parameters_hgb = [{'max_iter': [1000,1200,1500],
                'learning_rate': [0.1],
                'max_depth' : [25, 50, 75],
                'l2_regularization': [1.5],
                'scoring': ['f1_micro']}]

In [16]:
grid_search_hgb = GridSearchCV(estimator = hgb_classifier,
                           param_grid = parameters_hgb,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search_hgb.fit(X_train, y_train)
best_accuracy_hgb = grid_search_hgb.best_score_
best_parameter_hgb = grid_search_hgb.best_params_  
print("Best Accuracy of HGB: {:.2f} %".format(best_accuracy_hgb.mean()*100))
print("Best Parameter of HGB:", best_parameter_hgb)

Best Accuracy of HGB: 87.24 %
Best Parameter of HGB: {'l2_regularization': 1.5, 'learning_rate': 0.1, 'max_depth': 25, 'max_iter': 1000, 'scoring': 'f1_micro'}


In [17]:
predict = hgb_classifier.predict(test)

In [18]:
print(np.count_nonzero(predict==1))
print(np.count_nonzero(predict==2))

748
10246


In [19]:
# Create submission file 
submission_file = pd.read_csv("./sample_submission.csv")

In [20]:
submission_file

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,1
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,1
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,2
...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,1
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,2
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,2


In [21]:
submission_file["Predicted"] = predict
submission_file.to_csv("rdkit_submission_13.csv",index=False)