In [64]:
import pandas as pd
import numpy as np 
from rdkit.Chem import AllChem, PandasTools
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split


In [65]:
train_descriptors = pd.read_csv("./train_data_descriptors.csv")
test_descriptors = pd.read_csv("./test_data_descriptors.csv")

In [66]:
le = LabelEncoder()

In [67]:
# Replacing null values with mean based on techniques in titanic dataset 
train_dataset = train_descriptors.fillna(train_descriptors.mean())
test_dataset = test_descriptors.fillna(test_descriptors.mean())

In [68]:
# Drop the "Expected" column from the DataFrame
X = train_dataset.drop("label", axis=1)
y = train_dataset["label"]

In [69]:
y = le.fit_transform(y)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [71]:
# performing preprocessing part
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
 
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [72]:
# Applying PCA function on training
# and testing set of X component
from sklearn.decomposition import PCA
 
pca = PCA(n_components = 50)
 
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
test_dataset = pca.transform(test_dataset)
 
explained_variance = pca.explained_variance_ratio_



In [73]:
# build the XGBoost model
from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=3570,learning_rate = 0.1)
clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=3570, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [74]:
# predict the results
predict=clf.predict(test_dataset)

In [75]:
predict2 = clf.predict(X_test)

In [76]:
from sklearn.metrics import f1_score

f1_score(y_test, predict2, average= None).mean()

0.637776286575869

In [77]:
predict3 = le.inverse_transform(predict)
predict3

array([2, 1, 1, ..., 1, 2, 2])

In [78]:
print(np.count_nonzero(predict3==1))
print(np.count_nonzero(predict3==2))

7090
3904


In [19]:
# Create submission file 
submission_file = pd.read_csv("./sample_submission.csv")

In [20]:
submission_file

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,1
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,1
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,2
...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,1
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,2
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1
10992,COP(=O)(OC)OC=C(Cl)Cl;28,2


In [21]:
submission_file["Predicted"] = predict3
submission_file.to_csv("rdkit_submission_44.csv",index=False)