<a href="https://colab.research.google.com/github/Kushal-55/chemical_toxicity_prediction_challenge/blob/main/toxicity_prediction_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdkit==2022.9.4


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit==2022.9.4
  Downloading rdkit-2022.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.4


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load training and testing data
import pandas as pd
import numpy as np
train_raw = pd.read_csv('/content/drive/MyDrive/Data Mining and Machine Learning Assignment 1/train_II.csv')
test_raw = pd.read_csv('/content/drive/MyDrive/Data Mining and Machine Learning Assignment 1/test_II.csv')



In [5]:
train_raw.head(25)

Unnamed: 0,Id,Expected
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2
1,CCCCCCCCC(=O)C;2451,2
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2
4,[Na+].[I-];1856,2
5,CCCSP(=O)(OCC)SCCC;1646,2
6,C(C(C(C(C(F)(F)S(=O)(=O)O)(F)F)(F)F)(F)F)(C(C(...,1
7,C1=C(C=C(C(=C1Br)O)Br)C#N;1611,2
8,CCCC(=O)OCC(COC(=O)CCC)OC(=O)CCC;1852,2
9,CCOC(=O)CC(C(=O)OCC)SP(=O)(OC)OC;1647,2


In [6]:
# feature extraction
train_raw[['chemical_id', 'assay_id']] = train_raw['Id'].str.split(';', expand=True)
train_raw['label'] =train_raw['Expected']
train_raw = train_raw.drop(['Id', 'Expected'], axis=1)

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
features = []
for chemical_id in train_raw['chemical_id']:
    mol = Chem.MolFromSmiles(chemical_id)
    if mol is not None:
        features.append([Descriptors.MolWt(mol),
                         Descriptors.MolLogP(mol),
                         Descriptors.TPSA(mol),
                         Descriptors.BalabanJ(mol),
                         Descriptors.MolMR(mol),
                         ])
                         
    else:
        features.append([0, 0, 0, 0, 0,])

df_features = pd.DataFrame(features, columns=['MolecularWeight','MolLogP','TPSA','BalabanJ','MolMR'])
train_raw = pd.concat([train_raw, df_features], axis=1)

# imputing null values
train_raw['TPSA'].fillna(train_raw['TPSA'].mean(), inplace=True)
train_raw['BalabanJ'].fillna(train_raw['BalabanJ'].mean(), inplace=True)
train_raw['MolMR'].fillna(train_raw['MolMR'].mean(), inplace=True)


In [7]:
train_raw.isnull().sum()


chemical_id        0
assay_id           0
label              0
MolecularWeight    0
MolLogP            0
TPSA               0
BalabanJ           0
MolMR              0
dtype: int64

In [8]:
correlation = train_raw.corr()
correlation

Unnamed: 0,label,MolecularWeight,MolLogP,TPSA,BalabanJ,MolMR
label,1.0,-0.097222,-0.108257,0.012733,0.06624,-0.110736
MolecularWeight,-0.097222,1.0,0.418359,0.594094,-0.341084,0.925615
MolLogP,-0.108257,0.418359,1.0,-0.239578,0.083304,0.523516
TPSA,0.012733,0.594094,-0.239578,1.0,-0.234105,0.516856
BalabanJ,0.06624,-0.341084,0.083304,-0.234105,1.0,-0.412969
MolMR,-0.110736,0.925615,0.523516,0.516856,-0.412969,1.0


In [9]:
# feature extraction for test data
test_raw
test_raw[['chemical_id', 'assay_id']] = test_raw['x'].str.split(';', expand=True)
test_raw = test_raw.drop(['x'], axis=1)
test_features = []
for chemical_id in test_raw['chemical_id']:
    mol = Chem.MolFromSmiles(chemical_id)
    if mol is not None:
        test_features.append([Descriptors.MolWt(mol),
                         Descriptors.MolLogP(mol),
                         Descriptors.TPSA(mol),
                         Descriptors.BalabanJ(mol),
                         Descriptors.MolMR(mol),
                         ])
                         
    else:
        test_features.append([0, 0, 0, 0,0,])

df_features_test = pd.DataFrame(test_features, columns=['MolecularWeight', 'MolLogP','TPSA','BalabanJ','MolMR'])
test_raw = pd.concat([test_raw, df_features_test], axis=1)

In [10]:
test_raw.isnull().sum()

chemical_id        0
assay_id           0
MolecularWeight    0
MolLogP            0
TPSA               0
BalabanJ           0
MolMR              0
dtype: int64

In [16]:
# Applying models
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

X_train = train_raw.drop(['chemical_id', 'label'], axis=1).values
y_train = train_raw['label'].values


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

xgb = XGBClassifier(n_estimators=300, max_depth=10, random_state = 42)
xgb.fit(X_train, y_train)
y_pred6 = xgb.predict(X_train)
score6 = xgb.score(X_train, y_train)
print('Accuracy for XGBClassifier', score6)


Accuracy for XGBClassifier 0.9889099664380563


In [12]:
X_train[100]

array(['1855', 335.28200000000004, 4.148200000000004, 89.52,
       3.6993779362256016, 77.54780000000001], dtype=object)

In [15]:
# Internal evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
X = X_train
y = y_train
X_test = test_raw.drop(['chemical_id'], axis=1).values

X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.2, 
                                                  random_state=42)

xgb = XGBClassifier(n_estimators=300, max_depth=10 )
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_val)

#f1 score 
f1 = f1_score(y_val, predictions)

# Print F1 score
print("F1 score:", f1)

F1 score: 0.9440158108775796


In [13]:
# grid search for xgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier()

param_grid = {
    'learning_rate': [0.1],
    'max_depth': [10],
    'n_estimators': [300],
    'gamma': [0, 0.1, 0.2,],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

# print the best parameters and the corresponding score
print("Best parameters found:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters found: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.8}
Best score: 0.9073796230205067


In [17]:
# predicting results
X_test = test_raw.drop(['chemical_id'], axis=1).values

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_test = xgb.predict(X_test)
y_pred_test = le.inverse_transform(y_pred_test)

# Creating a submission file
submission = pd.DataFrame({'Id': test_raw['chemical_id'] + ';' + test_raw['assay_id'].astype(str), 'Predicted': y_pred_test})
submission
submission.to_csv('/content/drive/MyDrive/Data Mining and Machine Learning Assignment 1/project_submission_xgb_final.csv', index=False)