In [24]:
import pandas as pd
import numpy as np

## Load the dataset

In [25]:
data = pd.read_csv('mutagenicity_kNN.csv')

In [26]:
# Display the first few rows of the dataset
print(data.head())

   Unnamed: 0  Id       CAS                     SMILES    Status  \
0           0   1  100-00-5   O=[N+]([O-])c1ccc(cc1)Cl  Training   
1           1   2  100-01-6    O=[N+]([O-])c1ccc(N)cc1  Training   
2           2   3  100-02-7    O=[N+]([O-])c1ccc(O)cc1  Training   
3           3   4  100-11-8  O=[N+]([O-])c1ccc(cc1)CBr  Training   
4           4   5  100-12-9   O=[N+]([O-])c1ccc(cc1)CC  Training   

   Experimental value Predicted value  NumValenceElectrons       qed   TPSA  \
0                   1               1                   52  0.463602  43.14   
1                   1               1                   52  0.359544  69.16   
2                   0               1                   52  0.470728  63.37   
3                   1               0                   58  0.432586  43.14   
4                   0               0                   58  0.479785  43.14   

     MolMR  BalabanJ     BertzCT    MolWt  MolLogP  
0  38.1064  3.003401  244.429658  157.556   2.2482  
1  37.5088

In [27]:
# Checking for missing values
print(data.isnull().sum())

Unnamed: 0             0
Id                     0
CAS                    0
SMILES                 0
Status                 0
Experimental value     0
Predicted value        0
NumValenceElectrons    0
qed                    0
TPSA                   0
MolMR                  0
BalabanJ               0
BertzCT                0
MolWt                  0
MolLogP                0
dtype: int64


In [28]:
# Separate features and target variable
X = data.drop(['Id', 'CAS', 'SMILES', 'Status', 'Experimental value', 'Predicted value'], axis=1)
y = data['Experimental value']

In [29]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [32]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [33]:
# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy with best k: {accuracy:.4f}")


Test set accuracy with best k: 0.7563


In [34]:
# Best K calculation 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(1, 31))}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameter
best_k = grid_search.best_params_['n_neighbors']
best_f1_score = grid_search.best_score_

In [35]:
# Print classification report
from sklearn.metrics import classification_report

print(f"Best k: {best_k}")
print(classification_report(y_test, y_pred))

Best k: 11
              precision    recall  f1-score   support

           0       0.71      0.75      0.73       503
           1       0.80      0.76      0.78       650

    accuracy                           0.76      1153
   macro avg       0.75      0.76      0.75      1153
weighted avg       0.76      0.76      0.76      1153



In [36]:
# Feature importance
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

               Feature  Importance
6              BertzCT    0.137641
2                  qed    0.136895
0           Unnamed: 0    0.123196
7                MolWt    0.114340
8              MolLogP    0.106800
5             BalabanJ    0.106580
4                MolMR    0.101677
3                 TPSA    0.096383
1  NumValenceElectrons    0.076488
