In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Descriptor columns used as model inputs, both for training and for later inference.
final_features = [
    'SpMax3_Bhv', 'SpMax3_Bhi', 'ETA_EtaP_F_L', 'MATS1v',
    'ATS5e', 'ETA_Eta_F_L', 'CIC2', 'SpDiam_Dt',
    'SpMin3_Bhv', 'ATSC7p', 'ATSC5i', 'piPC2'
]

# Load the labelled dataset, discard columns holding a single distinct value,
# and remove the stray index column that a previous `to_csv` may have written.
df = pd.read_csv('Molecules_Toxicity_Classification.csv')
df = df.loc[:, df.nunique() > 1]
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

X = df[final_features]
y = df['Class']

# Encode the class labels as integers; `le` is kept so that predictions can be
# mapped back to the original label values.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# SMOTE is a sampler step inside an imblearn Pipeline, so it resamples only the
# data passed to `fit`; the held-out fold is predicted on as-is.
# Without SMOTE accuracy was about 75%, but leaving the classes imbalanced
# lowers recall considerably, which we do not want, so we sacrifice a little
# accuracy to be more confident of catching the toxic molecules.
# NOTE(review): SMOTE runs before the scaler, so its synthetic samples are
# interpolated on unscaled features. Scaling first may be preferable, but it
# would change the scores reported for this cell -- confirm before reordering.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        eval_metric='logloss'
    ))
])

# Stratified 5-fold cross-validation; the pipeline is refitted from scratch on
# each fold's training split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = []

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    acc_scores.append(acc)
    print(f"Fold {fold_idx} Accuracy: {acc:.4f}")

print(f"\nMean Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")

# Refit on the complete dataset. Without this, `pipeline` would still hold the
# model trained on the last fold's training split only, and any later cell that
# predicts with it would be using a model that never saw that fold's test rows.
pipeline.fit(X, y_encoded)







Fold 1 Accuracy: 0.5938
Fold 2 Accuracy: 0.7812
Fold 3 Accuracy: 0.7812
Fold 4 Accuracy: 0.7188
Fold 5 Accuracy: 0.7742

Mean Accuracy: 0.7298 ± 0.0720


In [None]:
def predict_test(pipeline, le, df_new, final_features):
    """Predict labels for the rows of ``df_new`` and return them decoded.

    Parameters
    ----------
    pipeline : object with a ``predict`` method
        Called with the ``final_features`` columns of ``df_new``.
    le : object with an ``inverse_transform`` method
        Converts the encoded predictions back to the original labels.
    df_new : pandas.DataFrame
        Data to predict on; must contain every column in ``final_features``.
    final_features : list
        Column names selected from ``df_new`` and passed to the pipeline.

    Returns
    -------
    Whatever ``le.inverse_transform`` returns for the pipeline's predictions.
    """
    encoded_preds = pipeline.predict(df_new[final_features])
    return le.inverse_transform(encoded_preds)


# Read the samples to score and discard the leftover index column when the
# CSV has one (errors='ignore' makes the drop a no-op otherwise).
df_new = pd.read_csv('data.csv')
df_new = df_new.drop(columns=['Unnamed: 0'], errors='ignore')

# Run the fitted pipeline on the new samples and show the decoded labels.
preds = predict_test(pipeline, le, df_new, final_features)
print("Predictions on  rows:", preds)