In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Column names used as model inputs, in the order they are fed to the
# classifier. NOTE(review): these look like molecular-descriptor names and
# the variable name suggests an earlier selection step — confirm the source.
final_features = [
    'SpMax3_Bhv',
    'SpMax3_Bhi',
    'ETA_EtaP_F_L',
    'MATS1v',
    'ATS5e',
    'ETA_Eta_F_L',
    'CIC2',
    'SpDiam_Dt',
    'SpMin3_Bhv',
    'ATSC7p',
    'ATSC5i',
    'piPC2',
]

# Load the dataset and keep only columns holding more than one distinct
# value (DataFrame.nunique ignores NaN by default, so a column with a single
# value plus missing entries is also discarded).
df = pd.read_csv('Molecules_Toxicity_Classification.csv')
df = df.loc[:, df.nunique().gt(1)]

# Remove the 'Unnamed: 0' column when it is present.
# NOTE(review): presumably a row index written out by an earlier to_csv call.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Target: the 'Class' column, encoded as consecutive integer labels.
y = df['Class']
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# Inputs: only the selected feature columns.
X = df[final_features]

# Stratified 5-fold cross-validation with a fixed seed so the fold
# assignment is reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters shared by the classifier trained in every fold.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    eval_metric='logloss',
)

# One held-out accuracy per fold.
acc_scores = []

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), start=1):
    # Features are a DataFrame (positional .iloc); labels are a NumPy array.
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y_encoded[train_idx]
    y_test = y_encoded[test_idx]

    # Fit the scaler on the training rows only, then apply the same
    # transform to both splits so no held-out statistics leak into training.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # A fresh model per fold: nothing learned carries over between folds.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train_scaled, y_train)

    # Score on the held-out fold and report it.
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    acc_scores.append(acc)
    print(f"Fold {fold_idx} Accuracy: {acc:.4f}")

# Summary across folds; np.std uses its default ddof=0 (population std).
print(f"\nMean Accuracy: {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")







Fold 1 Accuracy: 0.6562
Fold 2 Accuracy: 0.7500
Fold 3 Accuracy: 0.8125
Fold 4 Accuracy: 0.7500
Fold 5 Accuracy: 0.7742

Mean Accuracy: 0.7486 ± 0.0515
