In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    log_loss,
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Suppress every warning for the rest of the session (blanket "ignore" filter).
warnings.simplefilter("ignore")

# Seed passed to the train/validation split and to the seeded estimators below.
RANDOM_STATE = 42


In [None]:
# Read the competition CSVs from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
test = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")

# Shapes as loaded, i.e. before any de-duplication.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Keep the first copy of any fully identical training row; the test frame is
# left untouched.
# NOTE(review): the id column is still present at this point, so only rows that
# match on every column *including* id are removed - confirm that is intended.
train = train.drop_duplicates(keep="first")


In [None]:
# Name of the label column to predict and of the row-identifier column.
TARGET_COL = "Status"
ID_COL = "id"

# Quick look at class balance: per-class row counts, then the class count.
target_values = train[TARGET_COL]
print(target_values.value_counts())
print("Number of classes:", target_values.nunique())


In [None]:
# Features excluded from modelling. errors="ignore" below means a name that is
# absent from a frame is skipped instead of raising.
cols_to_drop = [
    "Drug",
    "Ascites",
    "Hepatomegaly",
    "Spiders",
    "Cholesterol",
    "Copper",
    "Alk_Phos",
    "SGOT",
    "Tryglicerides",
]

# Rebind rather than mutate in place; both frames lose the same columns.
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")


In [None]:
# Set the identifiers aside before removing them from the feature frames;
# test_ids is needed again when the submission file is assembled.
train_ids = train[ID_COL]
test_ids = test[ID_COL]

train = train.drop(columns=[ID_COL])
test = test.drop(columns=[ID_COL])

# Label vector and feature matrix for supervised training.
y = train[TARGET_COL]
X = train.drop(columns=[TARGET_COL])


In [None]:
# Column groups that are routed to the numeric / categorical preprocessing
# branches. "number" matches every numeric width (int32, float32, uint8, ...),
# not only int64/float64; the ColumnTransformer drops any column that appears
# in neither list, so a narrower filter would silently discard such features.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns


In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)


In [None]:
# Numeric branch: fill gaps with the column mean, then standardise.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])


In [None]:
# Map the string class labels to integer codes. The encoder learns its
# vocabulary from the training labels only and is then reused for the hold-out.
le = LabelEncoder()

y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


In [None]:
# Candidate classifiers, keyed by the name used in the results table.
# Insertion order is the order in which they are trained.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=6,
        min_samples_leaf=20,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        min_samples_leaf=20,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    # No class_weight here: this estimator is configured without one.
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=600,
        learning_rate=0.02,
        max_depth=4,
        min_samples_leaf=18,
        subsample=0.7,
        random_state=RANDOM_STATE,
    ),
    # probability=True is required so that predict_proba is available later.
    "SVM": SVC(probability=True, class_weight="balanced", random_state=RANDOM_STATE),
}


In [None]:
# Fit every candidate on the training split and score it on the hold-out split.
# results maps model name -> {"pipeline", "log_loss", "accuracy", "roc_auc"}.
results = {}

# Encoded class ids in probability-column order; passed to log_loss so the
# metric stays defined even if the hold-out happens to lack one of the classes.
class_ids = np.arange(len(le.classes_))

for name, model in models.items():
    print(f"\n🔹 Training {name}")

    # Clone both steps so each stored pipeline owns its fitted state. Without
    # this, all pipelines would share one preprocessor instance (and the
    # estimator objects held in `models`), and refitting any of them later
    # would silently change the others.
    pipeline = Pipeline([
        ("preprocessing", clone(preprocessor)),
        ("model", clone(model))
    ])

    pipeline.fit(X_train, y_train_enc)

    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    loss = log_loss(y_test_enc, y_proba, labels=class_ids)
    acc = accuracy_score(y_test_enc, y_pred)

    # For a two-class target roc_auc_score expects the 1-D positive-class
    # scores; the full probability matrix is only valid for 3+ classes.
    roc_scores = y_proba[:, 1] if y_proba.shape[1] == 2 else y_proba
    roc = roc_auc_score(y_test_enc, roc_scores, multi_class="ovr")

    results[name] = {
        "pipeline": pipeline,
        "log_loss": loss,
        "accuracy": acc,
        "roc_auc": roc
    }

    print(f"Log Loss: {loss:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC-AUC: {roc:.4f}")


In [None]:
# Pick the candidate with the lowest hold-out log loss (lower is better);
# on a tie, min() keeps the one that was trained first.
best_model_name = min(results, key=lambda x: results[x]["log_loss"])
best_pipeline = results[best_model_name]["pipeline"]

# The banner literal was mis-encoded (UTF-8 bytes read as cp1252); restored.
print("\n✅ BEST MODEL SELECTED")
print("Model:", best_model_name)
print("Log Loss:", results[best_model_name]["log_loss"])


In [None]:
# Detailed hold-out evaluation of the selected pipeline.
y_pred = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)

# Fix the label order explicitly so the report rows and matrix cells always
# line up with le.classes_, even if a class is absent from the hold-out.
class_ids = np.arange(len(le.classes_))

print("\nClassification Report:\n")
print(classification_report(
    y_test_enc,
    y_pred,
    labels=class_ids,
    target_names=le.classes_
))

cm = confusion_matrix(y_test_enc, y_pred, labels=class_ids)

# Rows are true classes, columns are predicted classes; tick labels show the
# original class names instead of bare integer codes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Class probabilities for the competition test rows, one column per class in
# le.classes_ order (the order of the encoded labels the model was fit on).
# NOTE(review): best_pipeline was fit on the 80% training split only; consider
# refitting on all labelled rows before the final submission.
test_proba = best_pipeline.predict_proba(test)

submission = pd.DataFrame(
    test_proba,
    columns=[f"{TARGET_COL}_{cls}" for cls in le.classes_]
)

# Insert ids positionally. Passing the Series itself would align on index and
# yield NaN ids whenever `test` does not carry the default 0..n-1 index.
submission.insert(0, ID_COL, test_ids.to_numpy())
submission.to_csv("submission_best_model.csv", index=False)

# The banner literal was mis-encoded (UTF-8 bytes read as cp1252); restored.
print("✅ Submission file created successfully")
submission.head()
