In [None]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC

# Metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    log_loss
)

import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Fixed seed for reproducible randomness (passed to the train/validation split).
RANDOM_STATE = 42


In [None]:
# Read the labelled training file and the unlabelled test file, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mle-ese-mock/train (5).csv",   # CHANGE PATH
        "/kaggle/input/mle-ese-mock/test (4).csv",    # CHANGE PATH
    )
)

# (rows, columns) of each frame, followed by a preview of the training data.
print(train.shape, test.shape)
train.head()


In [None]:
# Missing-value count for every column.
missing_per_column = train.isna().sum()
print(missing_per_column)

# Rows that exactly repeat an earlier row (all columns are compared).
duplicate_mask = train.duplicated()
print("Duplicates:", duplicate_mask.sum())

# Keep only the first occurrence of each repeated row.
train = train[~duplicate_mask]


In [None]:
TARGET_COL = "quality_grade"   # CHANGE
ID_COL = "id"

# Rows without a label cannot be used for supervised training, so drop them.
has_label = train[TARGET_COL].notna()
train = train[has_label]

# Class balance and the number of distinct classes.
target_values = train[TARGET_COL]
print(target_values.value_counts())
print("Classes:", target_values.nunique())


In [None]:
# Feature matrix (everything except the identifier and the label) and target vector.
X = train.drop(columns=[ID_COL, TARGET_COL])
y = train[TARGET_COL]

# Select by dtype *family* rather than by exact width: listing only
# "int64"/"float64" would skip int32/float32/uint8 columns, and any column that
# lands in neither list is silently discarded by the ColumnTransformer later on.
# Boolean columns are routed to the numeric branch, where they are cast to 0/1.
numeric_features = X.select_dtypes(include=["number", "bool"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

print("Numerical:", numeric_features)
print("Categorical:", categorical_features)

# Surface anything that still falls through (e.g. datetime columns) instead of
# letting it disappear without notice.
unassigned_features = X.columns.difference(numeric_features.union(categorical_features))
if len(unassigned_features) > 0:
    print("Not assigned to any branch (will be dropped):", list(unassigned_features))


In [None]:
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots()
sns.countplot(x=y, ax=ax)
ax.set_title("Target Distribution")
plt.show()


In [None]:
# One figure per numeric feature: histogram with KDE on the left, box plot on the right.
for col in numeric_features:
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
    column_values = train[col]
    sns.histplot(column_values, kde=True, ax=hist_ax)
    sns.boxplot(x=column_values, ax=box_ax)
    fig.suptitle(col)
    plt.show()


In [None]:
# One horizontal count plot per categorical feature.
for col in categorical_features:
    fig, ax = plt.subplots()
    sns.countplot(y=train[col], ax=ax)
    ax.set_title(col)
    plt.show()


In [None]:
# Pairwise correlation between the numeric features, drawn as a heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
numeric_corr = train[numeric_features].corr()
sns.heatmap(numeric_corr, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Hold out 20% of the labelled rows for evaluation; stratifying on y keeps the
# class proportions the same in both parts. Note that X_test / y_test are this
# held-out slice of `train`, not the competition `test` frame.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros instead of
# raising at prediction time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its branch and concatenate the results.
# sparse_threshold=0 forces a dense output array: the one-hot encoder emits a
# sparse matrix, and with the default threshold the combined output can stay
# sparse, which HistGradientBoostingClassifier (one of the candidate models)
# refuses to fit on.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features)
    ],
    sparse_threshold=0
)


In [None]:
# Learn the class -> integer mapping from the training labels only, then apply
# that same mapping to both label vectors.
le = LabelEncoder().fit(y_train)

y_train_enc, y_test_enc = (le.transform(labels) for labels in (y_train, y_test))


In [None]:
# Candidate classifiers, keyed by the display name used in logs and charts.
# Every estimator with a stochastic component is seeded from RANDOM_STATE so a
# Restart & Run All reproduces the same scores; previously the tree and the SVM
# (whose probability calibration shuffles data internally) were unseeded and
# the remaining seeds were a hard-coded literal.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=6, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, max_depth=10, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=100,
        learning_rate=0.05,
        max_depth=6,
        min_samples_leaf=30,
        l2_regularization=0.1,
        random_state=RANDOM_STATE
    ),
    # probability=True is required for predict_proba, which the evaluation and
    # submission cells rely on.
    "SVM": SVC(probability=True, class_weight="balanced", random_state=RANDOM_STATE)
}


In [None]:
# Held-out accuracy per model name; read by the comparison chart and by the
# best-model selection further down.
results = {}

# Human-readable class names for the per-class report, in encoded-label order.
class_names = [str(cls) for cls in le.classes_]

for name, model in models.items():
    print(f"\nTraining {name}")

    # Preprocessing is part of the pipeline, so imputation/scaling/encoding are
    # fitted on the training rows only.
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train_enc)

    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    acc = accuracy_score(y_test_enc, y_pred)
    # y_proba has one column per entry of pipeline.classes_. Passing that list
    # explicitly stops log_loss from inferring the label set from y_test_enc,
    # which raises whenever a rare class is missing from the held-out rows.
    loss = log_loss(y_test_enc, y_proba, labels=pipeline.classes_)

    results[name] = acc

    print("Accuracy:", acc)
    print("Log Loss:", loss)
    print(classification_report(
        y_test_enc,
        y_pred,
        labels=pipeline.classes_,
        target_names=class_names
    ))


In [None]:
# Bar chart of held-out accuracy, one bar per trained model.
model_names = list(results.keys())
model_scores = list(results.values())

plt.bar(model_names, model_scores)
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Pick the model with the highest held-out accuracy (first one wins on a tie).
best_model_name, _ = max(results.items(), key=lambda name_and_acc: name_and_acc[1])
print("Best Model:", best_model_name)

# This is the same estimator object that was fitted in the comparison loop.
best_model = models[best_model_name]

# Rebuild the full preprocessing + model pipeline around the winner and fit it
# again on the training split.
final_steps = [
    ("preprocessing", preprocessor),
    ("model", best_model),
]
final_pipeline = Pipeline(final_steps)

final_pipeline.fit(X_train, y_train_enc)


In [None]:
# Predictions of the selected pipeline on the held-out split.
y_pred = final_pipeline.predict(X_test)
y_proba = final_pipeline.predict_proba(X_test)

# Fix the label set so the matrix always has one row/column per known class,
# even when a class is absent from both the held-out labels and the predictions.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test_enc, y_pred, labels=class_ids)

# Tick labels show the original class names instead of the encoded integers.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# A single ROC curve is only drawn for binary problems; column 1 of y_proba is
# the probability of the class encoded as 1.
if len(le.classes_) == 2:
    fpr, tpr, _ = roc_curve(y_test_enc, y_proba[:,1])
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr)
    ax.plot([0,1],[0,1],"--")   # chance-level diagonal for reference
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    plt.show()


In [None]:
# Keep the identifiers aside and build the feature frame under a new name.
# `test` itself is left untouched so this cell can be re-run safely;
# overwriting it with the id column removed made a second run fail with a
# KeyError on ID_COL.
test_id = test[ID_COL]
test_features = test.drop(columns=[ID_COL])

# Class probabilities from the selected pipeline, one column per class in the
# order of le.classes_.
test_proba = final_pipeline.predict_proba(test_features)

submission = pd.DataFrame(
    test_proba,
    columns=[f"Status_{cls}" for cls in le.classes_]
)

# Insert the ids positionally (as a plain array) so they cannot be misaligned
# or turned into NaN by index alignment if `test` has a non-default index.
submission.insert(0, "id", test_id.to_numpy())
assert len(submission) == len(test), "submission must have one row per test row"

submission.to_csv("submission.csv", index=False)

submission.head()
