In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    log_loss,
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Suppress every warning for the rest of the session so cell outputs stay
# uncluttered.
warnings.filterwarnings(action="ignore")

# Seed reused by the hold-out split, the CV splitter and several estimators.
RANDOM_STATE: int = 42


In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/mock-test-2-mse-2/train.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/mock-test-2-mse-2/test.csv")

# Shapes are (rows, columns).
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Kept as the last expression so the notebook renders the preview.
train.head(n=5)


In [None]:
# Per-column count of missing entries, then the number of fully repeated rows.
print("\nMissing Values:\n", train.isna().sum())
print("\nDuplicate rows:", train.duplicated(keep="first").sum())

# Keep only the first occurrence of each repeated row.
# NOTE(review): the id column is still part of the frame at this point, so
# rows that differ only by id do not count as duplicates - confirm intended.
train = train.drop_duplicates(keep="first")


In [None]:
# Column names used by the cells below; change these two to reuse the notebook
# on a different dataset.
ID_COL = "id"
TARGET_COL = "Status"

# Class frequencies first, then the number of distinct labels.
print(train.loc[:, TARGET_COL].value_counts())
print("Number of classes:", train.loc[:, TARGET_COL].nunique())


In [None]:
# Predictors excluded from modelling.
cols_to_drop = [
    'Drug',
    'Ascites',
    'Hepatomegaly',
    'Spiders',
    'Cholesterol',
    'Copper',
    'Alk_Phos',
    'SGOT',
    'Tryglicerides',
]

# errors="ignore" skips names that are absent, so the cell can be re-run safely.
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")


In [None]:
# pop() returns the id column and removes it from the frame in one step.
train_ids = train.pop(ID_COL)
test_ids = test.pop(ID_COL)

# Target vector and predictor matrix.
y = train[TARGET_COL]
X = train.drop(columns=TARGET_COL)


In [None]:
# Partition the predictor columns by dtype.
numeric_features = X.select_dtypes(include=["float64", "int64"]).columns
categorical_features = X.select_dtypes(include=["category", "object"]).columns

print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)


In [None]:
# Bar chart of how many rows fall in each target class.
fig, ax = plt.subplots()
sns.countplot(x=y, ax=ax)
ax.set_title("Target Class Distribution")
plt.show()


In [None]:
# One figure per numeric column: histogram with KDE on the left, box plot on
# the right.
for col in numeric_features:
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
    sns.histplot(train[col], kde=True, ax=hist_ax)
    sns.boxplot(x=train[col], ax=box_ax)
    fig.suptitle(col)
    plt.show()


In [None]:
# Horizontal count plot for every categorical column.
for col in categorical_features:
    fig, ax = plt.subplots()
    sns.countplot(y=train[col], ax=ax)
    ax.set_title(col)
    plt.show()


In [None]:
# Pairwise correlation of the numeric predictors.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(train[numeric_features].corr(), cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# 80/20 hold-out split; stratifying on y keeps class proportions equal in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)


In [None]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [None]:
# Learn the label -> integer mapping from the training targets only, then
# apply that same mapping to both parts of the split.
le = LabelEncoder().fit(y_train)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)


In [None]:
# Candidate 1: scikit-learn gradient boosting.
# NOTE: the following model cells each rebind `model`; the pipeline cell uses
# whichever assignment ran last.
gb_params = {
    "n_estimators": 600,
    "learning_rate": 0.02,
    "max_depth": 4,
    "min_samples_split": 42,
    "min_samples_leaf": 18,
    "max_features": 0.7,
    "subsample": 0.7,
    "random_state": RANDOM_STATE,
}
model = GradientBoostingClassifier(**gb_params)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate 2: random forest with balanced class weights, trained on all cores.
rf_params = {
    "n_estimators": 500,
    "max_depth": 14,
    "min_samples_split": 30,
    "min_samples_leaf": 15,
    "max_features": "sqrt",
    "class_weight": "balanced",
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}
model = RandomForestClassifier(**rf_params)


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Candidate 3: histogram-based gradient boosting.
hgb_params = {
    "learning_rate": 0.05,
    "max_iter": 500,
    "max_depth": 8,
    "min_samples_leaf": 25,
    "l2_regularization": 0.2,
    "max_bins": 255,
    "random_state": RANDOM_STATE,
}
model = HistGradientBoostingClassifier(**hgb_params)


In [None]:

# ---------------------------------------------#
#              XGBOOST MODEL
# ---------------------------------------------#
# Import in the cell, like the other optional boosters in this notebook, so
# the cell no longer raises NameError on a fresh kernel.
from xgboost import XGBClassifier

# Candidate 4: XGBoost with a multi-class soft-probability objective, so
# predict_proba yields one probability per class.
model = XGBClassifier(
    n_estimators=700,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    gamma=1,
    reg_alpha=0.2,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,  # same seed (42) as the rest of the notebook
    objective="multi:softprob",
    eval_metric="mlogloss"
)


In [None]:

from lightgbm import LGBMClassifier

# Candidate 5: LightGBM. max_depth=-1 leaves depth unbounded, so tree size is
# governed by num_leaves.
# NOTE(review): LightGBM documents row subsampling as active only when
# subsample_freq > 0 - confirm whether subsample=0.9 has any effect here.
model = LGBMClassifier(
    # boosting schedule
    n_estimators=1200,
    learning_rate=0.03,
    # tree shape
    num_leaves=50,
    max_depth=-1,
    # row / column sampling
    subsample=0.9,
    colsample_bytree=0.9,
    # regularisation
    reg_alpha=0.2,
    reg_lambda=1.0,
    random_state=42,
)


In [None]:
from catboost import CatBoostClassifier

# Candidate 6: CatBoost. As the last model cell before the pipeline is built,
# this is the estimator used on a top-to-bottom run.
# NOTE(review): eval_metric is "Accuracy" while the notebook scores with log
# loss - confirm this is intended.
model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="Accuracy",
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    random_seed=42,
    verbose=False,
)

In [None]:
# Chain preprocessing and the currently bound `model` into one estimator, so
# the transformers are re-fitted inside every CV fold.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", model),
    ]
)


In [None]:
# 5-fold stratified CV on the training part of the hold-out split.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train_enc,
    cv=cv,
    scoring="neg_log_loss",
    n_jobs=-1,
)

# The scorer returns negated log loss, so flip the sign for reporting.
print("CV Log Loss:", -cv_scores.mean())


In [None]:
# Fit the preprocessing steps and the model on the training part of the split.
pipeline.fit(X_train, y=y_train_enc)


In [None]:
# Class probabilities feed log loss and ROC-AUC; hard labels feed accuracy
# and the per-class report.
y_proba = pipeline.predict_proba(X_test)
y_pred = pipeline.predict(X_test)

print("Log Loss:", log_loss(y_test_enc, y_proba))
print("Accuracy:", accuracy_score(y_test_enc, y_pred))
print("ROC-AUC (OVR):", roc_auc_score(y_test_enc, y_proba, multi_class="ovr"))

# target_names maps the integer codes back to the original label strings.
print("\nClassification Report:\n")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))


In [None]:
# Rows are true classes, columns are predicted classes (integer-encoded).
cm = confusion_matrix(y_test_enc, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Class probabilities for the competition test set, one column per class in
# the order of le.classes_.
test_proba = pipeline.predict_proba(test)

# Build the column names from TARGET_COL so they follow the configured target
# (currently "Status" -> Status_<class>).
submission = pd.DataFrame(
    test_proba,
    columns=[f"{TARGET_COL}_{cls}" for cls in le.classes_]
)

# Insert ids by position: passing the Series itself would align on index
# labels and produce NaN ids if `test` ever had a non-default index.
submission.insert(0, ID_COL, test_ids.to_numpy())
submission.to_csv("submission_pipeline_final.csv", index=False)

# The status string had a mis-decoded check mark; restored here.
print("✅ Submission file created successfully")
submission.head()


In [None]:
# Same 5-fold stratified log-loss CV as the earlier CV cell, repeated here
# after the submission step.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train_enc,
    scoring="neg_log_loss",
    cv=cv,
    n_jobs=-1,
)

# Scores are negated log loss; flip the sign for reporting.
print("CV Log Loss:", -cv_scores.mean())


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    log_loss,
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
from sklearn.ensemble import GradientBoostingClassifier
from scipy.sparse import hstack

# ============================
# 1. Load Data
# ============================
# Re-read both CSVs so this cell starts from the raw files.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/mock-test-2-mse-2/train.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/mock-test-2-mse-2/test.csv")

# ============================
# 2. Drop unwanted columns
# ============================
cols_to_drop = [
    'Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Cholesterol',
    'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides',
]

# errors='ignore' skips any listed column that is not present.
train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')

# ============================
# 3. Fill specific numeric columns
# ============================
# Fill both frames with the TRAIN mean. Filling the test set with its own
# mean would apply a different statistic at prediction time than the one the
# model was trained with.
# NOTE(review): this runs before the train/validation split in section 5, so
# validation rows still contribute to the mean - consider relying only on the
# SimpleImputer in section 7, which is fitted on X_train.
for col in ['Platelets', 'Prothrombin']:
    train_mean = train[col].mean()
    train[col] = train[col].fillna(train_mean)
    test[col] = test[col].fillna(train_mean)

# ============================
# 4. Separate ID + Target
# ============================
# pop() returns the id column and removes it from the frame in one step.
train_ids = train.pop('id')
test_ids = test.pop('id')

y = train['Status']
X = train.drop(columns='Status')

# ============================
# 5. Train-Test split
# ============================
# Stratify on the target, as the pipeline version of this split already does,
# so a rare class keeps the same share in both parts and the hold-out metrics
# for it are not left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ============================
# 6. Identify numeric + categorical features
# ============================
# Column groups are derived from the dtypes of the full predictor frame X.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns

# ============================
# 7. Impute + Scale numeric features
# ============================
# Both transformers are fitted on X_train only, then applied unchanged to the
# validation part and the competition test set.
num_imputer = SimpleImputer(strategy='mean').fit(X_train[numeric_features])
scaler = StandardScaler().fit(num_imputer.transform(X_train[numeric_features]))

X_train_num, X_test_num, test_num = (
    scaler.transform(num_imputer.transform(frame[numeric_features]))
    for frame in (X_train, X_test, test)
)

# ============================
# 8. Impute + Encode categorical features
# ============================
# Same fit-on-train discipline; categories unseen during fit are ignored.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[categorical_features])
ohe = OneHotEncoder(handle_unknown='ignore').fit(
    cat_imputer.transform(X_train[categorical_features])
)

X_train_cat, X_test_cat, test_cat = (
    ohe.transform(cat_imputer.transform(frame[categorical_features]))
    for frame in (X_train, X_test, test)
)

# ============================
# 9. Combine numeric + categorical
# ============================
# Place the numeric block and the one-hot block side by side.
X_train_final, X_test_final, test_final = (
    hstack([num_part, cat_part])
    for num_part, cat_part in (
        (X_train_num, X_train_cat),
        (X_test_num, X_test_cat),
        (test_num, test_cat),
    )
)

# ============================
# 10. Label Encode Target
# ============================
# Learn the label -> integer mapping on the training targets, then apply it
# to both parts of the split.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# ============================
# 11. Model Training
# ============================
# Same gradient-boosting hyper-parameters as the first model cell of the
# pipeline section.
gb_params = {
    "n_estimators": 600,
    "learning_rate": 0.02,
    "max_depth": 4,
    "min_samples_split": 42,
    "min_samples_leaf": 18,
    "max_features": 0.7,
    "subsample": 0.7,
    "random_state": 42,
}
model = GradientBoostingClassifier(**gb_params)

model.fit(X_train_final, y_train_enc)

# ============================
# 12. Predictions
# ============================
# Hard labels for accuracy and the report; class probabilities for log loss
# and ROC-AUC.
y_pred = model.predict(X_test_final)
y_pred_proba = model.predict_proba(X_test_final)

# ============================
# 13. Evaluation Metrics
# ============================
# The emoji in the printed strings were mis-decoded (UTF-8 bytes read in a
# legacy encoding); the intended characters are restored below.

# Log Loss
loss = log_loss(y_test_enc, y_pred_proba)
print(f"\n🔍 Log Loss: {loss:.5f}")

# Accuracy
acc = accuracy_score(y_test_enc, y_pred)
print(f"🎯 Accuracy: {acc:.5f}")

# ROC-AUC (multiclass, one-vs-rest)
roc_auc = roc_auc_score(y_test_enc, y_pred_proba, multi_class='ovr')
print(f"🧲 ROC-AUC (OVR): {roc_auc:.5f}")

# Classification Report (target_names maps integer codes back to labels)
print("\n📄 Classification Report:")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

# Confusion Matrix (rows: true class, columns: predicted class)
print("\n🧩 Confusion Matrix:")
print(confusion_matrix(y_test_enc, y_pred))

# ============================
# 14. Predict on test.csv
# ============================
# One probability column per class, in the order of le.classes_.
probs = model.predict_proba(test_final)
class_names = le.classes_

submission = pd.DataFrame(probs, columns=[f"Status_{cls}" for cls in class_names])
# Insert ids by position: passing the Series itself would align on index
# labels and produce NaN ids if `test` ever had a non-default index.
submission.insert(0, 'id', test_ids.to_numpy())
submission.to_csv("submission_no_pipeline_metrics.csv", index=False)
# The status string had a mis-decoded check mark; restored here.
print("\n✅ Submission file created successfully!")
print(submission.head())
🔍 Log Loss: 0.38177
🎯 Accuracy: 0.85033
🧲 ROC-AUC (OVR): 0.91439

📄 Classification Report:
              precision    recall  f1-score   support

           C       0.86      0.94      0.90      2004
          CL       0.62      0.12      0.20        67
           D       0.82      0.72      0.76       929

    accuracy                           0.85      3000
   macro avg       0.77      0.59      0.62      3000
weighted avg       0.84      0.85      0.84      3000


🧩 Confusion Matrix:
[[1877    1  126]
 [  34    8   25]
 [ 259    4  666]]

✅ Submission file created successfully!
      id  Status_C  Status_CL  Status_D
0  15000  0.949418   0.004539  0.046043
1  15001  0.980001   0.003756  0.016243
2  15002  0.927062   0.008343  0.064594
3  15003  0.061800   0.164013  0.774188
4  15004  0.980146   0.003517  0.016338