In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [1]:
!unzip playground-series-s4e1.zip

Archive:  playground-series-s4e1.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [4]:
# Load the two competition splits from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Keep the test ids for the submission file and separate the label
# from the training features (new frame instead of an in-place drop).
test_ids = test_df.id
target = train_df.Exited
train_df = train_df.drop(columns=["Exited"])

# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives every row a unique label; without it the labels
# 0..n-1 of each file would appear twice in the combined frame.
data = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [6]:
# Sanity check: stacking train and test must neither add nor lose rows.
assert len(data) == len(train_df) + len(test_df)

In [5]:
# Column names split by dtype: object-typed columns vs. everything else.
categ_cols = list(data.select_dtypes(include="object").columns)
num_cols = list(data.select_dtypes(exclude="object").columns)


In [6]:
# One-hot encode every object column. drop_first=True omits the first level
# of each encoded column; indicators are stored as uint16.
data = pd.get_dummies(
    data,
    columns=categ_cols,
    drop_first=True,
    dtype=np.uint16,
)


In [7]:
from sklearn.impute import SimpleImputer

# Fill each missing value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(data)

# Wrap the imputed values in a DataFrame again under the original column
# names. No index is passed, so the rows get a fresh default index.
data = pd.DataFrame(data=X_imputed, columns=data.columns)

: 

In [None]:
from sklearn.preprocessing import RobustScaler

# Every column that is not object-typed is scaled.
num_cols = list(data.select_dtypes(exclude='object').columns)

# Fit the scaler on the combined (train + test) frame ...
scaler = RobustScaler()
scaler.fit(data[num_cols])

# ... and overwrite those columns with their scaled values.
data[num_cols] = scaler.transform(data[num_cols])

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds
# are the same on every run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Split the preprocessed frame back into its labelled and unlabelled parts.
# The training rows were stacked first, so the split is purely positional.
# A label slice (`.loc[a:]`) includes its start label and would put the last
# training row into the test part; `.iloc` with the row count does not.
n_train = len(train_df)
train_final = data.iloc[:n_train].copy()
test_final = data.iloc[n_train:].reset_index(drop=True).copy()

# Each part must have exactly as many rows as the file it came from.
assert len(train_final) == len(train_df)
assert len(test_final) == len(test_df)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation, stratified on the
# target; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_final,
    target,
    test_size=0.2,
    random_state=3935,
    stratify=target,
)

In [None]:
%%capture

import lightgbm as lgb
from sklearn.metrics import log_loss

# List to store trained LightGBM models
lg_models = []

# Parameters for the LightGBM model
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'max_depth': 15,
    'min_child_samples': 13,
    'learning_rate': 0.05285597081335651,
    'n_estimators': 284,
    'min_child_weight': 5,
    'subsample': 0.7717873512945741,
    'colsample_bytree': 0.10012816493265511,
    'reg_alpha': 0.8767668608061822,
    'reg_lambda': 0.8705834466355764,
    'random_state': 42,
    'verbose': -1
}

# Training multiple LightGBM models using Stratified K-Fold
for x_idx, val_idx in skf.split(X_train, y_train):
    LGBModel = lgb.LGBMClassifier(**params)
    LGBModel.fit(X_train.iloc[x_idx], y_train[x_idx], eval_set=[(X_train.iloc[val_idx], y_train[val_idx])])
    lg_models.append(LGBModel)

In [None]:
# Log loss of every fold model on the held-out 20% split.
for fold_no, LGBModel in enumerate(lg_models, start=1):
    y_pred = LGBModel.predict_proba(X_test)
    fold_loss = log_loss(y_test, y_pred)
    print(f'Model {fold_no} Log Loss: ', fold_loss)

In [None]:
%%capture

import xgboost as xgb

# List to store trained XGBoost models
xgb_models = []

# Parameters for the XGBoost model
params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'n_estimators': 567,
    'learning_rate': 0.03820381248841593,
    'max_depth': 9,
    'subsample': 0.5717706003972762,
    'colsample_bytree': 0.1386492762520236,
    'min_child_weight': 6,
    'random_state': 42,
    'verbose': -1
}

# Training multiple XGBoost models using Stratified K-Fold
for x_idx, val_idx in skf.split(X_train, y_train):
    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_train.iloc[x_idx], y_train[x_idx], eval_set=[(X_train.iloc[val_idx], y_train[val_idx])], verbose=0)
    xgb_models.append(xgb_model)

In [None]:
# Log loss of every XGBoost fold model on the held-out 20% split.
for fold_no, xgb_model in enumerate(xgb_models, start=1):
    y_pred = xgb_model.predict_proba(X_test)
    fold_loss = log_loss(y_test, y_pred)
    print(f'Model {fold_no} Log Loss: ', fold_loss)

In [None]:
from catboost import CatBoostClassifier

# List to store trained CatBoost models (one per CV fold)
cat_models = []

# Parameters for the CatBoost model
# NOTE(review): depth=43 is unusually large — confirm CatBoost accepts it
# with grow_policy='Lossguide' in the installed version.
params = {
    'logging_level': 'Silent',
    'random_seed': 42,
    'iterations': 593,
    'depth': 43,
    'min_data_in_leaf': 42,
    'learning_rate': 0.023456006693305914,
    'subsample': 0.8018560299887264,
    'random_strength': 0.04176274518438195,
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Bernoulli',
}

# Training multiple CatBoost models using Stratified K-Fold.
# skf.split yields *positional* row numbers, while y_train still carries the
# shuffled row labels left by train_test_split, so both X and y must be
# indexed with .iloc (plain y_train[idx] would look the numbers up as labels).
for x_idx, val_idx in skf.split(X_train, y_train):
    cat_model = CatBoostClassifier(**params)
    cat_model.fit(
        X=X_train.iloc[x_idx],
        y=y_train.iloc[x_idx],
        eval_set=[(X_train.iloc[val_idx], y_train.iloc[val_idx])],
    )
    cat_models.append(cat_model)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

# Meta-learner: a two-hidden-layer MLP (64 and 32 units) trained with Adam.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    beta_1=0.9,
    beta_2=0.999,
    momentum=0.9,
    nesterovs_momentum=True,
    batch_size=32,
    validation_fraction=0.1,
    max_iter=1000,
    random_state=42,
)

# Stack the most recently trained LightGBM, XGBoost and CatBoost models as
# base estimators; their cross-validated predictions (folds from `skf`)
# are the inputs of the MLP.
stacking_model = StackingClassifier(
    estimators=[
        ('LGBM', LGBModel),
        ('XGB', xgb_model),
        ('CAT', cat_model),
    ],
    final_estimator=mlp,
    cv=skf,
)

In [None]:
%%capture
# Fitting the StackingClassifier on the training data
stacking_model.fit(X_train, y_train)

In [None]:
# Score the unlabelled competition rows.
# `test_final` was cut from the combined frame after encoding, imputation and
# scaling, so it already has the feature columns the models were trained on;
# re-encoding the raw `test_df` here would yield a different column layout.
# Taking the trailing len(test_ids) rows guarantees one prediction per test id.
X_pred = test_final.iloc[-len(test_ids):]

# predict_proba returns one column per class in sorted label order; for a
# 0/1 target, column 1 is the probability of Exited == 1.
y_test_pred = stacking_model.predict_proba(X_pred)[:, 1]

# Submission table: the original test ids plus the predicted probability.
output_df = pd.DataFrame({
    'id': test_ids.to_numpy(),
    'Exited': y_test_pred,
})

# Create csv with columns "id" and "Exited"
output_df.to_csv('submission.csv', index=False)