In [None]:
import kagglehub
# Authenticate with Kaggle before any competition data is requested.
# NOTE(review): presumably this prompts for credentials interactively — confirm
# how it behaves in a headless "Restart & Run All".
kagglehub.login()

In [None]:
# Fetch the competition files via kagglehub; the return value is kept as the
# directory the CSVs are later read from.
competition_slug = 'playground-series-s5e8'
data_path = kagglehub.competition_download(competition_slug)

print('Data source import complete.')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

Load the train and test datasets

In [None]:
# Build both CSV paths first, then read the train and test frames from them.
train_csv_path = os.path.join(data_path, 'train.csv')
test_csv_path = os.path.join(data_path, 'test.csv')

data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# EDA

In [None]:
# (n_rows, n_columns) of the training frame.
data.shape

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Per-column count and percentage of missing values, most-missing first.
null_mask = data.isna()
missing_counts = null_mask.sum()
missing_pct = (null_mask.mean() * 100).round(2)

missing_table = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percentage (%)': missing_pct,
})

print(missing_table.sort_values(by='Missing Values', ascending=False))

In [None]:
# Number of distinct values per column.
data.nunique()

# Data visualisation

Check dataset balance

In [None]:
# Share of each target class. The two-element `explode` offsets the second
# wedge, so this cell expects exactly two classes.
target_counts = data['y'].value_counts()
ax = target_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Target distribution')
ax.set_ylabel('')
plt.show()

In [None]:
# Distribution of `age` with a KDE overlay. Titled and closed with plt.show()
# like the other plotting cells, so the bare Axes repr is not echoed as output.
sns.histplot(data=data, x="age", kde=True, bins=70)
plt.title('Age distribution')
plt.show()

In [None]:
# Frequency of each job category on a wide figure, tick labels slanted so
# they do not overlap.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="job", ax=ax)
ax.set_xlabel('Type of job')
ax.set_ylabel('')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Frequency of each marital status.
ax = sns.countplot(x="marital", data=data)
ax.set(xlabel='Marital status', ylabel='')
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each education level.
ax = sns.countplot(x="education", data=data)
ax.set(xlabel='Level of education', ylabel='')
plt.tight_layout()
plt.show()

In [None]:
# Share of customers per `default` value. The two-element `explode` offsets
# the second wedge, so this cell expects exactly two values.
default_counts = data['default'].value_counts()
ax = default_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Has credit in default?')
ax.set_ylabel('')
plt.show()

In [None]:
# Distribution of `balance` with a KDE overlay. Titled and closed with
# plt.show() like the other plotting cells, so the Axes repr is not echoed.
sns.histplot(data=data, x="balance", kde=True, bins=50)
plt.title('Balance distribution')
plt.show()

In [None]:
# Share of customers per `housing` value. The two-element `explode` offsets
# the second wedge, so this cell expects exactly two values.
housing_counts = data['housing'].value_counts()
ax = housing_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Has a housing loan?')
ax.set_ylabel('')
plt.show()

In [None]:
# Share of customers per `loan` value. The two-element `explode` offsets
# the second wedge, so this cell expects exactly two values.
loan_counts = data['loan'].value_counts()
ax = loan_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Has a personal loan?')
ax.set_ylabel('')
plt.show()

In [None]:
# Frequency of each contact channel.
ax = sns.countplot(x="contact", data=data)
ax.set(xlabel='Type of communication contact', ylabel='')
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each value of `day`, on a wide figure.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="day", ax=ax)
ax.set_xlabel('Last contact day of the month')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

In [None]:
# Frequency of each value of `month`, on a wide figure.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="month", ax=ax)
ax.set_xlabel('Last contact month of the year')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

In [None]:
# Distribution of `duration` with a KDE overlay. Titled and closed with
# plt.show() like the other plotting cells, so the Axes repr is not echoed.
sns.histplot(data=data, x="duration", kde=True, bins=50)
plt.title('Duration distribution')
plt.show()

In [None]:
# Distribution of `campaign` with a KDE overlay. Titled and closed with
# plt.show() like the other plotting cells, so the Axes repr is not echoed.
sns.histplot(data=data, x="campaign", kde=True, bins=50)
plt.title('Campaign distribution')
plt.show()

In [None]:
# Frequency of each value of `poutcome`, on a wide figure.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="poutcome", ax=ax)
ax.set_xlabel('Outcome of the previous marketing campaign')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

Correlation matrix

In [None]:
plt.figure(figsize=(16, 6))

# Pairwise correlation of the numeric columns only; non-numeric columns are skipped.
data_corr = data.corr(numeric_only=True)

# Plot the correlation matrix itself. Calling .corr() on it a second time
# would chart correlations between the columns of the correlation matrix,
# which is not the relationship between the features.
heatmap = sns.heatmap(data_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12})

plt.show()

# Data preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import shuffle

In [None]:
# Target column first, then the feature frame without the target and the row id.
y = data['y']
X = data.drop(columns=['y', 'id'])

In [None]:
# Numeric branch: a single standardisation step.
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

In [None]:
# Categorical branch: a single ordinal-encoding step.
categorical_pipeline = Pipeline(steps=[('encoder', OrdinalEncoder())])

In [None]:
# Route columns to a branch by dtype: int64/float64 go to the numeric
# pipeline, object columns to the categorical pipeline. No `remainder` is
# passed, so the ColumnTransformer default applies to any other dtype.
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])
object_columns = make_column_selector(dtype_include=['object'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numeric_columns),
        ('cat', categorical_pipeline, object_columns),
    ]
)

In [None]:
# Encode the target labels as integers; `y` is rebound to the encoded array.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Append hand-crafted flag, bin and transformed columns to the input frame.

    The input must provide the columns ``duration``, ``balance``, ``campaign``,
    ``age``, ``pdays`` and ``previous``. All original columns are kept and the
    input frame itself is never modified.

    Attributes
    ----------
    balance_min_ : scalar
        Minimum of ``balance`` seen in ``fit``. It is the shift applied before
        the square root, so every frame is transformed with the same offset.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the minimum ``balance`` of the fitting frame and return self."""
        self.balance_min_ = X['balance'].min()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added."""
        X = X.copy()

        # Use the offset learned in fit so train and test share one shift.
        # If transform is called on an unfitted instance, fall back to the
        # frame's own minimum (the previous behaviour).
        balance_min = getattr(self, 'balance_min_', None)
        if balance_min is None:
            balance_min = X['balance'].min()

        # Flags are cast to int: a raw bool column matches none of the
        # int64/float64/object dtype selectors used downstream and would be
        # dropped by the ColumnTransformer.
        X['long_call'] = (X['duration'] > 200).astype(int)
        X['balance_positive'] = (X['balance'] > 0).astype(int)
        X['campaign_multiple'] = (X['campaign'] > 2).astype(int)

        # pd.cut yields a categorical column; convert it to object so the
        # object-dtype selector sends it to the ordinal encoder. Ages outside
        # (17, 100] become NaN.
        X['age_bin'] = pd.cut(
            X['age'], bins=[17, 30, 60, 100], labels=['young', 'middle', 'senior']
        ).astype(object)

        X['duration_log'] = np.log1p(X['duration'])
        X['campaign_log'] = np.log1p(X['campaign'])
        # The +1 shift maps a pdays value of -1 to 0 before log1p.
        X['pdays_log'] = np.log1p(X['pdays'] + 1)
        X['previous_log'] = np.log1p(X['previous'])

        # Clip at 0 so a balance below the fitted minimum cannot produce a
        # negative argument (and hence NaN) under the square root.
        shifted_balance = (X['balance'] - balance_min + 1).clip(lower=0)
        X['balance_sqrt'] = np.sqrt(shifted_balance)
        X['age_squared'] = X['age'] ** 2

        return X

# Model building

In [None]:
import xgboost as xgb
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import lightgbm as lgb

In [None]:
# Ratio of class-0 to class-1 label counts.
counter = Counter(y)
n_negative, n_positive = counter[0], counter[1]
scale_pos_weight = n_negative / n_positive

In [None]:
# Gradient-boosting hyperparameters for a binary:logistic objective scored by AUC.
# NOTE(review): nothing visible in this notebook consumes this dict — confirm
# whether it is still needed.
best_params = {
    # tree shape and boosting schedule
    'max_depth': 11,
    'learning_rate': 0.08247101477015132,
    'n_estimators': 1000,
    # row / column sampling
    'subsample': 0.9808690492838653,
    'colsample_bytree': 0.5831655543160346,
    # split constraints and regularisation
    'min_child_weight': 1,
    'gamma': 0.1832798205532591,
    'lambda': 4.510522889747622,
    'alpha': 5.007953193043952,
    # runtime and objective
    'n_jobs': -1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

In [None]:
# LightGBM training parameters for the cross-validated model.
#
# L1/L2 regularisation is given once, under LightGBM's primary names
# `lambda_l1` / `lambda_l2`. The dict previously also set the aliases
# `reg_alpha` (6.0) and `reg_lambda` (4.0); when an alias and its primary name
# are both present LightGBM keeps the primary value and ignores the alias, and
# `verbosity: -1` hid the warning. The values kept here are the ones that were
# in effect.
# NOTE(review): if 6.0 / 4.0 were the intended strengths, set them on
# `lambda_l1` / `lambda_l2` instead.
params = {
    'objective': "binary",
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': "gbdt",
    'learning_rate': 0.01,
    'max_depth': 20,
    'num_leaves': 200,
    'max_bin': 400,
    'subsample': 0.85,
    'colsample_bytree': 0.7,
    'subsample_freq': 1,
    'min_child_samples': 25,
    'min_split_gain': 0.001,
    'n_jobs': -1,
    'lambda_l1': 0.5,
    'lambda_l2': 0.3
}


In [None]:
# Full transformation chain: engineered features first, then dtype-based
# scaling and encoding.
pipeline_steps = [
    ('features', FeatureEngineer()),
    ('preprocessing', preprocessor),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the chain on the training features only, then apply the fitted chain to
# both the training and the test frame.
pipeline.fit(X)
X_preprocessed = pipeline.transform(X)
X_test_preprocessed = pipeline.transform(test_data)

In [None]:
# 10-fold stratified cross-validation with LightGBM.
# Each fold's model fills the out-of-fold slots of its validation rows and
# adds an equal share (1 / n_splits) to the averaged test prediction.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_preprocessed))
test_preds = np.zeros(len(X_test_preprocessed))

for fold_no, (train_idx, val_idx) in enumerate(skf.split(X_preprocessed, y), start=1):
    print(f"Fold {fold_no}")

    X_train = X_preprocessed[train_idx]
    y_train = y[train_idx]
    X_val = X_preprocessed[val_idx]
    y_val = y[val_idx]

    # The validation set references the training set.
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    lgb_model = lgb.train(
        params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=[dval],
    )

    fold_val_preds = lgb_model.predict(X_val)
    oof_preds[val_idx] = fold_val_preds

    fold_test_preds = lgb_model.predict(X_test_preprocessed)
    test_preds += fold_test_preds / skf.n_splits

# Score the out-of-fold predictions against the full label vector
cv_roc = roc_auc_score(y, oof_preds)
print(f"Cross-Validation ROC AUC: {cv_roc:.4f}")


# Submission

In [None]:
# Two-column submission frame: the test ids and the fold-averaged prediction.
submission = test_data[['id']].assign(y=test_preds)

In [None]:
# Display the submission frame for a visual sanity check.
submission

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)