In [10]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations

In [12]:
# Directory holding the competition CSVs (relative to the notebook).
FILE_PATH = "dataset"

# Read both splits with the same path template.
train_df, test_df = (
    pd.read_csv(f"{FILE_PATH}/{split_name}.csv") for split_name in ("train", "test")
)

# Quick sanity check: (rows, columns) of each split, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(750000, 18)
(250000, 17)


# FUNCTIONS FOR FEATURE ENGINEERING

In [13]:
def drop_columns(X: pd.DataFrame):
    """Return a new frame without the ``default``, ``previous`` and ``id`` columns.

    Columns from that list that are not present are simply skipped, so the
    function works on both the train and the test frame. The input frame is
    not modified.
    """
    unwanted = ("default", "previous", "id")
    present = [name for name in unwanted if name in X.columns]
    return X.drop(columns=present)

def add_columns(X: pd.DataFrame):
    """Return a copy of ``X`` with ``age_category`` and ``quarter`` appended.

    * ``age_category`` bins ``age`` with ``pd.cut`` into the right-closed
      intervals (0, 20], (20, 40], (40, 60] and (60, inf), labelled
      'Child', 'Young', 'Middle-aged' and 'Senior'.
    * ``quarter`` maps the three-letter lowercase ``month`` value to
      'q1'..'q4'; any other value becomes 'Unknown'.

    The input frame is not modified.
    """
    months_by_quarter = {
        'q1': ['jan', 'feb', 'mar'],
        'q2': ['apr', 'may', 'jun'],
        'q3': ['jul', 'aug', 'sep'],
        'q4': ['oct', 'nov', 'dec'],
    }

    enriched = X.copy()

    # One boolean mask per quarter, in the same order as the labels.
    month = enriched['month']
    quarter_labels = list(months_by_quarter)
    quarter_masks = [month.isin(names) for names in months_by_quarter.values()]

    age_bins = [0, 20, 40, 60, np.inf]
    age_labels = ['Child', 'Young', 'Middle-aged', 'Senior']
    enriched['age_category'] = pd.cut(enriched['age'], bins=age_bins, labels=age_labels)

    enriched['quarter'] = np.select(quarter_masks, quarter_labels, default='Unknown')

    return enriched


def add_combine_columns(X: pd.DataFrame):
    """Return a copy of ``X`` with one interaction column per pair of object columns.

    For every unordered pair ``(a, b)`` of columns whose dtype compares equal
    to ``'object'`` (taken in column order), a column named ``"a_b"`` is added
    whose values are the two source values cast to ``str`` and joined by an
    underscore. Columns of any other dtype (numeric, ``category``, ...) are
    not combined. The input frame is not modified.
    """
    object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

    combined = X.copy()

    for left, right in combinations(object_cols, 2):
        left_text = combined[left].astype(str)
        right_text = combined[right].astype(str)
        combined['_'.join((left, right))] = left_text + '_' + right_text

    return combined






In [19]:
# Run both splits through the same three feature-engineering steps:
# drop unused columns -> add age/quarter features -> add pairwise combinations.
final_train_df = (
    train_df
    .pipe(drop_columns)
    .pipe(add_columns)
    .pipe(add_combine_columns)
)
final_test_df = (
    test_df
    .pipe(drop_columns)
    .pipe(add_columns)
    .pipe(add_combine_columns)
)

# Keep the test ids (taken from the raw frame, since drop_columns removes
# 'id') as a one-column frame for the submission file.
test_id = test_df['id'].to_frame()

# Split the label off the training features; pop removes 'y' in place.
target = final_train_df.pop('y')

In [15]:
# Inspect the engineered training features: column names, non-null counts and dtypes.
final_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 52 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   age                 750000 non-null  int64   
 1   job                 750000 non-null  object  
 2   marital             750000 non-null  object  
 3   education           750000 non-null  object  
 4   balance             750000 non-null  int64   
 5   housing             750000 non-null  object  
 6   loan                750000 non-null  object  
 7   contact             750000 non-null  object  
 8   day                 750000 non-null  int64   
 9   month               750000 non-null  object  
 10  duration            750000 non-null  int64   
 11  campaign            750000 non-null  int64   
 12  pdays               750000 non-null  int64   
 13  poutcome            750000 non-null  object  
 14  age_category        750000 non-null  category
 15  quarter          

# Feature Engineering Pipeline

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

# Split the engineered columns into the three groups the ColumnTransformer
# encodes differently. 'default', 'previous' and 'id' were already removed by
# drop_columns, so listing them below is only a guard.
one_hot_cols = ['housing', 'loan']

# Classify every column once: True for object/category dtype, False otherwise.
is_categorical = {
    col: final_train_df[col].dtype in ('object', 'category')
    for col in final_train_df.columns
}

# Categorical columns that are not one-hot encoded get ordinal codes.
label_cols = [
    col for col, categorical in is_categorical.items()
    if categorical and col not in ['housing', 'loan', 'default', 'previous', 'id']
]

# Everything that is not object/category is treated as numeric.
num_cols = [
    col for col, categorical in is_categorical.items()
    if not categorical and col not in ['id', 'previous']
]

print(f"Label Columns: {label_cols}")
print(f"one_hot Columns: {one_hot_cols}")
print(f"num Columns: {num_cols}")



Label Columns: ['job', 'marital', 'education', 'contact', 'month', 'poutcome', 'age_category', 'quarter', 'job_marital', 'job_education', 'job_housing', 'job_loan', 'job_contact', 'job_month', 'job_poutcome', 'job_quarter', 'marital_education', 'marital_housing', 'marital_loan', 'marital_contact', 'marital_month', 'marital_poutcome', 'marital_quarter', 'education_housing', 'education_loan', 'education_contact', 'education_month', 'education_poutcome', 'education_quarter', 'housing_loan', 'housing_contact', 'housing_month', 'housing_poutcome', 'housing_quarter', 'loan_contact', 'loan_month', 'loan_poutcome', 'loan_quarter', 'contact_month', 'contact_poutcome', 'contact_quarter', 'month_poutcome', 'month_quarter', 'poutcome_quarter']
one_hot Columns: ['housing', 'loan']
num Columns: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays']


In [17]:
# Column-wise preprocessing, one entry per column group:
#   * label_cols   -> integer codes; categories unseen at fit time map to -1
#   * one_hot_cols -> one-hot with the first level dropped, dense output
#   * num_cols     -> min-max scaling
preprocessing = ColumnTransformer(
    transformers=[
        (
            'ordinal_encode',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            label_cols,
        ),
        (
            'one_hot',
            OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
            one_hot_cols,
        ),
        (
            'scale_numeric',
            MinMaxScaler(),
            num_cols,
        ),
    ]
)

# The transformer is intentionally left unfitted here: it is fitted on the
# training part of each cross-validation fold further down.


In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

# Baseline experiment: stratified K-fold CatBoost, averaging the per-fold
# predictions on the competition test set.
N_SPLITS = 5

cat_clf = CatBoostClassifier(
    n_estimators=10000,
    learning_rate=0.1,
    loss_function='CrossEntropy',
    use_best_model=True,
    allow_writing_files=False,
    verbose=500,
)

skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

roc_scores = []
test_pred = np.zeros(len(final_test_df))

for fold, (train_idx, test_idx) in enumerate(skfold.split(final_train_df, target), start=1):
    # X_test / Y_test are the held-out validation fold, not the competition test set.
    X_train, Y_train = final_train_df.iloc[train_idx], target.iloc[train_idx]
    X_test, Y_test = final_train_df.iloc[test_idx], target.iloc[test_idx]

    # Learn encoders/scaler from the training fold only, then apply to the held-out fold.
    X_train_transformed = preprocessing.fit_transform(X_train)
    X_test_transformed = preprocessing.transform(X_test)

    # Wrap the arrays back into frames so the model sees named features.
    feature_names = preprocessing.get_feature_names_out().tolist()
    X_train_df = pd.DataFrame(X_train_transformed, index=X_train.index, columns=feature_names)
    X_test_df = pd.DataFrame(X_test_transformed, index=X_test.index, columns=feature_names)

    # Fresh, unfitted copy of the template classifier for this fold.
    # NOTE(review): the held-out fold drives early stopping / best-iteration
    # selection and is also the fold scored below, so the reported AUC may be
    # slightly optimistic.
    model = clone(cat_clf)
    model.fit(
        X_train_df,
        Y_train,
        eval_set=[(X_test_df, Y_test)],
        early_stopping_rounds=200,
    )

    # Score the held-out fold.
    Y_pred = model.predict_proba(X_test_df)[:, 1]
    roc_score = roc_auc_score(Y_test, Y_pred)
    roc_scores.append(roc_score)

    # Predict the competition test set with this fold's preprocessing and model.
    X_final_test_transformed = preprocessing.transform(final_test_df)
    X_final_test_df = pd.DataFrame(X_final_test_transformed, columns=feature_names)
    test_pred += model.predict_proba(X_final_test_df)[:, 1]

    print(f"Fold {fold} -> ROC-AUC: {roc_score:.5f}")

print(f"Average Fold ROC-AUC: {np.mean(roc_scores):.5f} ± {np.std(roc_scores):.5f}")

# Turn the accumulated sum into the mean over folds.
test_pred /= N_SPLITS

0:	learn: 0.5381433	test: 0.5380810	best: 0.5380810 (0)	total: 81.2ms	remaining: 13m 31s
500:	learn: 0.1456294	test: 0.1482413	best: 0.1482413 (500)	total: 9.79s	remaining: 3m 5s
1000:	learn: 0.1390432	test: 0.1456845	best: 0.1456845 (1000)	total: 19.4s	remaining: 2m 54s
1500:	learn: 0.1341604	test: 0.1444512	best: 0.1444475 (1499)	total: 29s	remaining: 2m 44s
2000:	learn: 0.1301277	test: 0.1438137	best: 0.1438116 (1991)	total: 38.7s	remaining: 2m 34s
2500:	learn: 0.1266166	test: 0.1435151	best: 0.1435139 (2497)	total: 48.6s	remaining: 2m 25s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.143475749
bestIteration = 2534

Shrink model to first 2535 iterations.
Fold 1 -> ROC-AUC: 0.96858
0:	learn: 0.5328947	test: 0.5328592	best: 0.5328592 (0)	total: 22.1ms	remaining: 3m 40s
500:	learn: 0.1452700	test: 0.1505684	best: 0.1505684 (500)	total: 9.89s	remaining: 3m 7s
1000:	learn: 0.1386797	test: 0.1482059	best: 0.1482059 (1000)	total: 19.6s	remaining: 2m 56s
1500:	learn: 

In [24]:
from datetime import datetime
from pathlib import Path

# Timestamp (YYMMDDHHMMSS) so every run writes a distinct file.
now = datetime.now()
datetime_string = now.strftime("%y%m%d%H%M%S")

# Attach the averaged fold predictions to the test ids.
# NOTE(review): test_pred is whatever the most recently executed CV cell left
# behind (both the baseline and the SMOTE cell assign it) - confirm which
# model this submission is meant to contain.
test_id['y'] = test_pred

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
submission_dir = Path("submission")
submission_dir.mkdir(parents=True, exist_ok=True)

test_id.to_csv(
    submission_dir / f"submission_{datetime_string}.csv",
    index=False
)

In [23]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# SMOTE experiment: same stratified K-fold CatBoost setup as the baseline, but
# the training part of every fold is oversampled with SMOTE before fitting.
N_SPLITS = 5

cat_clf = CatBoostClassifier(
    n_estimators=10000,
    learning_rate=0.1,
    loss_function='CrossEntropy',
    use_best_model=True,
    allow_writing_files=False,
    verbose=500,
)

skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

roc_scores = []
test_pred = np.zeros(len(final_test_df))

for fold, (train_idx, test_idx) in enumerate(skfold.split(final_train_df, target), start=1):
    # X_test / Y_test are the held-out validation fold, not the competition test set.
    X_train, Y_train = final_train_df.iloc[train_idx].copy(), target.iloc[train_idx]
    X_test, Y_test = final_train_df.iloc[test_idx].copy(), target.iloc[test_idx]

    # Learn encoders/scaler from the training fold only, then apply to the held-out fold.
    X_train_transformed = preprocessing.fit_transform(X_train)
    X_test_transformed = preprocessing.transform(X_test)

    # Wrap the arrays back into frames so the model sees named features.
    feature_names = preprocessing.get_feature_names_out().tolist()
    X_train_df = pd.DataFrame(X_train_transformed, index=X_train.index, columns=feature_names)
    X_test_df = pd.DataFrame(X_test_transformed, index=X_test.index, columns=feature_names)

    # Oversample the training fold only; the held-out fold keeps its original rows.
    # NOTE(review): SMOTE builds synthetic rows by interpolating between
    # neighbours, and most features here are ordinal category codes, so
    # synthetic rows can carry in-between codes that match no real category.
    # A categorical-aware sampler (e.g. imblearn's SMOTENC) may be more
    # appropriate - to be evaluated.
    smote = SMOTE(random_state=42)
    X_train_smote, Y_train_smote = smote.fit_resample(X_train_df, Y_train)
    X_train_smote = pd.DataFrame(X_train_smote, columns=feature_names)

    # Fresh, unfitted copy of the template classifier, trained on the augmented data.
    model = clone(cat_clf)
    model.fit(
        X_train_smote,
        Y_train_smote,
        eval_set=[(X_test_df, Y_test)],
        early_stopping_rounds=200,
    )

    # Score on the original (non-resampled) held-out fold.
    Y_pred = model.predict_proba(X_test_df)[:, 1]
    roc_score = roc_auc_score(Y_test, Y_pred)
    roc_scores.append(roc_score)

    # Predict the competition test set with this fold's preprocessing and model.
    X_final_test_transformed = preprocessing.transform(final_test_df)
    X_final_test_df = pd.DataFrame(X_final_test_transformed, columns=feature_names)
    test_pred += model.predict_proba(X_final_test_df)[:, 1]

    print(f"Fold {fold} -> ROC-AUC: {roc_score:.5f}")

print(f"Average Fold ROC-AUC: {np.mean(roc_scores):.5f} ± {np.std(roc_scores):.5f}")

# Turn the accumulated sum into the mean over folds.
test_pred /= N_SPLITS


0:	learn: 0.5850744	test: 0.5665244	best: 0.5665244 (0)	total: 42.3ms	remaining: 7m 3s
500:	learn: 0.0979269	test: 0.1536242	best: 0.1536242 (500)	total: 16.9s	remaining: 5m 19s
1000:	learn: 0.0908666	test: 0.1491804	best: 0.1491804 (1000)	total: 33.7s	remaining: 5m 2s
1500:	learn: 0.0867470	test: 0.1472003	best: 0.1472003 (1500)	total: 50.1s	remaining: 4m 43s
2000:	learn: 0.0838217	test: 0.1462185	best: 0.1462185 (2000)	total: 1m 6s	remaining: 4m 27s
2500:	learn: 0.0814072	test: 0.1456973	best: 0.1456951 (2499)	total: 1m 23s	remaining: 4m 10s
3000:	learn: 0.0791424	test: 0.1453159	best: 0.1452908 (2941)	total: 1m 39s	remaining: 3m 53s
3500:	learn: 0.0771675	test: 0.1451574	best: 0.1451352 (3455)	total: 1m 56s	remaining: 3m 35s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1451266053
bestIteration = 3573

Shrink model to first 3574 iterations.
Fold 1 -> ROC-AUC: 0.96805
0:	learn: 0.5852902	test: 0.5665958	best: 0.5665958 (0)	total: 44.2ms	remaining: 7m 21s
500:	l