In [2]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations

In [3]:
FILE_PATH="dataset"

train_df = pd.read_csv(f"{FILE_PATH}/train.csv")
test_df = pd.read_csv(f"{FILE_PATH}/test.csv")

print(train_df.shape)
print(test_df.shape)

(750000, 18)
(250000, 17)


# FUNCTIONS FOR FEATURE ENGINEERING

In [4]:
def drop_columns(X: pd.DataFrame):
    X_copy = X.copy()
    if "default" in X_copy: 
        X_copy.drop("default",axis=1,inplace=True)
    if "previous" in X_copy:
        X_copy.drop("previous",axis=1,inplace=True)
    if "id" in X:
        X_copy.drop("id",axis=1, inplace=True)
    
    return X_copy

def add_columns(X: pd.DataFrame):
    X_copy = X.copy()
    
    # Use .isin() instead of 'in'
    quarter_conditions = [
        X_copy['month'].isin(['jan','feb','mar']),
        X_copy['month'].isin(['apr','may','jun']),
        X_copy['month'].isin(['jul','aug','sep']),
        X_copy['month'].isin(['oct','nov','dec'])
    ]
    
    choices = ['q1','q2','q3','q4']
    
    X_copy['age_category'] = pd.cut(
        X_copy['age'],
        bins=[0, 20, 40, 60, np.inf],
        labels=['Child', 'Young', 'Middle-aged', 'Senior']
    )
    
    X_copy['quarter'] = np.select(quarter_conditions, choices, default='Unknown')
    
    return X_copy


def add_combine_columns(X: pd.DataFrame):
    
    cat_cols = [col for col in X.columns if X[col].dtype == 'object']
    
    X_copy = X.copy()
    
    for col1, col2 in combinations(cat_cols, 2):
        new_col = '_'.join([col1, col2])
        combine_val = X_copy[col1].astype(str) + '_' + X_copy[col2].astype(str)
        X_copy[new_col] = combine_val
    
    return X_copy






In [5]:
final_train_df = add_columns(drop_columns(train_df))
final_test_df = add_columns(drop_columns(test_df))

test_id = pd.DataFrame(test_df['id'])

Y_Train = final_train_df['y']
final_train_df.drop('y',axis=1,inplace=True)

In [6]:
final_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   age           750000 non-null  int64   
 1   job           750000 non-null  object  
 2   marital       750000 non-null  object  
 3   education     750000 non-null  object  
 4   balance       750000 non-null  int64   
 5   housing       750000 non-null  object  
 6   loan          750000 non-null  object  
 7   contact       750000 non-null  object  
 8   day           750000 non-null  int64   
 9   month         750000 non-null  object  
 10  duration      750000 non-null  int64   
 11  campaign      750000 non-null  int64   
 12  pdays         750000 non-null  int64   
 13  poutcome      750000 non-null  object  
 14  age_category  750000 non-null  category
 15  quarter       750000 non-null  object  
dtypes: category(1), int64(6), object(9)
memory usage: 86.5+ MB


# Feature Engineering PipeLine

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

# Define column groups BEFORE dropping
label_cols = [
    col for col in final_train_df.columns 
    if final_train_df[col].dtype in ('object','category')
    and col not in ['housing', 'loan', 'default', 'previous', 'id']  # Exclude columns to drop
]

one_hot_cols = ['housing', 'loan']

num_cols = [col for col in final_train_df.columns 
    if final_train_df[col].dtype not in ('object','category') and col not in ['id','previous']
]

print(f"Label Columns: {label_cols}")
print(f"one_hot Columns: {one_hot_cols}")
print(f"num Columns: {num_cols}")



Label Columns: ['job', 'marital', 'education', 'contact', 'month', 'poutcome', 'age_category', 'quarter']
one_hot Columns: ['housing', 'loan']
num Columns: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays']


In [8]:
# Preprocessing only uses remaining columns
preprocessing = ColumnTransformer([
    ('ordinal_encode', 
     OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
     label_cols),
    
    ('one_hot', 
     OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), 
     one_hot_cols),
    
    ('scale_numeric', 
     MinMaxScaler(), 
     num_cols)
])


# Fit and transform
X_train_processed = preprocessing.fit_transform(final_train_df)


In [9]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

model = XGBClassifier(
    objective="binary:logistic",
    eval_metric='auc',
    subsample=0.8,
    colsample_bytree=0.7,
    grow_policy="lossguide",
    seed=42,
    max_leaves=64,
    learning_rate=0.1
)

skfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# ✅ This gives you AUC scores for each fold
cv_scores = cross_val_score(model, X_train_processed, Y_Train, cv=skfold, scoring='roc_auc')

print(f"AUC Scores per fold: {cv_scores}")
print(f"Mean AUC: {cv_scores.mean():.4f}")
print(f"Std AUC: {cv_scores.std():.4f}")


AUC Scores per fold: [0.96431386 0.96299724 0.96324252 0.96433939 0.96372939]
Mean AUC: 0.9637
Std AUC: 0.0005


In [10]:
model.fit(X_train_processed,Y_Train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
X_test = preprocessing.transform(final_test_df)

In [12]:
Y_test = model.predict_proba(X_test)[-1]
Y_test

array([0.8958755 , 0.10412451], dtype=float32)

In [13]:
# Get probability of positive class for ALL samples
Y_test = model.predict_proba(X_test)[:, 1]  # ✅ Gets all rows, class 1 probability

test_id['y'] = Y_test

In [14]:
from datetime import datetime

# Current datetime as YYMMDDHHMMSS
now = datetime.now()
datetime_string = now.strftime("%y%m%d%H%M%S")

# Use CSV if Parquet causes issues
test_id.to_csv(
    f"submission/submission_{datetime_string}.csv",
    index=False
)