In [29]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [30]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("First 5 rows of train_df: \n", train_df.head(5))
print("First 5 rows of test_df: \n", test_df.head(5))

First 5 rows of train_df: 
    id  age          job  marital  education default  balance housing loan  \
0   0   42   technician  married  secondary      no        7      no   no   
1   1   38  blue-collar  married  secondary      no      514      no   no   
2   2   36  blue-collar  married  secondary      no      602     yes   no   
3   3   27      student   single  secondary      no       34     yes   no   
4   4   26   technician  married  secondary      no      889     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome  y  
0  cellular   25   aug       117         3     -1         0  unknown  0  
1   unknown   18   jun       185         1     -1         0  unknown  0  
2   unknown   14   may       111         2     -1         0  unknown  0  
3   unknown   28   may        10         2     -1         0  unknown  0  
4  cellular    3   feb       902         1     -1         0  unknown  1  
First 5 rows of test_df: 
        id  age            job  marital

In [31]:
train_df['was_contacted_previous'] = train_df['pdays'].apply(lambda x: 1 if x > 0 else 0)
test_df['was_contacted_previous'] = test_df['pdays'].apply(lambda x: 1 if x > 0 else 0)

In [32]:
train_df.loc[train_df['pdays'] == -1, 'pdays'] = 99999
test_df.loc[test_df['pdays'] == -1, 'pdays'] = 99999

In [33]:
test_id_placeholder = test_df['id']

In [34]:
cat_cols=train_df.select_dtypes(include=['object']).columns
num_cols=train_df.select_dtypes(include=['int']).columns

print(f'Total Categorical Columns {len(cat_cols)}')
print(f'Total Numerical Columns {len(num_cols)}')

Total Categorical Columns 9
Total Numerical Columns 10


In [35]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['id', 'y'])
y = train_df['y']

test_df = test_df.drop(columns='id')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [36]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
cat_cols.append('was_contacted_previous')

In [37]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
test_pool = Pool(X_test, y_test, cat_features=cat_cols)

In [38]:
cat_model = CatBoostClassifier(
    depth=6,
    learning_rate= 0.05,
    iterations=1000,
    eval_metric= 'AUC',
    random_state=42,
    verbose=100
)
cat_model.fit(train_pool, eval_set=test_pool, use_best_model=True)

y_prob = cat_model.predict_proba(test_pool)[:,1]
print("AUC: ", roc_auc_score(y_test, y_prob))


0:	test: 0.9177710	best: 0.9177710 (0)	total: 561ms	remaining: 9m 20s
100:	test: 0.9564367	best: 0.9564367 (100)	total: 36.3s	remaining: 5m 23s
200:	test: 0.9601111	best: 0.9601111 (200)	total: 1m 12s	remaining: 4m 49s
300:	test: 0.9618927	best: 0.9618927 (300)	total: 1m 48s	remaining: 4m 11s
400:	test: 0.9629755	best: 0.9629755 (400)	total: 2m 23s	remaining: 3m 34s
500:	test: 0.9636413	best: 0.9636413 (500)	total: 3m	remaining: 2m 59s
600:	test: 0.9641245	best: 0.9641245 (600)	total: 3m 36s	remaining: 2m 23s
700:	test: 0.9644868	best: 0.9644868 (700)	total: 4m 12s	remaining: 1m 47s
800:	test: 0.9648233	best: 0.9648233 (800)	total: 4m 50s	remaining: 1m 12s
900:	test: 0.9650942	best: 0.9650942 (900)	total: 5m 27s	remaining: 36s
999:	test: 0.9653189	best: 0.9653189 (999)	total: 6m 3s	remaining: 0us

bestTest = 0.9653189304
bestIteration = 999

AUC:  0.9653189303975835


In [43]:
# submit_pool = Pool(test_df, cat_features=cat_cols)

# submit_prob = cat_model.predict_proba(submit_pool)[:, 1]

# cat_df = pd.DataFrame({
#     "id": test_id_placeholder.values,
#     "y": submit_prob                   
# })

# cat_df.to_csv("cat_preds.csv", index=False)
# print("Success!")


Success!


In [47]:
import lightgbm as lgb

X_lgb = X.copy()

for c in cat_cols:
    X_lgb[c] = X_lgb[c].astype('category')

X_train, X_valid, y_train, y_valid = train_test_split(X_lgb, y, test_size=0.2, random_state=42)

In [49]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate= 0.05,
    num_leaves=63,
    objective='binary',
    random_state=42,
    verbose=100
)

lgbm_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc')

y_prob_light = lgbm_model.predict_proba(X_valid)[:,1]
print("AUC: ", roc_auc_score(y_valid, y_prob_light))

[LightGBM] [Info] Number of positive: 72283, number of negative: 527717
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.868694
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.414891
[LightGBM] [Debug] init for col-wise cost 0.012934 seconds, init for row-wise cost 0.093656 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1004
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120472 -> initscore=-1.987971
[LightGBM] [Info] Start training from score -1.987971
[LightGBM] [Debug] Trained a tree with leaves = 63 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 63 and depth = 17
[Ligh

In [None]:
# X_submit = test_df.copy()

# for c in cat_cols:
#     X_submit[c] = X_submit[c].astype('category')

# submit_prob = lgbm_model.predict_proba(X_submit)[:, 1]

# lgbm_df = pd.DataFrame({
#     "id": test_id_placeholder.values,
#     "y": submit_prob
# })
# lgbm_df.to_csv("lgbm_preds.csv", index=False)
# print("Success!")

Success!


In [55]:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

def cast_cats(df):
    df = df.copy()
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype('category')
    return df

cast_cats_tf = FunctionTransformer(cast_cats, feature_names_out="one-to-one")

In [62]:
cat_model = CatBoostClassifier(
    depth=6,
    learning_rate= 0.05,
    iterations=1000,
    eval_metric= 'AUC',
    random_state=42,
    verbose=0,
    cat_features=cat_cols
)
lgbm_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate= 0.05,
    num_leaves=63,
    objective='binary',
    random_state=42,
    verbose=0
)

In [63]:
lgbm_pipe = Pipeline([
    ("cast", cast_cats_tf),
    ("lgbm", lgbm_model)
])

cat_pipe = Pipeline([
    ("cat", cat_model)
])

In [64]:
ensemble = VotingClassifier(
    estimators=[
        ('catboost', cat_pipe),
        ('lightgbm', lgbm_pipe)
    ],
    voting='soft', weights=[1.0,1.0], n_jobs= -1
)

ensemble.fit(X_train, y_train)

0,1,2
,estimators,"[('catboost', ...), ('lightgbm', ...)]"
,voting,'soft'
,weights,"[1.0, 1.0]"
,n_jobs,-1
,flatten_transform,True
,verbose,False

0,1,2
,func,<function cas...00216BDD2EA70>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [66]:
X_ensemble_sub = test_df.copy()

ensemble_preds = ensemble.predict(X_ensemble_sub)

ensemble_df = pd.DataFrame(
    {"id" : test_id_placeholder,
    "y" : ensemble_preds}
)

ensemble_df.to_csv("Ensemble_preds.csv", index=False)
print("Success!")


Success!


The ensemble led to a lower result than LightGBM. For this reason I'm going to focus on LightGBM and apply hyperparams tuning.