In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
import warnings
warnings.filterwarnings("ignore")





In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Shared settings for the cross-validation / model cells below.
RAND_VAL = 42     # seed passed to StratifiedKFold's shuffling
num_folds = 5     # number of cross-validation folds
n_est = 3000      # number of estimators (CatBoost `iterations`)

In [4]:
# Load the competition training set (it carries the `Exited` target column) and preview it.
df_train = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
df_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
# Load the competition test set (same columns as train, without `Exited`) and preview it.
df_test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [6]:
# Continuous columns that each get a min-max rescaled "<name>_scaled" companion column.
scale_cols = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']

# A single scaler object, refit for every column inside the loop.
scaler = MinMaxScaler()

for c in scale_cols:
    # Learn the column's range from the training data only, so the test set
    # never influences the scaling parameters.
    scaler.fit(df_train[[c]])

    # Apply the train-derived range to both frames. With the scaler's default
    # settings, test values outside the training range are not clipped and can
    # therefore fall outside [0, 1].
    df_train[c + "_scaled"] = scaler.transform(df_train[[c]])
    df_test[c + "_scaled"] = scaler.transform(df_test[[c]])

In [7]:
def getFeats(df):
    """Return a copy of ``df`` with four engineered feature columns added.

    The input frame is left untouched; callers must use the returned frame.

    Added columns:
        IsSenior: 1 where ``Age >= 60``, else 0 (missing ages yield 0).
        IsActive_by_CreditCard: ``HasCrCard * IsActiveMember``.
        Products_Per_Tenure: ``Tenure / NumOfProducts``. Note the name is the
            inverse of what is computed; it is kept because later cells select
            columns by this name.
        AgeCat: ``Age / 20`` rounded to the nearest integer, stored as a
            pandas ``category``.

    Args:
        df: DataFrame with ``Age``, ``HasCrCard``, ``IsActiveMember``,
            ``Tenure`` and ``NumOfProducts`` columns.

    Returns:
        A new DataFrame holding the original columns plus the four above.

    Raises:
        KeyError: if one of the required columns is missing.
        ValueError: if ``Age`` contains NaN/inf (the integer cast for
            ``AgeCat`` cannot represent them).
    """
    age = df['Age']
    # `assign` builds a new frame, so the caller's object is never mutated.
    return df.assign(
        # Vectorised comparison instead of a per-row Python lambda.
        IsSenior=(age >= 60).astype('int64'),
        IsActive_by_CreditCard=df['HasCrCard'] * df['IsActiveMember'],
        Products_Per_Tenure=df['Tenure'] / df['NumOfProducts'],
        AgeCat=np.round(age / 20).astype('int').astype('category'),
    )

In [8]:
# Add the engineered columns to both frames.
df_train = getFeats(df_train)
df_test = getFeats(df_test)

# Model inputs: every column except the row id, the target, and the raw
# columns that already have a "_scaled" counterpart.
feat_cols = df_train.columns.drop(['id', 'Exited']).drop(scale_cols)
print(feat_cols)
df_train.head()

Index(['CustomerId', 'Surname', 'Geography', 'Gender', 'Tenure',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Age_scaled',
       'CreditScore_scaled', 'Balance_scaled', 'EstimatedSalary_scaled',
       'IsSenior', 'IsActive_by_CreditCard', 'Products_Per_Tenure', 'AgeCat'],
      dtype='object')


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,EstimatedSalary,Exited,Age_scaled,CreditScore_scaled,Balance_scaled,EstimatedSalary_scaled,IsSenior,IsActive_by_CreditCard,Products_Per_Tenure,AgeCat
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,...,181449.97,0,0.202703,0.636,0.0,0.907279,0,0.0,1.5,2
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,...,49503.5,0,0.202703,0.554,0.0,0.247483,0,1.0,0.5,2
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,...,184866.69,0,0.297297,0.656,0.0,0.924364,0,0.0,5.0,2
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,...,84560.88,0,0.216216,0.462,0.593398,0.422787,0,1.0,2.0,2
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,...,15068.83,0,0.202703,0.732,0.0,0.075293,0,1.0,2.5,2


In [9]:
# Preview the test frame to check it received the same scaled and engineered columns as train.
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Age_scaled,CreditScore_scaled,Balance_scaled,EstimatedSalary_scaled,IsSenior,IsActive_by_CreditCard,Products_Per_Tenure,AgeCat
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,...,1.0,160976.75,0.067568,0.472,0.0,0.804903,0,0.0,1.0,1
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,...,0.0,72549.27,0.378378,0.666,0.0,0.362723,0,0.0,2.0,2
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,...,0.0,138882.09,0.216216,0.612,0.0,0.694419,0,0.0,3.5,2
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,...,0.0,113931.57,0.243243,0.662,0.0,0.569654,0,0.0,8.0,2
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,...,0.0,139431.0,0.27027,0.804,0.483318,0.697164,0,0.0,10.0,2


In [10]:
# Feature matrix and target taken from the training frame.
X = df_train.loc[:, feat_cols]
y = df_train['Exited']

# Positional indices of every column whose dtype is not float64; these are
# the columns handed to CatBoost as categorical.
cat_features = np.flatnonzero((X.dtypes != np.float64).to_numpy())
cat_features

array([ 0,  1,  2,  3,  4,  5, 12, 15])

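> `cat_features` is a list of indices or names of the columns in your dataset that are categorical. It is important to specify which features are categorical to ensure proper handling by the CatBoost algorithm. Here it holds the positions of every column whose dtype is not `float64`, so integer-coded columns such as `Tenure` and `NumOfProducts` are treated as categorical as well.
Categorical features often include variables like gender, country, or any other non-numeric variable that represents categories.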

In [11]:
# Stratified K-fold cross-validation: one CatBoost model per fold, scored on
# the held-out part and then used to predict the whole test set.
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=RAND_VAL)
test_preds = np.empty((num_folds, len(df_test)))  # row i holds fold i's test probabilities
auc_vals = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    # The validation pool doubles as the eval set that CatBoost reports AUC on.
    clf = CatBoostClassifier(
        iterations=n_est,
        learning_rate=0.03,
        eval_metric='AUC',
    )
    clf.fit(train_pool, eval_set=val_pool, verbose=300)

    # Probability of the positive class (column 1) on the held-out fold.
    y_pred_val = clf.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_pred_val)
    print("AUC for fold ",n_fold,": ",auc_val)
    auc_vals.append(auc_val)

    # Store this fold's test-set probabilities; they are averaged later.
    y_pred_test = clf.predict_proba(df_test[feat_cols])[:, 1]
    test_preds[n_fold] = y_pred_test
    print("----------------")

0:	test: 0.8618509	best: 0.8618509 (0)	total: 237ms	remaining: 11m 50s


300:	test: 0.8952348	best: 0.8952348 (300)	total: 44s	remaining: 6m 34s


600:	test: 0.8967626	best: 0.8967629 (599)	total: 1m 27s	remaining: 5m 49s


900:	test: 0.8971926	best: 0.8971955 (897)	total: 2m 11s	remaining: 5m 6s


In [None]:
# Per-fold test-set probabilities, shape (num_folds, len(df_test)).
test_preds

In [None]:
# Displays a (label, value) tuple: the mean of the per-fold validation AUCs.
"Mean AUC: ",np.mean(auc_vals)

The SHAP (SHapley Additive exPlanations) library is used below to explain the output of a machine learning model, specifically a tree-based model like CatBoost.

In [None]:
# NOTE: `clf`, `train_pool` and `X_train` are whatever the last iteration of the
# cross-validation loop left behind, so this explains only the final fold's model.
import shap
shap.initjs()  # presumably loads the JS that SHAP's interactive plots need — confirm against shap docs
explainer = shap.TreeExplainer(clf)
# SHAP values are computed on the final fold's training pool.
shap_values = explainer.shap_values(train_pool)
# Bar-style summary plot of the SHAP values, labelled with X_train's columns.
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
# Average the per-fold test probabilities into one prediction per test row.
y_pred = test_preds.mean(axis=0)
# Build the submission frame in a single step. Adding a column to the
# `df_test[['id']]` selection afterwards is the pattern pandas flags with
# SettingWithCopyWarning; `assign` returns a fresh frame instead.
df_sub = df_test[['id']].assign(Exited=y_pred)
df_sub.head()

In [None]:
# Write the id/Exited pairs to submission.csv in the working directory, without the pandas index column.
df_sub.to_csv("submission.csv",index=False)

In [None]:
# Distribution of the averaged predicted probabilities, 20 bins over [0, 1].
df_sub.hist(
    column='Exited',
    bins=20,
    range=[0, 1],
    figsize=(12, 6),
)
plt.show()