In [19]:
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-pastel')

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures

In [20]:
data = pd.read_csv("data/train.csv")

In [21]:
y = data.pop('Exited')

In [22]:
interesting_columns = ['CreditScore', 'Geography', 'Gender',
        'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(data[interesting_columns], y, test_size=0.33, random_state=894)

In [24]:
y_train.mean(), y_test.mean()

(0.21211518286727202, 0.21055047556094159)

In [25]:
categorical_columns = ['Geography', 'Gender', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

In [26]:
partial_categorical_columns = ['Geography', 'Gender', 'NumOfProducts'] # ignore  ['HasCrCard', 'IsActiveMember'] --> passthrough :)
X_train[partial_categorical_columns]

Unnamed: 0,Geography,Gender,NumOfProducts
73756,Spain,Female,2
59621,France,Male,1
90280,Spain,Female,2
108726,Germany,Male,2
104144,Spain,Female,2
...,...,...,...
121640,France,Male,2
94035,France,Female,1
78052,Germany,Female,1
5403,France,Male,2


In [27]:
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
X_train[numerical_columns]

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary
73756,745,36.0,9,0.00,141872.25
59621,850,48.0,7,0.00,87067.73
90280,554,28.0,4,0.00,158937.57
108726,667,31.0,10,113715.74,142763.27
104144,710,44.0,7,0.00,5459.07
...,...,...,...,...,...
121640,698,25.0,7,0.00,88965.46
94035,758,41.0,9,0.00,199256.98
78052,521,35.0,9,123943.18,159553.27
5403,663,34.0,3,0.00,152167.79


In [28]:
# numeric_features = ["age", "fare"]
# numeric_transformer = Pipeline(
#     steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
# )

# categorical_features = ["embarked", "sex", "pclass"]
# categorical_transformer = Pipeline(
#     steps=[
#         ("encoder", OneHotEncoder(handle_unknown="ignore")),
#         ("selector", SelectPercentile(chi2, percentile=50)),
#     ]
# )
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numeric_features),
#         ("cat", categorical_transformer, categorical_features),
#     ]
# )

In [29]:
numerical_transformer = Pipeline(steps=[("scaler", RobustScaler()), ("polynomial", PolynomialFeatures(2))]) # only numerical values

In [30]:
partial_encoder = ColumnTransformer(
                transformers=[("encode", OneHotEncoder(drop='if_binary'), partial_categorical_columns)],
                remainder="passthrough", verbose_feature_names_out=True
            )

In [31]:
categorical_transformer = Pipeline(steps = [   
                ('partial_encode', partial_encoder), 
                ('interactions', PolynomialFeatures(interaction_only=True, include_bias=False))
            ])

In [32]:
preprocessor = ColumnTransformer(
                transformers=[("categorical", categorical_transformer, categorical_columns), 
                                ("numerical", numerical_transformer, numerical_columns)],
                remainder="passthrough", verbose_feature_names_out=True
            )

In [33]:
X_train_features = preprocessor.fit_transform(X_train)

In [34]:
X_train_features = pd.DataFrame(X_train_features, columns=preprocessor.get_feature_names_out())

In [35]:
X_train_features

Unnamed: 0,categorical__encode__Geography_France,categorical__encode__Geography_Germany,categorical__encode__Geography_Spain,categorical__encode__Gender_Male,categorical__encode__NumOfProducts_1,categorical__encode__NumOfProducts_2,categorical__encode__NumOfProducts_3,categorical__encode__NumOfProducts_4,categorical__remainder__HasCrCard,categorical__remainder__IsActiveMember,categorical__encode__Geography_France encode__Geography_Germany,categorical__encode__Geography_France encode__Geography_Spain,categorical__encode__Geography_France encode__Gender_Male,categorical__encode__Geography_France encode__NumOfProducts_1,categorical__encode__Geography_France encode__NumOfProducts_2,categorical__encode__Geography_France encode__NumOfProducts_3,categorical__encode__Geography_France encode__NumOfProducts_4,categorical__encode__Geography_France remainder__HasCrCard,categorical__encode__Geography_France remainder__IsActiveMember,categorical__encode__Geography_Germany encode__Geography_Spain,categorical__encode__Geography_Germany encode__Gender_Male,categorical__encode__Geography_Germany encode__NumOfProducts_1,categorical__encode__Geography_Germany encode__NumOfProducts_2,categorical__encode__Geography_Germany encode__NumOfProducts_3,categorical__encode__Geography_Germany encode__NumOfProducts_4,categorical__encode__Geography_Germany remainder__HasCrCard,categorical__encode__Geography_Germany remainder__IsActiveMember,categorical__encode__Geography_Spain encode__Gender_Male,categorical__encode__Geography_Spain encode__NumOfProducts_1,categorical__encode__Geography_Spain encode__NumOfProducts_2,categorical__encode__Geography_Spain encode__NumOfProducts_3,categorical__encode__Geography_Spain encode__NumOfProducts_4,categorical__encode__Geography_Spain remainder__HasCrCard,categorical__encode__Geography_Spain remainder__IsActiveMember,categorical__encode__Gender_Male encode__NumOfProducts_1,categorical__encode__Gender_Male encode__NumOfProducts_2,categorical__encode__Gender_Male encode__NumOfProducts_3,categorical__encode__Gender_Male encode__NumOfProducts_4,categorical__encode__Gender_Male remainder__HasCrCard,categorical__encode__Gender_Male remainder__IsActiveMember,categorical__encode__NumOfProducts_1 encode__NumOfProducts_2,categorical__encode__NumOfProducts_1 encode__NumOfProducts_3,categorical__encode__NumOfProducts_1 encode__NumOfProducts_4,categorical__encode__NumOfProducts_1 remainder__HasCrCard,categorical__encode__NumOfProducts_1 remainder__IsActiveMember,categorical__encode__NumOfProducts_2 encode__NumOfProducts_3,categorical__encode__NumOfProducts_2 encode__NumOfProducts_4,categorical__encode__NumOfProducts_2 remainder__HasCrCard,categorical__encode__NumOfProducts_2 remainder__IsActiveMember,categorical__encode__NumOfProducts_3 encode__NumOfProducts_4,categorical__encode__NumOfProducts_3 remainder__HasCrCard,categorical__encode__NumOfProducts_3 remainder__IsActiveMember,categorical__encode__NumOfProducts_4 remainder__HasCrCard,categorical__encode__NumOfProducts_4 remainder__IsActiveMember,categorical__remainder__HasCrCard remainder__IsActiveMember,numerical__1,numerical__CreditScore,numerical__Age,numerical__Tenure,numerical__Balance,numerical__EstimatedSalary,numerical__CreditScore^2,numerical__CreditScore Age,numerical__CreditScore Tenure,numerical__CreditScore Balance,numerical__CreditScore EstimatedSalary,numerical__Age^2,numerical__Age Tenure,numerical__Age Balance,numerical__Age EstimatedSalary,numerical__Tenure^2,numerical__Tenure Balance,numerical__Tenure EstimatedSalary,numerical__Balance^2,numerical__Balance EstimatedSalary,numerical__EstimatedSalary^2
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.761062,-0.1,1.00,0.000000,0.284794,0.579215,-0.076106,0.761062,0.000000,0.216746,0.01,-0.100,-0.000000,-0.028479,1.0000,0.000000,0.284794,0.000000,0.000000,0.081108
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.690265,1.1,0.50,0.000000,-0.390092,2.856997,1.859292,0.845133,0.000000,-0.659359,1.21,0.550,0.000000,-0.429101,0.2500,0.000000,-0.195046,0.000000,-0.000000,0.152172
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.929204,-0.9,-0.25,0.000000,0.494944,0.863419,0.836283,0.232301,-0.000000,-0.459904,0.81,0.225,-0.000000,-0.445449,0.0625,-0.000000,-0.123736,0.000000,0.000000,0.244969
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.070796,-0.6,1.25,0.948039,0.295767,0.005012,-0.042478,0.088496,0.067118,0.020939,0.36,-0.750,-0.568823,-0.177460,1.5625,1.185048,0.369708,0.898777,0.280398,0.087478
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.451327,0.7,0.50,0.000000,-1.395055,0.203696,0.315929,0.225664,0.000000,-0.629627,0.49,0.350,0.000000,-0.976539,0.2500,0.000000,-0.697528,0.000000,-0.000000,1.946180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110567,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.345133,-1.2,0.50,0.000000,-0.366722,0.119117,-0.414159,0.172566,0.000000,-0.126568,1.44,-0.600,-0.000000,0.440067,0.2500,0.000000,-0.183361,0.000000,-0.000000,0.134485
110568,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.876106,0.4,1.00,0.000000,0.991454,0.767562,0.350442,0.876106,0.000000,0.868619,0.16,0.400,0.000000,0.396582,1.0000,0.000000,0.991454,0.000000,0.000000,0.982981
110569,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.221239,-0.2,1.00,1.033304,0.502526,1.491425,0.244248,-1.221239,-1.261911,-0.613704,0.04,-0.200,-0.206661,-0.100505,1.0000,1.033304,0.502526,1.067717,0.519262,0.252532
110570,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.035398,-0.3,-0.50,0.000000,0.411578,0.001253,-0.010619,-0.017699,0.000000,0.014569,0.09,0.150,-0.000000,-0.123473,0.2500,-0.000000,-0.205789,0.000000,0.000000,0.169396


In [36]:
## takeaways
# you have to do categorical interactions *before* one hot encoding :)

SyntaxError: invalid syntax (2681684362.py, line 1)

In [None]:
X_test_features = preprocessor.transform(X_test)

In [None]:
X_test_features = pd.DataFrame(X_test_features, columns=preprocessor.get_feature_names_out())

In [None]:
X_test_features

In [None]:
import lightgbm as lgb
import datetime
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = dict(n_estimators=[200], 
                  colsample_bytree=[0.8], 
                  subsample=[0.8] , 
                  subsample_freq=[1],
                  learning_rate = [0.1, 0.2],
                  max_depth = [2],
                  num_leaves = [2, 4],
                  reg_alpha = [0.1, 0.2],
                  reg_lambda = [0.1, 0.2])

In [None]:
test = lgb.LGBMClassifier(n_estimators = 200, colsample_bytree=0.8, subsample=0.8, subsample_freq=1, learning_rate=0.1, max_depth=2, force_row_wise=True)
test.fit(X_train_features, y_train)

In [None]:
start= datetime.datetime.now()

grid_search = GridSearchCV(
        estimator=lgb.LGBMClassifier(n_jobs=-1, force_row_wise=True, verbosity=-1),
        param_grid=param_grid,
        return_train_score=True,
        cv=3, 
        scoring='f1', 
        n_jobs=-1, 
        verbose=0
    ).fit(X_train_features, y_train)

end = datetime.datetime.now()

In [None]:
print("Duration: {}".format(end-start), end, start)

In [None]:
lgb_classifier = grid_search.best_estimator_
lgb_classifier

In [None]:
grid_search.best_score_

In [None]:
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by = 'mean_test_score', ascending=False)
cv_results['mean_test_score']

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report

In [None]:
estimator = cross_validate(estimator=lgb_classifier, X=X_train_features, y=y_train, cv=StratifiedKFold(5), scoring='roc_auc', return_estimator=True)
estimator

In [None]:
plt.scatter(range(1, 6), estimator['test_score'])
plt.xlim(0, 6)
plt.ylim(0, 1)
plt.ylabel("AUC")
plt.xlabel("CV iteration")
plt.title("AUC Scores Across CV")

In [None]:
y_test_pred = lgb_classifier.predict(X_test_features)
y_test_proba = lgb_classifier.predict_proba(X_test_features)

In [None]:
#Generate predicted probabilites
print('AUC: ', roc_auc_score(y_test, y_test_proba[:,1]))
print('Accuracy: ', lgb_classifier.score(X_test_features, y_test))

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
display = RocCurveDisplay.from_predictions(y_test, y_test_proba[:, 1], ax=ax) 

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Exited ROC")
plt.legend()
plt.show()

In [None]:
cr = classification_report(y_true=y_test, y_pred=y_test_pred)
print(cr)

# Predictions


In [None]:
test_data = pd.read_csv("data/test.csv")

In [None]:
test_features = preprocessor.transform(test_data)

In [None]:
y_pred = lgb_classifier test_features

In [None]:
y_pred_mat = lgb_classifier.predict_proba(test_features)
y_pred = y_pred_mat[:, 1]

In [None]:
submission = test_data[['id']]
submission['Exited'] = y_pred

In [None]:
submission

In [None]:
submission.to_csv('data/predictions_submission.csv', index=False)