In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import category_encoders as ce

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Step up one directory before importing project modules.
# NOTE(review): presumably this makes the repo root the working directory so
# that `settings` is importable — confirm against the project layout.
%cd ..

/Users/karimlulu/Repos/mlops-loan-prediction


In [4]:
from settings import DATA_PATH, LGBM_PARAMS

In [5]:
# Load the raw table from DATA_PATH and rewrite missing entries as NaN.
# NOTE(review): read_csv already yields NaN for empty cells, so the fillna
# looks like a no-op here — confirm before removing it.
df = pd.read_csv(DATA_PATH).fillna(value=np.nan)

In [6]:
def build_mapping(df):
    """Cast text/categorical columns to ``category`` and list their levels.

    Every column whose dtype is ``object`` or ``category`` is converted to
    the pandas ``category`` dtype. The conversion is written back into ``df``
    itself, so the caller's frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect and mutate.

    Returns
    -------
    dict
        Maps each converted column name to the list of its categories, in the
        order pandas stores them. Columns of any other dtype are omitted.
    """
    col_to_categories = {}
    for name in df.columns:
        kind = df[name].dtype
        # Skip anything that is neither raw object data nor already categorical.
        if not (kind == "object" or kind.name == "category"):
            continue
        as_category = df[name].astype("category")
        df[name] = as_category
        col_to_categories[name] = list(df[name].cat.categories)
    return col_to_categories

In [7]:
# Column name -> list of category levels for every object/category column.
# build_mapping also converts those columns of `df` to category dtype in place.
mapp = build_mapping(df)

In [8]:
# Features: every column except the target ("is_bad") and the "Id" column.
X = df.loc[:, [name for name in df.columns if name not in ("is_bad", "Id")]]
y = df["is_bad"]

# 75/25 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=41,
    stratify=y,
)

# Pin each categorical test column to the category list recorded in `mapp`.
for key, cats in mapp.items():
    X_test.loc[:, key] = X_test[key].cat.set_categories(cats)

In [9]:
# Encode the categorical columns with a CatBoost encoder: fit on the training
# split (features + target), then apply the fitted encoder to the test split.
# NOTE(review): the LightGBM datasets in the cells visible here are built from
# X_train / X_test, not from these encoded frames — confirm they are used later.
encoder = ce.CatBoostEncoder(cols=list(mapp))
X_tr_cat = encoder.fit_transform(X_train, y=y_train)
X_test_cat = encoder.transform(X_test)

In [10]:
# Wrap both splits as LightGBM datasets. The categorical columns are the keys
# of `mapp`; the raw frames are kept alive (free_raw_data=False).
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train.values,
    feature_name=X_train.columns.tolist(),
    categorical_feature=list(mapp),
    free_raw_data=False,
)
# The evaluation set references the training set and reuses its feature names.
lgb_eval = lgb.Dataset(
    data=X_test,
    label=y_test.values,
    reference=lgb_train,
    feature_name=X_train.columns.tolist(),
    categorical_feature=list(mapp),
    free_raw_data=False,
)

In [11]:
# Train for up to 100 rounds, evaluating on both the test and training sets.
#
# Early stopping and metric recording are passed as callbacks rather than the
# `early_stopping_rounds` / `evals_result` / `verbose_eval` keyword arguments:
# those keywords were deprecated in LightGBM 3.3 and removed in 4.0, where
# they raise TypeError. The callbacks work on both older and newer releases.
#
# `feature_name` and `categorical_feature` are not repeated here because
# `lgb_train` and `lgb_eval` were already constructed with them.
eval_log = {}  # filled in place: {"Test": {...}, "Train": {...}}
bst = lgb.train(
    LGBM_PARAMS,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval, lgb_train],
    valid_names=["Test", "Train"],
    callbacks=[
        # Stop once the validation metric has not improved for 50 rounds;
        # verbose=False keeps the callback silent, matching verbose_eval=False.
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        # Store the per-iteration metric history in `eval_log`.
        lgb.record_evaluation(eval_log),
    ],
)





In [12]:
# Per-iteration metric history recorded for the validation set named "Test".
eval_log["Test"]

OrderedDict([('binary_logloss',
              [0.3797423850567266,
               0.3737186001373367,
               0.3680094742482637,
               0.3641717705032791,
               0.36131734632268775,
               0.36088655423104593,
               0.35843479190459027,
               0.3566024665593509,
               0.3553800906561788,
               0.35437452925854435,
               0.3531011993772654,
               0.3518615928374165,
               0.3502467982414227,
               0.3488851177767234,
               0.3480724351445956,
               0.3479498098543793,
               0.3475660287625827,
               0.34787949002185903,
               0.3479711742063747,
               0.34812054736366205,
               0.34882306098951715,
               0.3492285057347967,
               0.3494223206459863,
               0.34937329262755795,
               0.3493852179583599,
               0.3493037122388549,
               0.3495713054433767,
               