In [1]:
# Stretch the notebook container to the full browser width by injecting a CSS rule.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
%%capture
!pip install hyperopt

### 00 Environment

In [3]:
# Core libraries used by the notebook.
import lightgbm as lgb
import numpy as np
import pandas as pd
import math
from sklearn.datasets import make_classification
# Silence every warning for the rest of the session to keep cell output clean.
import warnings
warnings.filterwarnings('ignore')

# Date stamp (YYYYMMDD) for the day the notebook is run.
from datetime import datetime
todays_date = datetime.today().strftime('%Y%m%d')

# Project-local helper modules, star-imported.
# NOTE(review): names used later in the notebook without a local definition
# (lgbm, k_folds_indices_by_time_period, ks_statistic, auc, actual_vs_predicted,
# summary_statistics) presumably come from these modules; which module defines
# which name cannot be told from here -- consider explicit imports.
from lgbm_hyper_parameter_search import *
from basic_functions import *
from model_eval import *
# from sql_connector import *

### 01 Generating Fake Data

In [4]:
# Build a synthetic binary-classification dataset to exercise the pipeline.
# Both random sources below are seeded so that "Restart Kernel -> Run All"
# regenerates exactly the same data (and therefore comparable model results).
SEED = 42
N_SAMPLES = 100000
N_FEATURES = 100

x, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=50,
    n_redundant=0,
    n_clusters_per_class=10,
    random_state=SEED,
)

# Wrap the feature matrix in a dataframe with columns var_1 ... var_<N_FEATURES>
data = pd.DataFrame(x)
data.columns = ["var_" + str(i) for i in range(1, N_FEATURES + 1)]
data["target"] = y

# Attach a fake time period: one value per row, drawn uniformly from the
# quarters 2015.0, 2015.25, ..., 2018.75
rng = np.random.default_rng(SEED)
data["year_quarter"] = rng.choice(np.arange(2015, 2019, 0.25), size=N_SAMPLES)

### 02 Creating our Global Variables

In [5]:
# Model inputs: the columns var_1 ... var_100
features = ["var_%d" % i for i in range(1, 101)]

# Name of the label column
target = "target"

# Time-based split: rows before 2018 are used for training,
# rows from 2018 onwards are held out for testing
is_before_2018 = data["year_quarter"] < 2018
is_2018_onwards = data["year_quarter"] >= 2018
training_data = data.loc[is_before_2018].copy()
testing_data = data.loc[is_2018_onwards].copy()

# Cross-validation fold indices, built from each training row's time period
# and index label (5 folds, fixed seed)
indices = k_folds_indices_by_time_period(
    training_data["year_quarter"],
    training_data.index,
    n_folds=5,
    seed=10,
)

### 03 Feature Reduction Model

In [6]:
# Short hyper-parameter search (10 rounds, 5 folds, KS metric) over the full
# feature list, using the pre-computed CV indices, then train the model
model = lgbm(
    nfold=5,
    search_rounds=10,
    eval_metric="ks",
    fixed_parameters={"n_jobs": 20},
)
model.hyperparameter_search(
    data=training_data,
    feature_labels=features,
    target_label=target,
    indices_for_cv=indices,
)
model.train()

# Inspect the chosen hyper-parameters and the top of the importance table.
# .loc[0:20] is a label-based slice, so rows labelled 0 through 20 are included.
display(model.hyperparameters)
top_importance = model.feature_importance.loc[0:20]
display(top_importance)

# Features carried forward to the final model
final_features = list(top_importance["feature"])

Using user provided indices for K-Folds.
100%|██████████| 10/10 [05:31<00:00, 33.17s/trial, best loss: -0.66447]


{'bagging_fraction': 0.5,
 'feature_fraction': 0.7000000000000001,
 'lambda_l1': 14.06003762466616,
 'lambda_l2': 16.846021205983615,
 'learning_rate': 0.04799284386577267,
 'max_depth': 4.0,
 'min_data_in_leaf': 34.0,
 'min_gain_to_split': 0.11306531275700882,
 'objective': 'rmse',
 'eval_metric': 'ks',
 'maximize_metric': True,
 'max_bin': 200,
 'n_trees': 4067,
 'early_stopping_rounds': 25,
 'bagging_freq': 1,
 'random_state': 6,
 'bagging_seed': 7,
 'feature_fraction_seed': 8,
 'data_random_seed': 9,
 'verbosity': -1,
 'n_jobs': 20}

Unnamed: 0,feature,feature_importance
0,var_74,0.034581
1,var_24,0.028962
2,var_88,0.028641
3,var_85,0.025015
4,var_25,0.023836
5,var_23,0.023476
6,var_43,0.023329
7,var_75,0.023248
8,var_31,0.022873
9,var_64,0.022782


### 04 Final Model

In [8]:
# Longer hyper-parameter search (100 rounds) restricted to `final_features`,
# reusing the same training data and CV indices, then train the final model
final_model = lgbm(
    nfold=5,
    search_rounds=100,
    eval_metric="ks",
    fixed_parameters={"n_jobs": 20},
)
final_model.hyperparameter_search(
    data=training_data,
    feature_labels=final_features,
    target_label=target,
    indices_for_cv=indices,
)
final_model.train()

# Show the chosen hyper-parameters, then the importance rows labelled 0 through 20
display(
    final_model.hyperparameters,
    final_model.feature_importance.loc[0:20],
)

Using user provided indices for K-Folds.
100%|██████████| 100/100 [1:12:41<00:00, 43.61s/trial, best loss: -0.50166]


{'bagging_fraction': 0.6000000000000001,
 'feature_fraction': 1.0,
 'lambda_l1': 0.564291001882038,
 'lambda_l2': 42.906133178776344,
 'learning_rate': 0.01734555715798179,
 'max_depth': 5.0,
 'min_data_in_leaf': 34.0,
 'min_gain_to_split': 0.001154918452276582,
 'objective': 'rmse',
 'eval_metric': 'ks',
 'maximize_metric': True,
 'max_bin': 200,
 'n_trees': 3416,
 'early_stopping_rounds': 25,
 'bagging_freq': 1,
 'random_state': 6,
 'bagging_seed': 7,
 'feature_fraction_seed': 8,
 'data_random_seed': 9,
 'verbosity': -1,
 'n_jobs': 20}

Unnamed: 0,feature,feature_importance
0,var_74,0.062359
1,var_46,0.053938
2,var_88,0.053038
3,var_85,0.052605
4,var_24,0.052509
5,var_64,0.051734
6,var_43,0.051316
7,var_25,0.050888
8,var_75,0.049902
9,var_50,0.047529


### 05 Out of Sample Test

In [11]:
# Score the held-out data with the final model, clamping predictions into [0, 1]
raw_prediction = final_model.predict(testing_data)
testing_data["prediction"] = np.clip(raw_prediction, 0, 1)

# Headline metrics on the held-out data
actuals = testing_data[target]
predictions = testing_data["prediction"]
print("KS:", round(ks_statistic(actuals, predictions), 3))
print("AUC:", round(auc(actuals, predictions), 3))

# Actual vs. predicted using a lot of bins (50) to see whether there is
# overfitting on the tails
actual_vs_predicted(
    actuals,
    predictions,
    n_bins=50,
    normalize=False,
    y_axis_label="Target Variable",
    plot_2_x_axis_label="Prediction",
    dark_mode=False,
)

KS: 0.514
AUC: 0.836


### 06 Summary Statistics

In [12]:
# Summary tables for the held-out predictions. The result is indexed as a pair;
# judging by the rendered index names, element 0 is grouped by quantile and
# element 1 by prediction interval.
actual_target = testing_data[target]
predicted_score = testing_data["prediction"]
summary = summary_statistics(actual_target, predicted_score)
display(summary[0], summary[1])

Unnamed: 0_level_0,bad_rate_prediction,actual_bad_rate,bads,goods,total_loans,total_pct,bads_pct,goods_pct,cumulative_bads_pct,cumulative_goods_pct,ks_statistic
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,0.064275,0.057746,145,2366,2511,0.099964,0.011556,0.188211,0.011556,0.188211,17.67
2.0,0.213656,0.147293,370,2142,2512,0.100004,0.029487,0.170392,0.041042,0.358603,31.76
3.0,0.30845,0.253583,637,1875,2512,0.100004,0.050765,0.149153,0.091807,0.507756,41.59
4.0,0.386754,0.32086,806,1706,2512,0.100004,0.064233,0.135709,0.156041,0.643465,48.74
5.0,0.459612,0.441083,1108,1404,2512,0.100004,0.088301,0.111686,0.244342,0.755151,51.08
6.0,0.531138,0.557723,1401,1111,2512,0.100004,0.111651,0.088378,0.355993,0.843529,48.75
7.0,0.605058,0.663217,1666,846,2512,0.100004,0.13277,0.067298,0.488763,0.910827,42.21
8.0,0.685842,0.751194,1887,625,2512,0.100004,0.150383,0.049718,0.639146,0.960544,32.14
9.0,0.788801,0.855892,2150,362,2512,0.100004,0.171342,0.028796,0.810488,0.989341,17.89
10.0,0.943131,0.946656,2378,134,2512,0.100004,0.189512,0.010659,1.0,1.0,0.0


Unnamed: 0_level_0,bad_rate_prediction,actual_bad_rate,bads,goods,total_loans,total_pct,bads_pct,goods_pct,cumulative_bads_pct,cumulative_goods_pct,ks_statistic
prediction_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0-0.1,0.032483,0.0405,68,1611,1679,0.066842,0.005419,0.128152,0.005419,0.128152,12.27
0.1-0.2,0.154365,0.108447,190,1562,1752,0.069748,0.015142,0.124254,0.020561,0.252406,23.18
0.2-0.3,0.253233,0.186055,483,2113,2596,0.103348,0.038492,0.168085,0.059053,0.420492,36.14
0.3-0.4,0.351369,0.286875,918,2282,3200,0.127394,0.073159,0.181529,0.132212,0.602021,46.98
0.4-0.5,0.449702,0.424689,1469,1990,3459,0.137705,0.11707,0.158301,0.249283,0.760321,51.1
0.5-0.6,0.548881,0.582594,2035,1458,3493,0.139058,0.162177,0.115981,0.41146,0.876303,46.48
0.6-0.7,0.648997,0.714932,2212,882,3094,0.123174,0.176283,0.070161,0.587743,0.946464,35.87
0.7-0.8,0.747715,0.819686,1932,425,2357,0.093833,0.153969,0.033808,0.741712,0.980272,23.86
0.8-0.9,0.84795,0.89739,1513,173,1686,0.067121,0.120577,0.013762,0.862289,0.994034,13.17
0.9-1.0,0.968963,0.958403,1728,75,1803,0.071778,0.137711,0.005966,1.0,1.0,0.0
