In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from dateutil.parser import parse
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

In [2]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

In [3]:
random_state = 42

In [4]:
X_train = pd.read_csv( './data/feature_set_1/X_train_full.csv')
y_train = pd.read_csv( './data/feature_set_1/y_train.csv').values.ravel()

X_val = pd.read_csv( './data/feature_set_1/X_valid_full.csv')
y_val = pd.read_csv( './data/feature_set_1/y_valid.csv').values.ravel()

In [5]:
X_train.head()

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TRANSACTION_GOODS_AND_SERVICES_AMOUNT,TRANSACTION_CASHBACK_AMOUNT,MERCHANT_ID,IS_RECURRING_TRANSACTION,MCC_CODE,TAX_EXCEMPT_INDICATOR,ANNUAL_TURNOVER_CARD,...,BUSINESS_TYPE_Limited Liability Company (LLC),BUSINESS_TYPE_S Corporations,BUSINESS_TYPE_Sole Proprietorships,OUTLET_TYPE_Ecommerce,OUTLET_TYPE_Face to Face,OUTLET_TYPE_Face to Face and Ecommerce,Time_of_day_00:00-05:59,Time_of_day_06:00-11:59,Time_of_day_12:00-17:59,Time_of_day_18:00-23:59
0,4580711269062109,26074138,5.2,5.2,0.0,24816.0,0,5262,True,323074,...,1,0,0,1,0,0,0,1,0,0
1,481190465475763,83069431,73.54,73.54,0.0,8873.0,0,4900,False,293163,...,1,0,0,0,0,1,0,0,1,0
2,2487912002442406,67373352,27.3,27.3,0.0,6199.0,0,5734,False,303209,...,1,0,0,0,1,0,0,0,0,1
3,246492597153468,90196690,23.62,23.62,0.0,20045.0,0,5552,True,87613737,...,0,1,0,0,1,0,0,0,1,0
4,1803960674374446,4907158,78.38,78.38,0.0,29889.0,0,8220,False,29358,...,0,0,1,0,1,0,1,0,0,0


In [6]:
forest_model = RandomForestRegressor(n_estimators=100,random_state=random_state,verbose=1,n_jobs=-1)
forest_model.fit(X_train,y_train)



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.0min finished


In [8]:
train_pred = forest_model.predict(X_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.7s finished


In [11]:
roc_auc_score(y_train, train_pred)

0.999999999108971

In [12]:
val_pred = forest_model.predict(X_val)
roc_auc_score(y_val, val_pred)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.0s finished


0.7032345675443509

In [7]:
for n_esti in [4,8,16,32,64,128,256,512,1024]:
	forest_model = RandomForestRegressor(n_estimators=n_esti,random_state=random_state,verbose=1,n_jobs=-1,warm_start=True)
	forest_model.fit(X_train,y_train)
 
	train_predictions = forest_model.predict(X_train)
	print(f'Train MAE with {n_esti} estimators')
	print(mean_absolute_error(y_train, train_predictions))
 
	val_predictions = forest_model.predict(X_val)
	print(f'Val MAE with {n_esti} estimators')
	print(mean_absolute_error(y_val, val_predictions))
	print("\n")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   41.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   41.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


Train MAE with 4 estimators
0.0163350779028194


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Val MAE with 4 estimators
0.04436177062625808




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   57.7s remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  1.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.2s remaining:    0.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.2s finished


Train MAE with 8 estimators
0.016206144081703185


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.1s finished


Val MAE with 8 estimators
0.04424612105072328




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  2.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 out of  16 | elapsed:    0.8s finished


Train MAE with 16 estimators
0.016136318882475315


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 out of  16 | elapsed:    0.3s finished


Val MAE with 16 estimators
0.0439804395934136




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  4.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  32 out of  32 | elapsed:    1.5s finished


Train MAE with 32 estimators
0.0161546961381474


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  32 out of  32 | elapsed:    0.6s finished


Val MAE with 32 estimators
0.043987570014878165




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  9.4min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done  64 out of  64 | elapsed:    3.2s finished


Train MAE with 64 estimators
0.016220963098805503


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done  64 out of  64 | elapsed:    1.4s finished


Val MAE with 64 estimators
0.044128664450570745




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed: 17.1min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 128 out of 128 | elapsed:    6.5s finished


Train MAE with 128 estimators
0.01619584612066598


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 128 out of 128 | elapsed:    3.2s finished


Val MAE with 128 estimators
0.04397362662221972




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
