In [3]:
!pip install statsmodels==0.12.1

Collecting statsmodels==0.12.1
  Downloading statsmodels-0.12.1-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 12.3 MB/s 
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 43.9 MB/s 
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.1
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_text

In [11]:
LOGIT_data = pd.read_csv("logit_data.csv")
#LOGIT_data = LOGIT_data.drop(columns=["Insurance"])

In [18]:
#train test model
LOGIT_data = sm.add_constant(LOGIT_data)
n = LOGIT_data.shape[0]
k = int(n*0.3)
VAR_DROP = []#"Atm", "Fashion", "Beauty", "Electronics", "Drugstore", "Drugstore", "Groceries", "Digital Services"]
VARS = ["total_spent", "client_age", "Children", "Culture", "client_gender_f", "Pets", "Travel", "Bills And Household", "Car", "Consumer Goods", "Financial Services", "Food And Drink", "Freetime", "House And Garden", "Shopping Online", "Sport"]
#VARS = ["total_spent", "client_age", "client_gender_f", 'Financial Services', 'House And Garden', 'Food And Drink', 'Car']
endog_train = LOGIT_data.iloc[k:n,:]["insurance01"]
exog_train = LOGIT_data.iloc[k:n,:].drop(columns=["client_id","insurance01"]+VAR_DROP)

model_crt = RandomForestClassifier(n_estimators = 8000, criterion = 'gini', max_depth = 50, min_samples_leaf=3, n_jobs = -1, verbose = 1, class_weight={0:1, 1:150})
model_crt.fit(X = exog_train, y = endog_train)

In [19]:
exog_test = LOGIT_data.iloc[:k,:].drop(columns=["client_id","insurance01"]+VAR_DROP)
compare_test = LOGIT_data.iloc[:k]["insurance01"]
test_pred = model_crt.predict(exog_test)
train_pred = model_crt.predict(exog_test)
fpr, tpr, threshold = metrics.roc_curve(compare_test, train_pred)
# Area under curve
roc_auc = metrics.auc(fpr, tpr)
# GINI coefficient
GINI = 2 * (roc_auc - 0.5)
print("Gini=",GINI)
print("AUC=",roc_auc)

train_pred = (train_pred > 0.5).astype(int)
z = compare_test == train_pred
zz = train_pred[compare_test == 0]
print(zz.astype(float).mean())
zz = train_pred[compare_test == 1]
print(zz.astype(float).mean())
print(metrics.confusion_matrix(compare_test, train_pred))

print("acc=",sum(z) / len(train_pred))
print("sommersD=",(2*sum(z)-len(train_pred)) / len(train_pred))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed:    2.8s
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done 4046 tasks      | elapsed:    4.7s
[Parallel(n_jobs=2)]: Done 4996 tasks      | elapsed:    5.9s
[Parallel(n_jobs=2)]: Done 6046 tasks      | elapsed:    7.2s
[Parallel(n_jobs=2)]: Done 7196 tasks      | elapsed:    8.5s
[Parallel(n_jobs=2)]: Done 8000 out of 8000 | elapsed:    9.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  

In [10]:
hyperparameter_grid = {'n_estimators': [100, 200, 300], 'max_depth': [32, 48], "class_weight":[{0:1, 1:150}, {0:1, 1:250}, {0:1, 1:400}]}  # Make the grid densier...
# hyperparameter grid search
grid_search = GridSearchCV(model_crt, param_grid = hyperparameter_grid, cv=5, n_jobs = -1, verbose = 1)
grid_search.fit(X = exog_train, y = endog_train)
print(grid_search.cv_results_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 106.2min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.7min
{'mean_fit_time': array([ 56.73588042, 112.8370378 , 169.9799511 ,  76.88531556,
       154.33021765, 231.15173883,  59.50070963, 117.40009499,
       176.21194105,  83.5714026 , 165.26854911, 248.91521721,
        59.2433742 , 118.9396915 , 177.02635989,  85.84857965,
       171.26596498, 246.1007503 ]), 'std_fit_time': array([ 0.81190524,  1.53316863,  2.49509317,  3.22819521,  2.32179649,
        3.19912196,  1.52452346,  1.52397941,  1.69983927,  1.85301464,
        3.10016668,  4.1925736 ,  1.75654454,  1.62117768

In [None]:
[VARS[i] for i in [14, 12, 8, 10, 6]]

['Shopping Online', 'Freetime', 'Car', 'Financial Services', 'Travel']

VARS = ["total_spent", "client_age", "Children", "Culture", "client_gender_f", "Pets", "Travel", "Bills And Household", "Car", "Consumer Goods", "Financial Services", "Food And Drink", "Freetime", "House And Garden", "Shopping Online", "Sport"]
model_crt = RandomForestClassifier(n_estimators = 8000, criterion = 'gini', max_depth = 50, min_samples_leaf=3, n_jobs = -1, verbose = 1, class_weight={0:1, 1:150})

Dává 8% fp, 16% fn.