In [53]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from time import time

# For Preprocessing
from prog import LabelEncoding, Scaler, OneHotEncoding
from sklearn.pipeline import Pipeline

# For Learning
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import r2_score, explained_variance_score, max_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import catboost
import lightgbm
import xgboost

import warnings
warnings.filterwarnings('ignore')

# Data Pre-Processing

In [209]:
data = pd.read_csv('data/insurance.csv')

(X_train, X_valid,
 y_train, y_valid) = train_test_split(data.drop('charges', axis=1),
                                     data.charges, shuffle=True,
                                     test_size=0.2, random_state=42)

### Use classes from prog.py

# Model testing

In [262]:
def get_gs(regr, parameters, le=True, ohe=True, scaler=False, need_print=False, cv=None):
    steps = []
    if le:
        steps += [('le', LabelEncoding())]
    if ohe:
        steps += [('ohe', OneHotEncoding())]
    if scaler:
        steps += [('scaler', Scaler())]
    preprocessing = Pipeline(steps=steps)
    pipe = Pipeline(steps=[('preproc', preprocessing),
                            ('model', regr)])

    if need_print:
        print("Performing grid search...")
        t0 = time()
    
    gs = GridSearchCV(pipe, parameters, scoring='r2', cv=cv)
    gs.fit(X_train, y_train)
    if need_print:
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % gs.best_score_)
        print("Best parameters set:")
        best_parameters = gs.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return gs, gs.best_estimator_.steps[0][1]

In [294]:
def get_estimation(regr, le=True, ohe=True, scaler=False):
    t0 = time()
    steps = []
    if le:
        steps += [('le', LabelEncoding())]
    if ohe:
        steps += [('ohe', OneHotEncoding())]
    if scaler:
        steps += [('scaler', Scaler())]
    pipe = Pipeline(steps=steps)
    
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    res = cross_val_score(regr, pipe.fit_transform(data.drop('charges', axis=1)),
                          data.charges, cv=cv, scoring='r2')
    print("Mean r2-score: %.2f%% (%.2f%%)" % (res.mean()*100, res.std()*100))
    print(f"All r2-scores: {res}")
    model_pipe = Pipeline(steps=[('prepr', pipe),
                                 ('model', regr)
    ])
    model_pipe.fit(X_train, y_train)
    print(f'R2 - {r2_score(y_valid, model_pipe.predict(X_valid)):.4}')
    print("done in %0.3fs" % (time() - t0))
    return regr, pipe, res

In [111]:
def get_res_grid_search(regr, gs_res, preprocessing):
    X_valid_pipe = preprocessing.transform(X_valid)
    print('Before Grid Search:')
    print(f'R2 - {r2_score(y_valid, regr.predict(X_valid_pipe)):.4}')
    print(f'Expl variance - {explained_variance_score(y_valid, regr.predict(X_valid_pipe)):.4}')
    print()
    print('After Grid Serch:')
    print(f'R2 - {r2_score(y_valid, gs_res.best_estimator_.predict(X_valid)):.4}')
    print(f'Expl variance - {explained_variance_score(y_valid, gs_res.best_estimator_.predict(X_valid)):.4}')

## Catboost using categorical feature

In [295]:
ctbst_cat = catboost.CatBoostRegressor(cat_features=['region'], verbose=False)
ctbst_cat = get_estimation(ctbst_cat, ohe=False)[0]

Mean r2-score: 84.92% (3.00%)
All r2-scores: [0.86246463 0.89216229 0.8200967  0.82211456]
R2 - 0.8732
done in 37.014s


In [157]:
grid_ctbst_cat = {'model__learning_rate': [0.02, 0.009, 0.01, 0.03],
                  'model__depth': [4, 5, 3],
                  'model__l2_leaf_reg': [5, 6, 7, 9, 8]
}
gs_ctbst_cat, preproc_ctbst_cat = get_gs(ctbst_cat, grid_ctbst_cat, ohe=False, need_print=True)

get_res_grid_search(ctbst_cat, gs_ctbst_cat, preproc_ctbst_cat)

Performing grid search...
done in 1117.383s

Best score: 0.851
Best parameters set:
	model__depth: 4
	model__l2_leaf_reg: 5
	model__learning_rate: 0.009
Before Grid Search:
R2 - 0.8732
Expl variance - 0.8732

After Grid Serch:
R2 - 0.8817
Expl variance - 0.8818


#### Use catboost model best estimator

In [305]:
ctbst_cat = catboost.CatBoostRegressor(cat_features=['region'], verbose=False, 
                                       learning_rate=0.009, depth=4, l2_leaf_reg=5,
                                       loss_function='RMSE')
ctbst_cat, preproc_ctbst_cat, res_ctbst_cat = get_estimation(ctbst_cat, ohe=False)

Mean r2-score: 85.98% (2.70%)
All r2-scores: [0.86983217 0.89892027 0.82890297 0.84137472]
R2 - 0.8817
done in 21.438s


## Catboost

In [297]:
ctbst = catboost.CatBoostRegressor(task_type=None, verbose=False)
ctbst= get_estimation(ctbst)[0]

Mean r2-score: 83.82% (2.65%)
All r2-scores: [0.85864864 0.86992707 0.8147745  0.80947824]
R2 - 0.8637
done in 6.898s


In [118]:
grid_ctbst = {'model__learning_rate': [0.01, 0.03, 0.1, 0.25],
              'model__depth': [4, 5, 3, 2],
              'model__l2_leaf_reg': [5, 7, 9, 11]
}
gs_ctbst, preproc_ctbst = get_gs(ctbst, grid_ctbst, need_print=True)

get_res_grid_search(ctbst, gs_ctbst, preproc_ctbst)

Performing grid search...
done in 210.244s

Best score: 0.852
Best parameters set:
	model__depth: 3
	model__l2_leaf_reg: 5
	model__learning_rate: 0.01
Before Grid Search:
R2 - 0.8637
Expl variance - 0.8638

After Grid Serch:
R2 - 0.8822
Expl variance - 0.8824


#### Use catboost model best estimator

In [306]:
ctbst = catboost.CatBoostRegressor(task_type=None, verbose=False, 
                                   learning_rate=0.01, depth=3, 
                                   l2_leaf_reg=5, loss_function='RMSE')
ctbst, preproc_ctbst, res_ctbst = get_estimation(ctbst)

Mean r2-score: 85.99% (2.72%)
All r2-scores: [0.86980985 0.89943728 0.82872846 0.84155767]
R2 - 0.8822
done in 3.625s


## Random Forest

In [299]:
rf = RandomForestRegressor()
rf = get_estimation(rf)[0]

Mean r2-score: 83.29% (2.19%)
All r2-scores: [0.84758743 0.85804471 0.80136046 0.82460418]
R2 - 0.8606
done in 1.596s


In [125]:
grid_rf = {'model__n_estimators': [200, 300, 400, 500],
           'model__max_depth': [4, 6, 10, 3, 2],
           'model__min_samples_split': [3, 7, 9, 2]
}
gs_rf, preproc_rf = get_gs(rf, grid_rf, need_print=True)

get_res_grid_search(rf, gs_rf, preproc_rf)

Performing grid search...
done in 214.633s

Best score: 0.847
Best parameters set:
	model__max_depth: 4
	model__min_samples_split: 3
	model__n_estimators: 500
Before Grid Search:
R2 - 0.8622
Expl variance - 0.8635

After Grid Serch:
R2 - 0.8716
Expl variance - 0.8721


#### Use random forest model best estimator

In [307]:
rf = RandomForestRegressor(n_estimators=500,
                           max_depth=4, min_samples_split=3)
rf, preproc_rf, res_rf = get_estimation(rf)

Mean r2-score: 85.49% (2.47%)
All r2-scores: [0.8585246  0.89305331 0.82683147 0.84103084]
R2 - 0.8717
done in 4.132s


## Extra trees

In [301]:
et = ExtraTreesRegressor()
et = get_estimation(et)

Mean r2-score: 81.03% (2.56%)
All r2-scores: [0.83916311 0.82823395 0.77329017 0.80038029]
R2 - 0.8444
done in 1.404s


In [127]:
grid_et = {'model__n_estimators': [200, 300, 400, 500],
           'model__max_depth': [4, 6, 10, 8, 7],
           'model__min_samples_split': [3, 7, 9, 8, 12]
}
gs_et, preproc_et = get_gs(et, grid_et, need_print=True)

get_res_grid_search(et, gs_et, preproc_et)

Performing grid search...
done in 205.431s

Best score: 0.846
Best parameters set:
	model__max_depth: 7
	model__min_samples_split: 3
	model__n_estimators: 300
Before Grid Search:
R2 - 0.8465
Expl variance - 0.8469

After Grid Serch:
R2 - 0.8769
Expl variance - 0.8772


#### Use extra trees model best estimator

In [308]:
et = ExtraTreesRegressor(max_depth=7, min_samples_split=3,
                         n_estimators=300)
et, preproc_et, res_et = get_estimation(et)

Mean r2-score: 85.29% (2.81%)
All r2-scores: [0.86964469 0.88736537 0.81345958 0.84115631]
R2 - 0.8765
done in 2.248s


## XGBoost

#### first xgboost

In [303]:
xgbst1 = xgboost.XGBRFRegressor()
xgbst1 = get_estimation(xgbst1)[0]

Mean r2-score: 85.37% (2.41%)
All r2-scores: [0.86169431 0.88727083 0.82133046 0.84462885]
R2 - 0.8771
done in 0.433s


In [129]:
grid_xgbst1 = {'model__lambda': [0.1, 0.5, 1.2, 1.6],
               'model__max_depth': [3, 8, 5, 11, 14],
               'model__learning_rate': [0.1, 0.4, 0.6, 0.8]
}
gs_xgbst1, preproc_xgbst1 = get_gs(xgbst1, grid_xgbst1, need_print=True)

get_res_grid_search(xgbst1, gs_xgbst1, preproc_xgbst1)

Performing grid search...
done in 49.156s

Best score: 0.765
Best parameters set:
	model__lambda: 0.1
	model__learning_rate: 0.8
	model__max_depth: 5
Before Grid Search:
R2 - 0.8771
Expl variance - 0.8775

After Grid Serch:
R2 - 0.8066
Expl variance - 0.843


#### Better results in xgboost regressor before grid search

In [309]:
xgbst1 = xgboost.XGBRFRegressor()
xgbst1, preproc_xgbst1, res_xgbst1 = get_estimation(xgbst1)

Mean r2-score: 85.37% (2.41%)
All r2-scores: [0.86169431 0.88727083 0.82133046 0.84462885]
R2 - 0.8771
done in 0.440s


#### second xgboost

In [310]:
xgbst2 = xgboost.XGBRegressor()
xgbst2 = get_estimation(xgbst2)[0]

Mean r2-score: 80.36% (2.74%)
All r2-scores: [0.82568851 0.83346349 0.76562142 0.78980613]
R2 - 0.836
done in 0.676s


In [131]:
grid_xgbst2 = {'model__lambda': [0.1, 0.5, 1.2, 1.6],
               'model__max_depth': [3, 8, 5, 11, 14],
               'model__learning_rate': [0.1, 0.4, 0.6, 0.8]
}
gs_xgbst2, preproc_xgbst2 = get_gs(xgbst2, grid_xgbst2, need_print=True)

get_res_grid_search(xgbst2, gs_xgbst2, preproc_xgbst2)

Performing grid search...
done in 48.585s

Best score: 0.844
Best parameters set:
	model__lambda: 0.1
	model__learning_rate: 0.1
	model__max_depth: 3
Before Grid Search:
R2 - 0.836
Expl variance - 0.8362

After Grid Serch:
R2 - 0.8792
Expl variance - 0.8795


#### Use xgboost model best estimator

In [311]:
xgbst2 = xgboost.XGBRegressor(lambada=0.1, max_depth=3, learning_rate=0.1)
xgbst2, preproc_xgbst2, res_xgbst2 = get_estimation(xgbst2)

Mean r2-score: 85.36% (2.80%)
All r2-scores: [0.86166304 0.895366   0.82166102 0.8358003 ]
R2 - 0.8792
done in 0.354s


## LightGBM

In [312]:
lgbm = lightgbm.LGBMRegressor()
lgbm = get_estimation(lgbm)[0]

Mean r2-score: 83.78% (2.94%)
All r2-scores: [0.85054679 0.87873784 0.80143121 0.82047224]
R2 - 0.8654
done in 0.485s


In [289]:
grid_lgbm = {'model__learning_rate': [0.2, 0.3, 0.4, 0.5],
             'model__num_leaves': [4, 5, 6, 7, 10],
             'model__boosting': ['rf', 'dart', 'goss'],
             'model__max_depth': [2, 3, 4, -1],
             'model__lambda_l2': [0, 0.2, 0.4, 0.7, 1]
}
gs_lgbm, preproc_lgbm = get_gs(lgbm, grid_lgbm, need_print=True)

get_res_grid_search(lgbm, gs_lgbm, preproc_lgbm)

Performing grid search...
done in 315.629s

Best score: 0.850
Best parameters set:
	model__boosting: 'dart'
	model__lambda_l2: 0.4
	model__learning_rate: 0.3
	model__max_depth: 2
	model__num_leaves: 4
Before Grid Search:
R2 - 0.8654
Expl variance - 0.8655

After Grid Serch:
R2 - 0.8833
Expl variance - 0.8833


#### Use xgboost model best estimator

In [313]:
lgbm = lightgbm.LGBMRegressor(boosting='dart', learning_rate=0.3, 
                              lambda_l2=0.4, num_leaves=4, max_depth=2)
lgbm, preproc_lgbm, res_lgbm = get_estimation(lgbm)

Mean r2-score: 86.00% (2.69%)
All r2-scores: [0.86930083 0.89927074 0.82900945 0.84241878]
R2 - 0.8833
done in 0.236s


# Analyse results

#### Good models:
* 2 catboost models
* lightgbm model
* 2 xgboost models
* random forest
* extra trees

Get tab with results:

In [315]:
models = [ctbst_cat, ctbst, lgbm, xgbst1, xgbst2, rf, et]
preprocs = [preproc_ctbst_cat, preproc_ctbst, preproc_lgbm, 
            preproc_xgbst1, preproc_xgbst2, preproc_rf, preproc_et]
cv_res = list(map(np.mean, [res_ctbst_cat, res_ctbst, res_lgbm, 
          res_xgbst1, res_xgbst2, res_rf, res_et]))
names = ['Catboost categorial', 'Catboost ohe',
         'LightGBM', 'XGBoost (based on rf)',
         'XGBoost', 'Random Forest', 'Extra Trees']

In [319]:
table = []
for name, reg, prepr, cv_mean in zip(names, models, preprocs, cv_res): 
    X_valid_pipe = prepr.transform(X_valid)
    cv_mean = f'{cv_mean:.4f}'
    r2 = f'{r2_score(y_valid, reg.predict(X_valid_pipe)):.4f}'
    ev = f'{explained_variance_score(y_valid, reg.predict(X_valid_pipe)):.4f}'
    max_err = f'{max_error(y_valid, reg.predict(X_valid_pipe)):.4f}'
    table += [[name, cv_mean, r2, ev, max_err]]
print(tabulate(table, headers=['', 'CV score (mean)', 'R2-score', 'Explained Varience', 'Max Error'], tablefmt='fancy_grid'))

╒═══════════════════════╤═══════════════════╤════════════╤══════════════════════╤═════════════╕
│                       │   CV score (mean) │   R2-score │   Explained Varience │   Max Error │
╞═══════════════════════╪═══════════════════╪════════════╪══════════════════════╪═════════════╡
│ Catboost categorial   │            0.8598 │     0.8817 │               0.8818 │     20990.8 │
├───────────────────────┼───────────────────┼────────────┼──────────────────────┼─────────────┤
│ Catboost ohe          │            0.8599 │     0.8822 │               0.8824 │     20935.1 │
├───────────────────────┼───────────────────┼────────────┼──────────────────────┼─────────────┤
│ LightGBM              │            0.86   │     0.8833 │               0.8833 │     21210.1 │
├───────────────────────┼───────────────────┼────────────┼──────────────────────┼─────────────┤
│ XGBoost (based on rf) │            0.8537 │     0.8771 │               0.8775 │     21769.2 │
├───────────────────────┼───────────────

In [320]:
ctbst_cat.save_model('catboost_model')

In [322]:
lgbm.booster_.save_model('lightgbm_model.mdl', num_iteration=lgbm.booster_.best_iteration) 

<lightgbm.basic.Booster at 0x28506beaf88>