### Machine Learning model based in a work from Ricardo Santos (jnj)
* Split test and train data
* Get some insights
* Train the model

Import libraries

In [1]:
import pandas as pd, numpy as np, unicodedata
import matplotlib.pyplot as plt, seaborn as sns

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, cross_val_score, learning_curve, validation_curve
from sklearn.feature_selection import RFECV, mutual_info_regression
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import xgboost as xgb

In [3]:
import shap
from sklearn import metrics
import statsmodels.api as sm

Define Functions

In [4]:
def check_performance(model, X_test, y_test):
    print("MAE best model: %1.3f" % np.expm1(metrics.mean_absolute_error(y_test, model.predict(X_test))))
    print("Median Abs Err best model: %1.3f" % np.expm1(metrics.median_absolute_error(y_test, model.predict(X_test))))
    print("R\u00b2 best model: %1.3f" % metrics.r2_score(y_test, model.predict(X_test)))
    print("Explained Variance best model: %1.3f" % metrics.explained_variance_score(y_test, model.predict(X_test)))

Load data

In [21]:
avc = pd.read_parquet("../data/processed/avc-cuiaba.parquet")

0       2
1       4
2       2
3       8
4       5
       ..
1091    5
1092    7
1093    3
1094    2
1095    3
Name: LAG0, Length: 1096, dtype: int64

#### Creating column year to stratify the dataset

In [32]:
avc["year"] = avc.data.dt.year

#### Split dataset in train and test

In [33]:
strat_train_set, strat_test_set = train_test_split(
    avc, test_size=0.2, stratify=avc["year"], random_state=42
)

In [34]:
# Look with the proportion is the same for all the years as expected
strat_test_set["year"].value_counts() / len(strat_test_set)

2016    0.336364
2017    0.331818
2018    0.331818
Name: year, dtype: float64

Check distribution of target class

In [27]:
# quantile = QuantileTransformer(output_distribution='normal',n_quantiles=1000)
# normal   = PowerTransformer()
# y_qtl    = pd.DataFrame(quantile.fit_transform(avc["LAG0"]), columns=['quantile'])
# y_norm   = pd.DataFrame(normal.fit_transform(avc["LAG0"]), columns=['normal'])

Define Baseline Regression Model

In [37]:
def check_models(X, y, ids, n):
    models = [DummyRegressor(),\
              Ridge(),\
              KernelRidge(kernel='rbf'),\
              RandomForestRegressor(),\
              xgb.XGBRegressor(random_state=2001),\
              HistGradientBoostingRegressor(random_state=2001)]
    results = dict(model=[], score=[], std=[])
    for m in models:
        pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),\
                         ('scaler', StandardScaler()),\
                         ('model', m)])
        cv = GroupKFold(n_splits=n).split(X, y, ids)
        s = cross_val_score(pipe, X, y.values.ravel(), scoring='explained_variance', cv=cv)
        results['model'].append(str(pipe['model']).split('(')[0])
        results['score'].append(s.mean())
        results['std'].append(s.std())
    return results

In [38]:
avc = strat_train_set.drop(columns=["LOAVC", "LAG0", "DS", "data"])
avc_labels = strat_train_set.LOAVC.copy()

In [40]:
results = check_models(avc, avc_labels, 4, n=5)

TypeError: check_models() missing 1 required positional argument: 'ids'