In [26]:
import json
import math
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import statistics
import statsmodels.api as sm

warnings.filterwarnings('ignore')

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_curve, precision_recall_curve, confusion_matrix #, plot_confusion_matrix
from sklearn.metrics import accuracy_score, log_loss, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [28]:
# Add ../src to the module search path so project-local modules can be imported.
# Presumably s05_2_feature_engineering (imported in the next cell) lives there — TODO confirm.
sys.path.append(os.path.join('..', 'src'))

In [29]:
import importlib
import s05_2_feature_engineering
importlib.reload(s05_2_feature_engineering)
from s05_2_feature_engineering import build_polynomials, transform_label, treat_skewness

# Data capture

In [30]:
# Project data directories, all resolved relative to ../data.
inputs, models_reports, model_outputs, reports = (
    os.path.join('..', 'data', folder)
    for folder in ('03_processed', '04_models', '05_model_output', '06_reporting')
)

# Every split is a CSV in the processed-inputs directory, indexed by its 'id' column.
X_train, X_test, X_train_onehot, X_test_onehot, y_train, y_test = (
    pd.read_csv(os.path.join(inputs, filename), index_col='id')
    for filename in (
        'X_train.csv',
        'X_test.csv',
        'X_train_onehot.csv',
        'X_test_onehot.csv',
        'y_train.csv',
        'y_test.csv',
    )
)

# Machine Learning

In [31]:
from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# from xgboost.sklearn import XGBClassifier

# Calculating final score with a test set

In [32]:
def get_transformations(X_set, y_set, cols,
               build_polynomals_method=False, 
                label_transformation_type=None, do_treat_skewness=False,
               imputation=None, scaler=None,
               ):
    """Select feature columns and optionally scale and impute them.

    Parameters
    ----------
    X_set : pandas.DataFrame
        Feature table. It is not modified; a new frame is returned.
    y_set : object
        Target values. Returned unchanged.
    cols : list
        Column labels to keep, in the order they should appear.
    build_polynomals_method, label_transformation_type, do_treat_skewness
        Accepted for interface compatibility but currently ignored: the
        polynomial, label-transformation and skewness steps are not wired in.
    imputation : scalar, dict or Series, optional
        Value passed to ``DataFrame.fillna``. ``None`` (or ``False``) disables
        imputation; ``0`` is a valid fill value.
    scaler : object, optional
        Any object exposing ``fit_transform(X)``. Falsy values disable scaling.

    Returns
    -------
    tuple
        ``(X_transformed, y_set)`` where ``X_transformed`` is a DataFrame that
        keeps the selected column labels and the original index.
    """
    # Debug output: requested columns vs. columns available in the input.
    print(cols)
    print(X_set.columns.to_list())
    X_set = X_set[cols].copy()

    if scaler:
        # fit_transform may return a bare array, so capture labels and index first
        # and rebuild the frame from them.
        scaled_values = scaler.fit_transform(X_set)
        X_set = pd.DataFrame(scaled_values, columns=X_set.columns, index=X_set.index)

    # `is not None` so that a fill value of 0 is honoured; False still means "off".
    if imputation is not None and imputation is not False:
        # fillna returns a new frame; the result has to be kept.
        X_set = X_set.fillna(imputation)

    return X_set, y_set

In [33]:
# Feature names of the training split, as a plain Python list.
columns = list(X_train.columns)

In [34]:
# No label transformation is configured for this run.
label_transformation_type = None

# Keep only the training feature columns in the test split; every optional step is switched off.
X_test, y_test = get_transformations(
    X_test,
    y_test,
    columns,
    build_polynomals_method=None,
    do_treat_skewness=False,
    imputation=None,
    scaler=None,
)

['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'if_anomaly']
['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'if_anomaly']


# load results
Load the best hyperparameters of the chosen model, as obtained during cross-validation.

In [35]:
def load_json_results(model_type):
    """Read ``<model_type>.json`` from the model-output directory.

    The JSON document is expected to be keyed by model type; only the entry
    stored under ``model_type`` is returned.
    """
    results_path = os.path.join(model_outputs, model_type+'.json')
    with open(results_path, 'r') as handle:
        return json.load(handle)[model_type]

# build model to apply on test set

In [36]:
def build_model(ml_model_type, X, y):
    """Fit a regressor using the stored best parameters for ``ml_model_type``.

    Parameters
    ----------
    ml_model_type : str
        Name of the stored result set. Names starting with ``'tree_rf'`` build
        a ``RandomForestRegressor``; names starting with ``'tree_xgb'`` build
        an ``XGBRegressor``.
    X : pandas.DataFrame
        Feature table; it must contain every column listed in the stored
        ``'columns'`` entry.
    y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(model, cols)``: the fitted estimator and the list of feature
        columns it was trained on.

    Raises
    ------
    ValueError
        If ``ml_model_type`` matches neither supported prefix.
    """
    ml_model = load_json_results(ml_model_type)
    cols = ml_model['columns']

    if ml_model_type.startswith('tree_rf'):
        # Stored keys carry a 7-character prefix that is stripped here
        # (presumably a pipeline step prefix such as 'model__' — TODO confirm).
        params = {k[7:]: v for k, v in ml_model['best_params'].items()}
        model = RandomForestRegressor()
    elif ml_model_type.startswith('tree_xgb'):
        params = dict(ml_model['best_params'])
        model = XGBRegressor()
    else:
        raise ValueError(
            "Unsupported model type {!r}: expected a name starting with "
            "'tree_rf' or 'tree_xgb'".format(ml_model_type)
        )

    # Use the estimator API rather than raw setattr to apply hyperparameters.
    model.set_params(**params)

    # Train on the stored feature subset for every model type, so the returned
    # `cols` always describes exactly what the model expects at predict time.
    model.fit(X[cols], y)

    return model, cols

In [37]:
# Candidate result sets:
# ['tree_xgb', 'tree_xgb_num', 'tree_xgb_numcyc', 'tree_xgb_numcyc_smote', 'tree_rf']
ml_model = 'tree_xgb'

# Fit on the training split, then score the test split on the model's own feature columns.
model, columns = build_model(ml_model, X_train, y_train)
X_test = X_test.loc[:, columns]
y_pred = model.predict(X_test)

# calculate test score 
main metric: mean squared error for regression, f1_score for binary target

In [39]:
# print("\nCLASSIFICATION_REPORT:\n", classification_report(y_test, y_pred))
# Regression metrics on the test split, each rounded to three decimals.
for label, metric in (
    ('mean squared error:', mean_squared_error),
    ('r2_score', r2_score),
    ('mean_absolute_error', mean_absolute_error),
):
    print(label, round(metric(y_test, y_pred), 3))

mean squared error: 268.846
r2_score -3.052
mean_absolute_error 14.318


# sample prediction
predict first 10 entries

# rebuild model for entire dataset

In [None]:
# Stack train and test rows so the final model is refitted on every labelled observation.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and already returns a new object, so no explicit copy is needed.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

ml_model = 'tree_xgb'
# NOTE(review): build_model returns a (model, columns) tuple, so `final_model` holds
# that tuple rather than a bare estimator — confirm this is what consumers expect.
final_model = build_model(ml_model, X, y)

# save pickle of results

In [None]:
# Persist the refitted result next to the other model outputs.
file = os.path.join(model_outputs, 'trained_model.pkl')
with open(file, mode='wb') as f:
    pickle.dump(final_model, f)

# compare estimated value with true value for sample

In [None]:
# Predictions exist only for the test split, so they are paired with the test
# index and the test targets (assumes X_test and y_test rows are in the same order).
index_name = X_test.index.name
ordered_users = pd.DataFrame(
    {
        'estimated': np.ravel(y_pred),
        # y_test is loaded as a DataFrame; iterating it would yield column labels,
        # so flatten its values instead.
        'true_value': np.ravel(y_test),
    },
    index=X_test.index,
)
ordered_users.index.name = index_name
# ordered_users
ordered_users.tail(10).sort_values(by='true_value', ascending=True)

# Decision making

# Sort entries
Retrieve the IDs and sort them by probability.

In [None]:
# Build a per-ID table pairing the predicted probability with the true label.
index_name = X_train.index.name
# NOTE(review): `y_pred_prob` is not defined in any visible cell, so this line raises
# NameError as written — confirm where the probabilities should come from.
y_pred = y_pred_prob
# NOTE(review): presumably y_test has a column named 'y' — verify against y_test.csv.
ordered_users = pd.DataFrame(list(zip(X_test.index, y_pred, y_test['y'])), columns = [index_name, 'probabilidade', 'valor_verdadeiro']).set_index(index_name)
# Sorting by descending probability is currently disabled:
# ordered_users.sort_values(by='probabilidade', ascending=False, inplace=True)
ordered_users.head()

Now let's look at the number of observations in the test set and how many of them are positive or negative.

In [None]:
# Count test-set rows overall and split them by true label (1 = positive).
users_total = len(ordered_users)
is_positive = ordered_users['valor_verdadeiro'] == 1
users_yes = int(is_positive.sum())
users_no = users_total - users_yes

share_template = 'ou {}% do total'
print('Size of test set:', users_total)
print('number of positive cases:', users_yes,
      share_template.format(round(users_yes/users_total*100, 2)))
print('number of negative cases:', users_no,
      share_template.format(round(users_no/users_total*100, 2)))

# Final thoughts

If we invested in only the 8000 users with the highest probability of responding to the campaign, we would reach 3000 positive cases. But if we tried to increase that number to 4000 (a 1/3 increase!), we would need to double the investment, in other words, target more than 16000 users!

Therefore, I would suggest a threshold close to 0.1 (10%). With that, we get many true positives while avoiding a high cost that could reduce ROI, given the consideration that the marginal cost would be low.

Beyond that, teams interested in identifying the most relevant variables can consult the feature importance results.