In [29]:
import os
import sys
import warnings
from datetime import datetime

import json
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import statsmodels.api as sm
import statistics
from pprint import pprint

%matplotlib inline
warnings.filterwarnings('ignore')

In [30]:
sys.path.append(os.path.join('..', 'src'))

In [31]:
# sys.path.append(os.path.join('..', 'src'))
# sys.path.append(os.path.join('..', 'src', 'c04model'))

import utils
import importlib
importlib.reload(utils)
# importlib.reload(model)

from model import plot_scores
from model import timer, measure_prediction_time, apply_ml_model, save_model_parameters, save_model_metrics

In [32]:
inputs = os.path.join('..', 'data', '03_processed')
outputs = os.path.join('..', 'data', '03_processed')
models_reports = os.path.join('..', 'data', '04_models')
model_outputs = os.path.join('..', 'data', '05_model_output')
reports = os.path.join('..', 'data', '06_reporting')

# Data capture

In [33]:
def capture_data(file, label=False):
    path = os.path.join(inputs, file+'.csv')
    return pd.read_csv(path, index_col='id')

In [34]:
X_train                  = capture_data('X_train')
X_test                   = capture_data('X_test')
X_train_oh               = capture_data('X_train_oh')
X_test_oh                = capture_data('X_test_oh')

y_train                  = capture_data('y_train', label=True)
y_test                   = capture_data('y_test', label=True)

# Corroboração das features escolhidas
Podemos acrescentar diferentes formas de fazer a seleção.
* Correlação de Pearson
* Eliminação Recursiva de Features (RFE)
* Lasso

In [35]:
from sklearn.preprocessing import MinMaxScaler

In [36]:
feature_names_oh = X_train_oh.columns.to_list()
feature_names_ord = X_train.columns.to_list()

In [37]:
num_features = 5

In [38]:
X_oh_norm = MinMaxScaler().fit_transform(X_train_oh)
X_oh_norm = pd.DataFrame(X_oh_norm, columns = feature_names_oh)

X_ord_norm = MinMaxScaler().fit_transform(X_train)
X_ord_norm = pd.DataFrame(X_ord_norm, columns = feature_names_ord)

## Correlação de Pearson

In [39]:
def cor_selector(X, y, num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calcula as correlaçoes com y para cada variável
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # substitui NaN por 0 onde houver
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_mask = [True if i in cor_feature else False for i in feature_name]
    return cor_feature

In [40]:
cor_dict = {'oh':{}, 'ord':{}}

for k,df in zip(['oh', 'ord'], [X_oh_norm, X_ord_norm]):
    print('Seleção de', num_features, 'variáveis')
    cor_dict[k]['features'] = cor_selector(X_train_oh, y_train.y, num_features)

Seleção de 5 variáveis
Seleção de 5 variáveis


In [41]:
cor_dict

{'oh': {'features': ['dummy_relationship_Own_child',
   'age',
   'dummy_marital_status_Never_married',
   'education_num',
   'dummy_marital_status_Married_civ_spouse']},
 'ord': {'features': ['dummy_relationship_Own_child',
   'age',
   'dummy_marital_status_Never_married',
   'education_num',
   'dummy_marital_status_Married_civ_spouse']}}

## Lasso (regularização L1)
Usando Lasso em regressão logística. Lasso força que algumas features sejam reponderadas para zero, sendo perfeito para feature selection, diferentemente de Ridge, que não repondera para zero.

Nesse caso, os dados adequados são os provenientes de One-Hot Encoding.

In [42]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [43]:
def lr_selector(X, y, num_feats):
    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear'), max_features=num_features)
    embeded_lr_selector.fit(X_ord_norm, y)

    support = embeded_lr_selector.get_support()
    feature = X_ord_norm.loc[:,support].columns.tolist()
    print(str(len(feature)), 'selected features')
    return feature

In [44]:
lr_dict = {'oh':{}, 'ord':{}}

# print('Seleção de', num_features, 'variáveis')
key_str = 'oh'
df_str = 'X_'+key_str+'_norm'
print('Base usada:', df_str)
lr_dict[key_str]['features'] = lr_selector(eval(df_str), y_train, num_features)

Base usada: X_oh_norm
5 selected features


In [45]:
lr_dict

{'oh': {'features': ['age',
   'education_num',
   'marital_status',
   'capital_gain',
   'hours_per_week']},
 'ord': {}}

### Compara todos os resultados
Aqui comparamos todos os resultados, mas ainda será necessário comparar com o modelo final de Machine Learning. Vamos deixar as considerações mais para frente, mas aqui já criamos a tabela de comparações.

In [46]:
selected_features = list(cor_dict['oh']['features'])
# lr_dict
selected_features.extend(lr_dict['oh']['features'])
selected_features = list(set(selected_features))
print(selected_features)

['age', 'hours_per_week', 'dummy_marital_status_Married_civ_spouse', 'capital_gain', 'dummy_relationship_Own_child', 'dummy_marital_status_Never_married', 'education_num', 'marital_status']


In [47]:
cor_mask = [True if i in cor_dict['oh']['features'] else False for i in selected_features]
lr_mask = [True if i in lr_dict['oh']['features'] else False for i in selected_features]

In [48]:

feature_selection_df = pd.DataFrame({'Feature':selected_features, 'Pearson':cor_mask, 'lasso':lr_mask})

# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)

# display the top ones
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df

Unnamed: 0,Feature,Pearson,lasso,Total
1,education_num,True,True,2
2,age,True,True,2
3,marital_status,False,True,1
4,hours_per_week,False,True,1
5,dummy_relationship_Own_child,True,False,1
6,dummy_marital_status_Never_married,True,False,1
7,dummy_marital_status_Married_civ_spouse,True,False,1
8,capital_gain,False,True,1


# Using ML model

In [49]:
result_file = os.path.join(model_outputs, 'trained_model.pkl')
with open(result_file, 'rb') as f:
    clf_rf_validated = pickle.load(f) 

In [50]:
names = X_test.columns.to_list()
importances = clf_rf_validated.feature_importances_

In [51]:
df_importances = pd.DataFrame.from_dict({names[i]: importances[i] for i in range(len(names))}, orient='index', columns=['importance'])
df_importances.sort_values(by='importance', ascending=False, inplace=True)
df_importances.reset_index(inplace=True)

In [52]:
def check_in_selector(var, f_selector, fs_df):
    check = fs_df[fs_df['Feature'] == var][f_selector].to_list()
    if check and check[0] == True:
        return True
    else: return False

In [53]:
for selector in ['Pearson', 'lasso']:
    df_importances[selector] = df_importances['index'].apply(lambda x: check_in_selector(x, selector, feature_selection_df))
df_importances.head(7)

Unnamed: 0,index,importance,Pearson,lasso
0,age,0.213695,True,True
1,capital_gain,0.12014,False,True
2,hours_per_week,0.107835,False,True
3,marital_status,0.107098,False,True
4,relationship,0.092552,False,False
5,occupation,0.092524,False,False
6,education_num,0.086789,True,True


# Econometrics (with statsmodels) (incomplete section!!!)
Using statsmodels to analyse direct influence of each feature.

In [54]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [55]:
data = X_train.copy()
data['y'] = y_train

In [56]:
sm_model = ols("y ~ capital_gain", data=data).fit()
sm_model_summary = sm_model.summary()
sm_model_summary

0,1,2,3
Dep. Variable:,y,R-squared:,0.05
Model:,OLS,Adj. R-squared:,0.05
Method:,Least Squares,F-statistic:,1208.0
Date:,"Fri, 21 May 2021",Prob (F-statistic):,5.129999999999999e-258
Time:,03:45:52,Log-Likelihood:,-12388.0
No. Observations:,22792,AIC:,24780.0
Df Residuals:,22790,BIC:,24800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2268,0.003,81.313,0.000,0.221,0.232
capital_gain,1.317e-05,3.79e-07,34.761,0.000,1.24e-05,1.39e-05

0,1,2,3
Omnibus:,3940.365,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6037.252
Skew:,1.245,Prob(JB):,0.0
Kurtosis:,2.601,Cond. No.,7440.0
