## Modeling for performance

### implementing some learnings from Amazon Access (project 11)
- generating multiple datasets 
- zipping datasets with models 
- stacking models for results

In [1]:
import numpy as np 
import seaborn as sns 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn import metrics, linear_model, ensemble
from yellowbrick.regressor import residuals_plot, prediction_error
from fast_ml.model_development import train_valid_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from tpot import TPOTRegressor
import category_encoders as ce
import time
from xgboost import XGBRegressor
import warnings
encoders = {

    'BackwardDifferenceEncoder': ce.backward_difference.BackwardDifferenceEncoder,
    'OneHotEncoder': ce.one_hot.OneHotEncoder,
    
}


In [6]:
import os 
os.chdir('..')
from Utils.Metrics import regression as reg_metrics
os.chdir('./9. Clickstream data for online shopping')

In [7]:
clothing_data_df = pd.read_csv('./data/e-shop data and description/e-shop clothing 2008.csv',sep=',').drop(['Unnamed: 0'],axis=1)

In [8]:
X = clothing_data_df.drop('price', axis=1)
y = clothing_data_df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
selected_models = [
    "LR:dataset1",
    "LR:dataset2",
    "LR:dataset3",
    "LR:dataset4",
    "RFC:dataset1",
    "RFC:dataset2",
    "RFC:dataset3",
    "RFC:dataset4",
    "GBC:dataset1",
    "GBC:dataset2",
    "GBC:dataset3",
    "GBC:dataset4"
]

models = []
for item in selected_models:
    model_id, dataset = item.split(':')
    # instantiating models (with datasets)
    model = {'LR': linear_model.LogisticRegression,
             'GBC': ensemble.GradientBoostingClassifier,
             'RFC': ensemble.RandomForestClassifier,
             'ETC': ensemble.ExtraTreesClassifier}[model_id]() #I have never seen this done before.
    model.set_params()
    models.append((model, dataset))

datasets = [dataset for model, dataset in models]


In [18]:
X_train

Unnamed: 0,month,day,order,country,session ID,page 1 (main category),page 2 (clothing model),colour,location,model photography,page
98301,6,13,15,24,14326,3,C56,6,1,2,4
17580,4,9,1,29,2549,4,P16,7,6,1,1
124394,7,8,25,46,18259,3,C41,13,2,1,3
148102,7,29,23,29,21628,1,A16,1,6,1,1
94949,6,10,7,44,13863,3,C39,6,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...
50032,5,2,3,29,7178,4,P57,4,1,1,4
75456,5,24,2,29,10910,2,B24,11,2,1,2
120851,7,5,2,44,17757,1,A11,3,4,1,1
23160,4,13,35,29,3359,4,P42,6,2,2,3


In [9]:
# dataset 1
# Create a dataset where the features are the effects of a logistic regression trained on sparsified data.
def sparsify(X, X_test):
    """Return One-Hot encoded datasets."""
    enc = OneHotEncoder()
    enc.fit(np.vstack((X, X_test)))
    return enc.transform(X), enc.transform(X_test)

from sklearn import linear_model 
from sklearn.model_selection import KFold

Xe_train = np.zeros(X_train.shape)
Xe_test = np.zeros(X_test.shape)
n_cols = Xe_train.shape[1]

model = linear_model.LogisticRegression(C=2)
X_train, X_test = sparsify(X_train, X_test)

kfold = KFold(5).split(X_train,y)

for train, cv in kfold:
    model.fit(X_train[train], y[train])
    colindices = X_test.nonzero()[1]
    for i, k in zip(cv, range(len(cv))):
        for j in range(n_cols):
            z = colindices[n_cols*k + j]
            Xe_train[i, j] = model.coef_[0, z]

model.fit(X_train, y)
colindices = X_test.nonzero()[1]
for i in range(Xe_test.shape[0]):
    for j in range(n_cols):
        z = colindices[n_cols*i + j]
        Xe_test[i, j] = model.coef_[0, z]

return Xe_train, Xe_test


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
results_df = pd.DataFrame(columns = ['model','rmse','r2','mape'])

In [None]:
max_levels = round(clothing_data_df.columns.shape[0]*0.5)
categorical_features = clothing_data_df.select_dtypes(exclude=[np.number]).columns
cats_many = []
cats_few = []
for ft in categorical_features:
    levels = clothing_data_df[ft].unique().shape[0]
    if levels > max_levels:
        cats_many.append(ft)
    else:
        cats_few.append(ft)

In [None]:
numeric_features = clothing_data_df.select_dtypes([np.number]).drop(['price'], axis=1).columns
numeric_features

categorical_features = clothing_data_df.select_dtypes(exclude=[np.number]).columns
categorical_features

X = clothing_data_df.drop('price', axis=1)
y = clothing_data_df['price']

y = np.log(y)

X_train, X_test_tmp, y_train, y_test_tmp = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test_tmp, y_test_tmp, test_size=0.5)

del X_test_tmp, y_test_tmp

selected_model = XGBRegressor(tree_method = "gpu_hist",single_precision_histogram=True, gpu_id=0)


categorical_transformer_many_level = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', encoders['BackwardDifferenceEncoder']())
    ]
)    

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', encoders['OneHotEncoder']())
    ]
) 

numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numeric_transformer, numeric_features),
        ('categorical_many', categorical_transformer_many_level, cats_many),
        ('categorical', categorical_transformer, cats_few)
    ]
)

pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', selected_model)
    ]
)


In [None]:

pipe.named_steps['preprocessor'].fit(X_train)

    
numeric_feat_names = pipe.named_steps['preprocessor'].transformers_[0][2]
cat_feat_names = pipe.named_steps['preprocessor'].transformers_[1][2]
#redo_names = pipe.named_steps['preprocessor'].transformers_[2][1].named_steps['encoder'].get_feature_names()
#base_names = pipe.named_steps['preprocessor'].transformers_[2][2]   
#one_hot_feat_names=[]
#for i in range(len(base_names)):
#    one_hot_feat_names.append([base_names[i]+'_'+x.split('_')[-1] for x in redo_names if x[0] == str(i)])
    
feature_names = list(numeric_feat_names) + list(cat_feat_names) #+ [y for x in one_hot_feat_names for y in x]