Out-of-the-box (OOTB) baseline using the settings suggested by TPOT. Results vary substantially between reruns with identical settings, which makes it hard to tell whether tuning is helping:

    *  R2 for test grs data: 0.705872893854177
    
    *  negMedAE for test grs data: -2243.18338383248

    *  R2 for test gcs data: 0.7512150347720993

    *  negMedAE for test gcs data: -4511.080341844702

In [2]:
from helpers import utils
from os.path import join, dirname
from dotenv import load_dotenv
import os
import pickle
from snowflake import connector
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import Normalizer, QuantileTransformer, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from tpot.export_utils import set_param_recursive
from sklearn.metrics import r2_score, make_scorer


# Render floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Load the Snowflake connection settings from a .env file into module-level constants.
# NOTE(review): the path literal uses Windows-style backslashes, so dirname() only strips the
# trailing separator on Windows; on other platforms this presumably resolves to '.env' in the
# current directory — confirm where the notebook is expected to run.
dotenv_path = join(dirname('streamlit_grs_fit\\app\\'), '.env')
load_dotenv(dotenv_path)
# os.getenv returns None for any variable that is not set.
SF_ACCOUNT = os.getenv('SF_ACCOUNT')
SF_USER = os.getenv('SF_USER')
SF_PASSWORD = os.getenv('SF_PASSWORD')
SF_ROLE = os.getenv('SF_ROLE')
SF_WAREHOUSE = os.getenv('SF_WAREHOUSE')
SF_DATABASE = os.getenv('SF_DATABASE')
SF_SCHEMA = os.getenv('SF_SCHEMA')

def load_data(query):
    """Run ``query`` against Snowflake and return the whole result set as a DataFrame.

    Connection settings are taken from the module-level ``SF_*`` constants.
    The cursor and the connection are closed before returning, including when
    connecting succeeds but the query itself raises.

    Args:
        query: SQL text to execute.

    Returns:
        The result of ``fetch_pandas_all()`` for the executed query.
    """
    conn = connector.connect(
        user=SF_USER,
        password=SF_PASSWORD,
        account=SF_ACCOUNT,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        schema=SF_SCHEMA,
        role=SF_ROLE,
    )
    try:
        cur = conn.cursor()
        try:
            return cur.execute(query).fetch_pandas_all()
        finally:
            cur.close()
    finally:
        # Always release the session so repeated notebook runs do not leak connections.
        conn.close()

In [3]:
# Division codes whose DIV_<code>_DIRECT_COST columns are selected (00-19 plus a few later ones).
div_codes = [*range(20), 21, 22, 23, 26, 27, 28, 31, 32, 33, 34, 55]
select_cols = (
    ['JOB', 'DIRECT_COST']
    + [f'DIV_{code:02d}_DIRECT_COST' for code in div_codes]
    + ['GCS_COST', 'GRS_COST']
)
query = 'select ' + ','.join(select_cols) + ' from sandbox.global.ml_grs_fit '

# Index the result by JOB and replace missing values with 0.
df_data = pd.DataFrame(load_data(query).set_index('JOB')).fillna(0)

In [4]:
# Keep only the jobs where both targets are non-zero (missing values were filled with 0 upstream).
has_both_targets = (df_data.GRS_COST != 0) & (df_data.GCS_COST != 0)
df_working = df_data.loc[has_both_targets].copy()
df_working.describe()

Unnamed: 0,DIRECT_COST,DIV_00_DIRECT_COST,DIV_01_DIRECT_COST,DIV_02_DIRECT_COST,DIV_03_DIRECT_COST,DIV_04_DIRECT_COST,DIV_05_DIRECT_COST,DIV_06_DIRECT_COST,DIV_07_DIRECT_COST,DIV_08_DIRECT_COST,...,DIV_26_DIRECT_COST,DIV_27_DIRECT_COST,DIV_28_DIRECT_COST,DIV_31_DIRECT_COST,DIV_32_DIRECT_COST,DIV_33_DIRECT_COST,DIV_34_DIRECT_COST,DIV_55_DIRECT_COST,GCS_COST,GRS_COST
count,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,...,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0
mean,2753305.18,15416.14,1095.3,120305.74,305087.19,25975.15,192784.27,83875.74,97562.86,262156.35,...,279623.43,22192.54,7159.77,84671.51,18546.22,19750.99,505.83,1710.05,159975.26,130762.61
std,18205353.59,247803.94,21856.71,835518.53,2456075.21,255679.11,1926522.1,479208.15,712596.43,2347850.24,...,2283423.57,355302.65,98050.42,852341.42,200217.24,315920.99,14397.44,98425.42,991159.86,893014.97
min,-3404153.0,-327264.32,0.0,-41282.56,-4315458.2,-164664.12,-1398362.17,-174550.72,-16197.24,-407656.41,...,-104.77,0.0,0.0,-2322.1,-11.5,-53330.53,-4.2,0.0,-3335087.0,-629785.0
25%,4410.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,554.25,305.75
50%,31937.0,0.0,0.0,889.22,0.0,0.0,0.0,353.55,0.0,37.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4763.0,2099.0
75%,279720.25,0.0,0.0,10486.36,1041.34,0.0,180.5,10992.53,44.95,9230.26,...,3893.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35369.5,16783.0
max,399570438.0,8618727.16,762087.69,26715320.64,49560386.64,9635075.36,51850542.49,9711287.61,19337427.69,56887683.67,...,57604860.12,14904687.98,2706150.04,19786153.9,6898015.6,10785799.69,607500.0,5681453.04,23749853.0,18182714.0


In [5]:
# Positional split: the last two columns are the targets (second-to-last -> y_gcs,
# last -> y_grs); everything before them is the feature matrix.
n_feature_cols = df_working.shape[1] - 2
X = df_working.iloc[:, :n_feature_cols]
y_gcs = df_working.iloc[:, -2].to_numpy()
y_grs = df_working.iloc[:, -1].to_numpy()
y_grs.shape

(3332,)

In [6]:
# Split features and both targets in one call so all three share the same row partition.
(
    X_train, X_test,
    y_grs_train, y_grs_test,
    y_gcs_train, y_gcs_test,
) = train_test_split(X, y_grs, y_gcs, test_size=0.33, random_state=42)

In [43]:
# Repeated 10-fold cross-validation (3 repeats, fixed seed) shared by both grid searches.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# pipeline setup
# 'scaler' is a placeholder step; the grid below substitutes the concrete scalers to compare.
# random_state is pinned on the regressor because subsample < 1.0 makes every fit stochastic;
# without a seed, reruns with identical settings produce different scores.
pipeline = Pipeline([
    ('scaler', None),
    ('reg', GradientBoostingRegressor(
        min_samples_split=19,
        n_estimators=100,
        subsample=0.6500000000000001,
        random_state=42,
    )),
])

# Search space. Commented-out entries record alternatives that were tried earlier.
parameters = {
    # 'scaler':  [MinMaxScaler(), RobustScaler(), Normalizer(), QuantileTransformer()]
    'scaler': [MinMaxScaler(), QuantileTransformer()],
    'reg__alpha': [0.75],
    'reg__learning_rate': [0.01],
    # 'reg__learning_rate':  [0.0001, 0.001, 0.01, 0.1, 1.0] #this makes it worse
    'reg__loss': ['huber'],
    # 'reg__max_depth':  [6]
    'reg__max_depth': [3, 6, 7, 9],
    'reg__max_features': [0.8],
    'reg__min_samples_leaf': [1],
    'reg__min_samples_split': [19],
    'reg__n_estimators': [100],
    # 'reg__n_estimators':  [10, 50, 100, 500]
    'reg__subsample': [0.6500000000000001],
    # 'reg__subsample':  [0.5, 0.6500000000000001, 0.7, 1.0]
}


def _build_grid_search():
    """Return an unfitted GridSearchCV over ``pipeline`` and ``parameters``.

    Every candidate is scored on R2 and on the project's negated median absolute
    error. The final model is chosen by ``utils.refit_strategy``, a project helper
    whose selection rule is not visible in this notebook.
    """
    return GridSearchCV(
        pipeline,
        parameters,
        cv=cv,
        scoring={
            'R2': make_scorer(r2_score),
            'negMedAE': make_scorer(utils.neg_median_absolute_error),
        },
        # refit='R2',
        refit=utils.refit_strategy,
        return_train_score=False,
        n_jobs=-2,
    )


# grs model
grs_grid = _build_grid_search()

# gcs model
gcs_grid = _build_grid_search()

In [None]:
# Candidate feature counts 1..n_features, built exactly and in ascending order
# (no float round-trip through linspace, no reliance on set iteration order).
list(range(1, X.shape[1] + 1))

In [44]:
# Run the GRS grid search on the training split; the result is stored back in grs_grid.
grs_grid = grs_grid.fit(X_train, y_grs_train)

All grid-search results:
R2: 0.166 (±0.109), negMedAE: -1757.275 (±160.764), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 3, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': MinMaxScaler()}
R2: 0.156 (±0.099), negMedAE: -1780.943 (±166.706), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 3, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': QuantileTransformer()}
R2: 0.291 (±0.150), negMedAE: -1867.362 (±260.909), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 6, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': MinMaxScaler()}
R2

In [40]:
# Run the GCS grid search on the training split; the result is stored back in gcs_grid.
gcs_grid = gcs_grid.fit(X_train, y_gcs_train)

All grid-search results:
R2: 0.244 (±0.137), negMedAE: -3499.760 (±441.367), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 6, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': MinMaxScaler()}
R2: 0.244 (±0.134), negMedAE: -3574.778 (±505.361), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 6, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': QuantileTransformer()}

Models with an R2 higher than 0.14448255148466965:
R2: 0.244 (±0.137), negMedAE: -3499.760 (±441.367), for {'reg__alpha': 0.75, 'reg__learning_rate': 0.01, 'reg__loss': 'huber', 'reg__max_depth': 6, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsampl

In [41]:
# Score the refit GRS model on the held-out split.
y_grs_test_pred = grs_grid.best_estimator_.predict(X_test)
grs_test_r2 = r2_score(y_grs_test, y_grs_test_pred)
grs_test_neg_medae = utils.neg_median_absolute_error(y_grs_test, y_grs_test_pred)
print(f'R2 for test grs data: {grs_test_r2}')
print(f'negMedAE for test grs data: {grs_test_neg_medae}')

R2 for test grs data: 0.29795144973329835
negMedAE for test grs data: -2201.306725798465


In [42]:
# Score the refit GCS model on the held-out split.
y_gcs_test_pred = gcs_grid.best_estimator_.predict(X_test)
gcs_test_r2 = r2_score(y_gcs_test, y_gcs_test_pred)
gcs_test_neg_medae = utils.neg_median_absolute_error(y_gcs_test, y_gcs_test_pred)
print(f'R2 for test gcs data: {gcs_test_r2}')
print(f'negMedAE for test gcs data: {gcs_test_neg_medae}')

R2 for test gcs data: 0.20663506955134714
negMedAE for test gcs data: -4294.055925788829


In [None]:
from sklearn.base import clone

# Fit an unfitted copy of the best GCS pipeline on the test split. Cloning keeps
# gcs_grid.best_estimator_ (trained on the training split) untouched, so the held-out
# metrics stay valid if other cells are rerun afterwards.
gcs_grid_test = clone(gcs_grid.best_estimator_).fit(X_test, y_gcs_test)

In [25]:
# Report the selected pipeline and its hyper-parameters for each target.
for target_label, fitted_grid in (('grs', grs_grid), ('gcs', gcs_grid)):
    print(f"the best {target_label} estimator is \n {fitted_grid.best_estimator_} ")
    print(f"the best {target_label} parameters are \n {fitted_grid.best_params_}")

the best grs estimator is 
 Pipeline(steps=[('scaler', QuantileTransformer()), ('kbest', SelectKBest(k=28)),
                ('reg',
                 GradientBoostingRegressor(alpha=0.75, loss='huber',
                                           max_depth=6, max_features=0.8,
                                           min_samples_split=19,
                                           subsample=0.6500000000000001))]) 
the best grs parameters are 
 {'kbest__k': 28, 'reg__alpha': 0.75, 'reg__learning_rate': 0.1, 'reg__loss': 'huber', 'reg__max_depth': 6, 'reg__max_features': 0.8, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 19, 'reg__n_estimators': 100, 'reg__subsample': 0.6500000000000001, 'scaler': QuantileTransformer()}
the best gcs estimator is 
 Pipeline(steps=[('scaler', MinMaxScaler()), ('kbest', SelectKBest(k=25)),
                ('reg',
                 GradientBoostingRegressor(alpha=0.75, loss='huber',
                                           max_depth=6, max_features=

In [None]:
# Refit the selected GRS pipeline on the full working set and capture its predictions.
# NOTE(review): grs_best_pipe is the same object as grs_grid.best_estimator_, so both fits
# below retrain the grid's best estimator in place (test rows included) — confirm intended.
grs_best_pipe = grs_grid.best_estimator_
# Feature names emitted by every step except the final regressor, after fitting on all of X.
grs_mask = list(grs_best_pipe.fit(X,y_grs)[:-1].get_feature_names_out())
# Second fit uses only those columns; grs_model is the same pipeline object again.
grs_model = grs_best_pipe.fit(df_working[grs_mask],y_grs)
# In-sample predictions (the model was fit on these same rows).
grs_predictions = grs_model.predict(df_working[grs_mask])

In [None]:
# Refit the selected GCS pipeline on the full working set and capture its predictions.
# NOTE(review): gcs_best_pipe is the same object as gcs_grid.best_estimator_, so both fits
# below retrain the grid's best estimator in place (test rows included) — confirm intended.
gcs_best_pipe = gcs_grid.best_estimator_
# Feature names emitted by every step except the final regressor, after fitting on all of X.
gcs_mask = list(gcs_best_pipe.fit(X,y_gcs)[:-1].get_feature_names_out())
# Second fit uses only those columns; gcs_model is the same pipeline object again.
gcs_model = gcs_best_pipe.fit(df_working[gcs_mask],y_gcs)
# In-sample predictions (the model was fit on these same rows).
gcs_predictions = gcs_model.predict(df_working[gcs_mask])

In [None]:
# Show the feature names produced by the GRS pipeline's steps ahead of the final regressor.
list(grs_model[:-1].get_feature_names_out())

In [None]:
# Column lists each fitted model was trained on.
grs_parameters = list(df_working[grs_mask].columns)
gcs_parameters = list(df_working[gcs_mask].columns)

# Order-preserving union of both column lists. A plain set() would give a column order
# that can differ between interpreter runs.
combined_mask = list(dict.fromkeys(grs_parameters + gcs_parameters))

# Reference frame shipped with the models: input columns plus in-sample predictions.
df = df_working[combined_mask].copy()
df['GRS_PREDICTIONS'] = grs_predictions
df['GCS_PREDICTIONS'] = gcs_predictions

knnr_model_bag = {
    'df': df,
    'grs_model': grs_model,
    'grs_parameters': grs_parameters,
    'gcs_model': gcs_model,
    'gcs_parameters': gcs_parameters,
}

# NOTE(review): a later cell loads './app/model_bag.pkl', not this path — confirm which
# file the app actually reads.
with open('./app/knnr_model_bag.pkl', 'wb') as p:
    pickle.dump(knnr_model_bag, p, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load a previously saved model bag from disk.
# NOTE(review): pickle.load executes whatever the file encodes; only load files this project wrote.
with open('./app/model_bag.pkl','rb') as p:
    bag = pickle.load(p)

In [None]:
# Inspect which entries the loaded bag contains.
bag.keys()

In [None]:
# Column lists stored alongside each model in the bag.
grs_params = bag['grs_parameters']
gcs_params = bag['gcs_parameters']

# Order-preserving union, so the sampled row has the same column order on every run
# (set() ordering of strings can change between interpreter sessions).
all_params = list(dict.fromkeys(grs_params + gcs_params))

# One random row from the stored reference frame, restricted to the model inputs.
test_vec = bag['df'][all_params].sample(1).copy()

# Print the GRS input column names, one per line with a leading comma.
print(*test_vec[grs_params].columns, sep='\n,')

In [None]:
# Turn the single sampled row into a PARAMETERS/value table for easier reading.
# rename_axis returns a new frame, so the axis label is set without assigning to
# vec.index.names in place.
vec = (
    test_vec.reset_index(drop=True)
    .T
    .rename_axis('PARAMETERS')
    .reset_index()
)
vec.set_index('PARAMETERS').sort_index()

In [None]:
# Predict with exactly the GRS model's own columns, in their stored order. test_vec holds the
# union of GRS and GCS columns, which does not necessarily match what the model was fit on.
bag['grs_model'].predict(test_vec[bag['grs_parameters']])

In [None]:
# Per-feature scores and p-values from the pipeline's 'kbest' selection step.
# NOTE(review): this needs a pipeline with a 'kbest' step and a `num_features` value from
# an earlier cell; neither is defined in the cells visible here — confirm before running.
pip_steps = grs_grid.best_estimator_.named_steps['kbest']
raw_scores = np.asarray(pip_steps.scores_, dtype=float)
raw_pvalues = np.asarray(pip_steps.pvalues_, dtype=float)

# Rounded copies for printing only.
features_scores = ['%.2f' % elem for elem in raw_scores]
print("the features scores are \n {}".format(features_scores))

feature_scores_pvalues = ['%.3f' % elem for elem in raw_pvalues]
print("the feature_pvalues is \n {} ".format(feature_scores_pvalues))

# Keep the table numeric so ranking is by value; sorting the formatted strings would
# order them lexicographically (e.g. '9.00' ahead of '10.00').
scored_features = pd.DataFrame({
    'feature_names': df_working[grs_mask].columns,
    'feature_scores': raw_scores,
    'feature_scores_pvalues': raw_pvalues,
})
has_finite_score = np.isfinite(scored_features['feature_scores'])
scored_features = (
    scored_features.loc[has_finite_score]
    .sort_values(by='feature_scores', ascending=False)
    .iloc[:num_features]
)
scored_features

In [None]:
# Summary statistics for the top-ranked features kept in scored_features.
selected_features = scored_features.feature_names.to_list()
df_working[selected_features].describe()

In [None]:
# Write the model to disk; the with-block guarantees the file is flushed and closed.
with open('grs_model.pkl', 'wb') as model_file:
    pickle.dump(neigh, model_file)

In [None]:
# Read back the first pickled object from the file and close the handle afterwards.
with open('grs_model.pkl', 'rb') as model_file:
    grs_model = pickle.load(model_file)

In [None]:
# Append the predictions frame as a second pickle record in the same file
# (append mode keeps the model already stored there); the handle is closed on exit.
with open('grs_model.pkl', 'ab+') as model_file:
    pickle.dump(data_preds, model_file)

In [None]:
def _read_pickle_records(path):
    """Return every object pickled back-to-back in the file at ``path``, in file order."""
    records = []
    with open(path, 'rb') as fr:
        while True:
            try:
                records.append(pickle.load(fr))
            except EOFError:
                # End of file: every appended record has been read.
                break
    return records


grs_data = _read_pickle_records('./app/grs_model.pkl')
gcs_data = _read_pickle_records('./app/gcs_model.pkl')

In [None]:
# Each list must hold exactly two records (model first, predictions second);
# any other length makes the unpacking raise ValueError.
grs_model, grs_preds = grs_data
gcs_model, gcs_preds = gcs_data

In [None]:
# Figure size is given in pixels and converted to inches at 100 dpi (height = 0.8 * width).
graphWidth = 1500
graphHeight = graphWidth * 800 / 1000
x_plot = data_preds.DIRECT_COST
y1_plot = data_preds.GRS_ACTUAL
y2_plot = data_preds.GRS_PREDICTIONS

f, axes = plt.subplots(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
# Actuals in green, predictions in the default colour, both faint so overlaps stay visible.
for series, line_style in ((y1_plot, {'c': 'g'}), (y2_plot, {})):
    axes.plot(x_plot, series, alpha=0.15, **line_style)
# Highlight the single point of interest as a red diamond.
axes.scatter(direct_cost, grs_cost, c='r', marker='D')
plt.show()