best tuned settings:

    *  R2 for test grs data: 0.8252644569578274

    *  negMedAE for test grs data: -5597.125

    *  R2 for test gcs data: 0.28250792443626493

    *  negMedAE for test gcs data: -8657.5

In [2]:
from helpers import utils
from os.path import join, dirname
from dotenv import load_dotenv
import os
import pickle
from snowflake import connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
from scipy.optimize import curve_fit
from sklearn.preprocessing import Normalizer, QuantileTransformer, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import AdaBoostRegressor
from tpot.builtins import StackingEstimator
from sklearn.metrics import r2_score, make_scorer


# Render floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Load the Snowflake settings from the app's .env file.
# The path is assembled from its components with os.path.join so that the
# separator matches the host OS; the previous hard-coded backslash string only
# resolved to the app folder on Windows (on POSIX, dirname() of it is '').
dotenv_path = join('streamlit_grs_fit', 'app', '.env')
load_dotenv(dotenv_path)

# Connection settings read from the environment; os.getenv yields None for any
# variable that is not set.
SF_ACCOUNT = os.getenv('SF_ACCOUNT')
SF_USER = os.getenv('SF_USER')
SF_PASSWORD = os.getenv('SF_PASSWORD')
SF_ROLE = os.getenv('SF_ROLE')
SF_WAREHOUSE = os.getenv('SF_WAREHOUSE')
SF_DATABASE = os.getenv('SF_DATABASE')
SF_SCHEMA = os.getenv('SF_SCHEMA')

def load_data(query):
    """Run ``query`` on Snowflake and return the whole result set.

    The connection is opened with the module-level ``SF_*`` settings. Both the
    cursor and the connection are closed before returning, including when the
    query raises, so repeated calls do not leave sessions open.

    Args:
        query: SQL text handed to ``cursor.execute``.

    Returns:
        Whatever ``fetch_pandas_all()`` returns for the executed query.
    """
    conn = connector.connect(
        user=SF_USER,
        password=SF_PASSWORD,
        account=SF_ACCOUNT,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        schema=SF_SCHEMA,
        role=SF_ROLE,
    )
    try:
        cur = conn.cursor()
        try:
            return cur.execute(query).fetch_pandas_all()
        finally:
            cur.close()
    finally:
        conn.close()



In [5]:
# Division codes that have a DIV_<code>_DIRECT_COST column in the source table.
_division_codes = [*range(20), 21, 22, 23, 26, 27, 28, 31, 32, 33, 34, 55]

# Column order matters: later cells slice features/targets by position, with
# GCS_COST and GRS_COST as the last two columns.
_select_columns = [
    'JOB',
    'DIRECT_COST',
    *(f'DIV_{code:02d}_DIRECT_COST' for code in _division_codes),
    'GCS_COST',
    'GRS_COST',
]
query = 'select ' + ','.join(_select_columns) + ' from sandbox.global.ml_grs_fit '

# Pull the table, index it by job, and treat missing values as zero.
df_data = pd.DataFrame(load_data(query).set_index('JOB')).fillna(0)

In [6]:
# Keep only the jobs where both targets are non-zero, then summarise them.
_both_targets_nonzero = df_data.GRS_COST.ne(0) & df_data.GCS_COST.ne(0)
df_working = df_data.loc[_both_targets_nonzero].copy()
df_working.describe()

Unnamed: 0,DIRECT_COST,DIV_00_DIRECT_COST,DIV_01_DIRECT_COST,DIV_02_DIRECT_COST,DIV_03_DIRECT_COST,DIV_04_DIRECT_COST,DIV_05_DIRECT_COST,DIV_06_DIRECT_COST,DIV_07_DIRECT_COST,DIV_08_DIRECT_COST,...,DIV_26_DIRECT_COST,DIV_27_DIRECT_COST,DIV_28_DIRECT_COST,DIV_31_DIRECT_COST,DIV_32_DIRECT_COST,DIV_33_DIRECT_COST,DIV_34_DIRECT_COST,DIV_55_DIRECT_COST,GCS_COST,GRS_COST
count,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,...,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0
mean,2753305.18,15416.14,1095.3,120305.74,305087.19,25975.15,192784.27,83875.74,97562.86,262156.35,...,279623.43,22192.54,7159.77,84671.51,18546.22,19750.99,505.83,1710.05,159975.26,130762.61
std,18205353.59,247803.94,21856.71,835518.53,2456075.21,255679.11,1926522.1,479208.15,712596.43,2347850.24,...,2283423.57,355302.65,98050.42,852341.42,200217.24,315920.99,14397.44,98425.42,991159.86,893014.97
min,-3404153.0,-327264.32,0.0,-41282.56,-4315458.2,-164664.12,-1398362.17,-174550.72,-16197.24,-407656.41,...,-104.77,0.0,0.0,-2322.1,-11.5,-53330.53,-4.2,0.0,-3335087.0,-629785.0
25%,4410.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,554.25,305.75
50%,31937.0,0.0,0.0,889.22,0.0,0.0,0.0,353.55,0.0,37.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4763.0,2099.0
75%,279720.25,0.0,0.0,10486.36,1041.34,0.0,180.5,10992.53,44.95,9230.26,...,3893.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35369.5,16783.0
max,399570438.0,8618727.16,762087.69,26715320.64,49560386.64,9635075.36,51850542.49,9711287.61,19337427.69,56887683.67,...,57604860.12,14904687.98,2706150.04,19786153.9,6898015.6,10785799.69,607500.0,5681453.04,23749853.0,18182714.0


In [7]:
# Features are every column except the last two; the targets are the
# second-to-last (GCS) and last (GRS) columns, each as a flat 1-D array.
X = df_working.iloc[:, :-2]
y_gcs = df_working.iloc[:, -2].to_numpy()
y_grs = df_working.iloc[:, -1].to_numpy()
y_grs.shape

(3332,)

In [8]:
# Split once, passing both targets together, so the feature rows and both
# target vectors are guaranteed to share one train/test partition. Two separate
# calls only stayed aligned because they happened to use the same seed, and the
# second call silently overwrote X_train / X_test.
(X_train, X_test,
 y_grs_train, y_grs_test,
 y_gcs_train, y_gcs_test) = train_test_split(
    X, y_grs, y_gcs, test_size=0.33, random_state=42
)

In [9]:
# Cross-validation splitter used by both searches: 10 folds, repeated 3 times,
# with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# Pipeline template: scaler placeholder (filled in by the grid) -> univariate
# feature selection -> k-nearest-neighbours regressor. The stacking step is
# left commented out.
pipeline = Pipeline(steps=[
    ('scaler', None),
    ('kbest', SelectKBest(f_regression)),
    # ('stack_reg_one', StackingEstimator(estimator=AdaBoostRegressor())),
    ('regressor', KNeighborsRegressor()),
])

# Grid explored by the search; only the neighbour count actually varies.
parameters = {
    'scaler': [RobustScaler()],
    'kbest__k': [6],
    # 'stack_reg_one__estimator__learning_rate': [0.04, 0.1],
    # 'stack_reg_one__estimator__loss': ['linear', 'exponential'],
    # 'stack_reg_one__estimator__n_estimators': [100, 150],
    'regressor__n_neighbors': list(range(1, 10)),
}


def _new_grid_search():
    """Return a fresh, unfitted GridSearchCV over ``pipeline`` and ``parameters``."""
    return GridSearchCV(
        pipeline,
        parameters,
        cv=cv,
        scoring={
            'R2': make_scorer(r2_score),
            'negMedAE': make_scorer(utils.neg_median_absolute_error),
        },
        refit=utils.refit_strategy,
        return_train_score=False,
        n_jobs=-1,
    )


# One identically configured search per target.
grs_grid = _new_grid_search()  # grs model
gcs_grid = _new_grid_search()  # gcs model

In [10]:
# Run the grid search for the GRS target on the training split. The result is
# assigned back to the same name, which also keeps the notebook from echoing it.
grs_grid = grs_grid.fit(X_train, y_grs_train)

All grid-search results:
R2: 0.493 (±0.417), negMedAE: -3497.717 (±687.788), for {'kbest__k': 6, 'regressor__n_neighbors': 1, 'scaler': RobustScaler()}
R2: 0.613 (±0.265), negMedAE: -3861.083 (±744.402), for {'kbest__k': 6, 'regressor__n_neighbors': 2, 'scaler': RobustScaler()}
R2: 0.608 (±0.274), negMedAE: -3853.467 (±640.043), for {'kbest__k': 6, 'regressor__n_neighbors': 3, 'scaler': RobustScaler()}
R2: 0.545 (±0.316), negMedAE: -4391.246 (±878.776), for {'kbest__k': 6, 'regressor__n_neighbors': 4, 'scaler': RobustScaler()}
R2: 0.565 (±0.298), negMedAE: -4311.093 (±870.560), for {'kbest__k': 6, 'regressor__n_neighbors': 5, 'scaler': RobustScaler()}
R2: 0.599 (±0.253), negMedAE: -4484.194 (±791.459), for {'kbest__k': 6, 'regressor__n_neighbors': 6, 'scaler': RobustScaler()}
R2: 0.605 (±0.253), negMedAE: -4544.843 (±684.953), for {'kbest__k': 6, 'regressor__n_neighbors': 7, 'scaler': RobustScaler()}
R2: 0.613 (±0.243), negMedAE: -4589.194 (±656.576), for {'kbest__k': 6, 'regressor__n_

In [11]:
# Run the grid search for the GCS target on the training split. The result is
# assigned back to the same name, which also keeps the notebook from echoing it.
gcs_grid = gcs_grid.fit(X_train, y_gcs_train)

All grid-search results:
R2: -0.228 (±1.899), negMedAE: -6704.533 (±1354.627), for {'kbest__k': 6, 'regressor__n_neighbors': 1, 'scaler': RobustScaler()}
R2: 0.368 (±0.525), negMedAE: -6922.042 (±1200.015), for {'kbest__k': 6, 'regressor__n_neighbors': 2, 'scaler': RobustScaler()}
R2: 0.324 (±0.470), negMedAE: -7079.428 (±1175.231), for {'kbest__k': 6, 'regressor__n_neighbors': 3, 'scaler': RobustScaler()}
R2: 0.194 (±0.603), negMedAE: -7339.642 (±1310.903), for {'kbest__k': 6, 'regressor__n_neighbors': 4, 'scaler': RobustScaler()}
R2: 0.313 (±0.417), negMedAE: -7663.660 (±1387.939), for {'kbest__k': 6, 'regressor__n_neighbors': 5, 'scaler': RobustScaler()}
R2: 0.369 (±0.324), negMedAE: -8034.397 (±1431.724), for {'kbest__k': 6, 'regressor__n_neighbors': 6, 'scaler': RobustScaler()}
R2: 0.405 (±0.274), negMedAE: -8344.264 (±1740.620), for {'kbest__k': 6, 'regressor__n_neighbors': 7, 'scaler': RobustScaler()}
R2: 0.404 (±0.286), negMedAE: -8450.873 (±1474.054), for {'kbest__k': 6, 'regr

In [12]:
# Score the tuned GRS pipeline on the held-out split.
y_grs_test_pred = grs_grid.best_estimator_.predict(X_test)
_grs_test_r2 = r2_score(y_grs_test, y_grs_test_pred)
_grs_test_neg_medae = utils.neg_median_absolute_error(y_grs_test, y_grs_test_pred)
print(f'R2 for test grs data: {_grs_test_r2}')
print(f'negMedAE for test grs data: {_grs_test_neg_medae}')

R2 for test grs data: 0.6007433160709761
negMedAE for test grs data: -5319.5


In [13]:
# Score the tuned GCS pipeline on the held-out split.
y_gcs_test_pred = gcs_grid.best_estimator_.predict(X_test)
_gcs_test_r2 = r2_score(y_gcs_test, y_gcs_test_pred)
_gcs_test_neg_medae = utils.neg_median_absolute_error(y_gcs_test, y_gcs_test_pred)
print(f'R2 for test gcs data: {_gcs_test_r2}')
print(f'negMedAE for test gcs data: {_gcs_test_neg_medae}')

R2 for test gcs data: 0.426733313885705
negMedAE for test gcs data: -8303.928571428572


In [None]:
# Fit an unfitted copy of the tuned GCS pipeline on the held-out split.
# Calling fit() directly on gcs_grid.best_estimator_ would refit that very
# object on the test data, so every later use of the "best" estimator would
# silently be a model trained on the test set.
from sklearn.base import clone

gcs_grid_test = clone(gcs_grid.best_estimator_).fit(X_test, y_gcs_test)

In [14]:
# Report the winning pipeline and hyper-parameters for each target.
for _label, _search in (('grs', grs_grid), ('gcs', gcs_grid)):
    print(f"the best {_label} estimator is \n {_search.best_estimator_} ")
    print(f"the best {_label} parameters are \n {_search.best_params_}")

the best grs estimator is 
 Pipeline(steps=[('scaler', RobustScaler()),
                ('kbest',
                 SelectKBest(k=6,
                             score_func=<function f_regression at 0x000001F4893FD700>)),
                ('regressor', KNeighborsRegressor(n_neighbors=2))]) 
the best grs parameters are 
 {'kbest__k': 6, 'regressor__n_neighbors': 2, 'scaler': RobustScaler()}
the best gcs estimator is 
 Pipeline(steps=[('scaler', RobustScaler()),
                ('kbest',
                 SelectKBest(k=6,
                             score_func=<function f_regression at 0x000001F4893FD700>)),
                ('regressor', KNeighborsRegressor(n_neighbors=7))]) 
the best gcs parameters are 
 {'kbest__k': 6, 'regressor__n_neighbors': 7, 'scaler': RobustScaler()}


In [15]:
# NOTE: this is an alias of grs_grid.best_estimator_, not a copy, so both fit()
# calls below refit the search's best estimator in place.
grs_best_pipe = grs_grid.best_estimator_
# Fit on every candidate column, then ask the pipeline without its final step
# for the names of the features that come out of the scaler + selector.
grs_mask = list(grs_best_pipe.fit(X,y_grs)[:-1].get_feature_names_out())
# Refit the same pipeline object on just those columns; grs_model is the same
# object as grs_best_pipe.
grs_model = grs_best_pipe.fit(df_working[grs_mask],y_grs)
# Predictions for the rows the model was just fitted on.
grs_predictions = grs_model.predict(df_working[grs_mask])

In [16]:
# Show the scaler + selector steps as fitted on the full feature table.
# A clone is fitted for the display: grs_best_pipe is the same object as
# grs_model, so refitting it here on all of X would leave grs_model expecting
# every column instead of the grs_mask columns it is later used with.
from sklearn.base import clone

clone(grs_best_pipe).fit(X, y_grs)[:-1]

In [None]:
# NOTE: this is an alias of gcs_grid.best_estimator_, not a copy, so both fit()
# calls below refit the search's best estimator in place.
gcs_best_pipe = gcs_grid.best_estimator_
# Fit on every candidate column, then ask the pipeline without its final step
# for the names of the features that come out of the scaler + selector.
gcs_mask = list(gcs_best_pipe.fit(X,y_gcs)[:-1].get_feature_names_out())
# Refit the same pipeline object on just those columns; gcs_model is the same
# object as gcs_best_pipe.
gcs_model = gcs_best_pipe.fit(df_working[gcs_mask],y_gcs)
# Predictions for the rows the model was just fitted on.
gcs_predictions = gcs_model.predict(df_working[gcs_mask])

In [None]:
# Feature names produced by the GRS pipeline without its final (regressor) step,
# in its current fitted state.
list(grs_model[:-1].get_feature_names_out())

In [None]:
# Column names each model is fed.
grs_parameters = df_working[grs_mask].columns.tolist()
gcs_parameters = df_working[gcs_mask].columns.tolist()

# Union of both column lists (set-based, so the order is arbitrary).
combined_mask = [*set(grs_parameters + gcs_parameters)]

# Feature columns used by either model plus each model's predictions.
df = df_working[combined_mask].assign(
    GRS_PREDICTIONS=grs_predictions,
    GCS_PREDICTIONS=gcs_predictions,
)

# Everything the app needs, bundled into one picklable dict.
knnr_model_bag = dict(
    df=df,
    grs_model=grs_model,
    grs_parameters=grs_parameters,
    gcs_model=gcs_model,
    gcs_parameters=gcs_parameters,
)

# A negative protocol means "highest available"; spelled out here by name.
with open('./app/knnr_model_bag.pkl', 'wb') as bag_file:
    pickle.dump(knnr_model_bag, bag_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load a previously saved model bag from disk.
# NOTE(review): this reads './app/model_bag.pkl', whereas the cell above writes
# './app/knnr_model_bag.pkl' -- confirm this is the intended file.
# pickle.load executes code embedded in the file; only open files you trust.
with open('./app/model_bag.pkl','rb') as p:
    bag = pickle.load(p)

In [None]:
# List the entries stored in the loaded bag.
bag.keys()

In [None]:
# Column lists stored in the bag for each model.
grs_params, gcs_params = bag['grs_parameters'], bag['gcs_parameters']
# Union of both lists (set-based, so the order is arbitrary).
all_params = [*set(grs_params + gcs_params)]
# One randomly drawn row of the stored frame, restricted to those columns.
test_vec = bag['df'].loc[:, all_params].sample(n=1).copy()
# Scratch checks kept for reference:
# bag['grs_model'].predict(test_vec)
# model = bag['grs_model']
# list(model[:-1].get_feature_names_out())
# Print the GRS column names one per line, each after the first led by a comma.
print(*test_vec[grs_params].columns, sep='\n,')

In [None]:
# Turn the single sampled row into a two-column table: one line per parameter,
# with the parameter name in a 'PARAMETERS' column.
vec = (
    test_vec
    .reset_index(drop=True)
    .T
    .rename_axis('PARAMETERS')
    .reset_index()
)
# Display it alphabetically by parameter name.
vec.set_index('PARAMETERS').sort_index()

In [None]:
# Predict GRS for the sampled row. test_vec holds the union of the GRS and GCS
# columns in arbitrary (set) order, so select the GRS model's own columns, in
# the stored order, before predicting.
bag['grs_model'].predict(test_vec[grs_params])

In [None]:
# get the features scores rounded in 2 decimals
# Univariate feature scores from the SelectKBest step of the tuned GRS pipeline.
_grs_pipe = grs_grid.best_estimator_
pip_steps = _grs_pipe.named_steps['kbest']

# Names of the columns the selector scored. They are read from the fitted
# scaler (the first step, fitted on a DataFrame) so they always line up
# one-to-one with scores_/pvalues_, whichever column set the pipeline was last
# fitted on.
_scored_names = list(_grs_pipe.named_steps['scaler'].feature_names_in_)

# get the features scores rounded in 2 decimals
features_scores = ['%.2f' % elem for elem in pip_steps.scores_]
print("the features scores are \n {}".format(features_scores))

feature_scores_pvalues = ['%.3f' % elem for elem in pip_steps.pvalues_]
print("the feature_pvalues is \n {} ".format(feature_scores_pvalues))

# num_features is not assigned anywhere else in the notebook; use the number of
# features the selector kept.
num_features = int(pip_steps.get_support().sum())

# Scores stay numeric so the ranking is by value; sorting the formatted strings
# would order them lexicographically (e.g. '9.10' ahead of '120.00').
scored_features = pd.DataFrame({
    'feature_names': _scored_names,
    'feature_scores': pip_steps.scores_,
    'feature_scores_pvalues': feature_scores_pvalues,
})
scored_features = (
    scored_features
    .loc[np.isfinite(scored_features['feature_scores'])]  # drop nan / inf scores
    .sort_values(by='feature_scores', ascending=False)
    .iloc[:num_features]
)
scored_features

In [None]:
# Summary statistics for the top-scoring feature columns.
selected_features = scored_features['feature_names'].tolist()
df_working.loc[:, selected_features].describe()

In [None]:
# Write the estimator to grs_model.pkl, replacing any existing file. The file is
# opened in a with-block so the handle is flushed and closed deterministically.
# NOTE(review): `neigh` is not defined in any visible cell -- confirm which
# estimator is meant to be saved here.
with open('grs_model.pkl', 'wb') as model_file:
    pickle.dump(neigh, model_file)

In [None]:
# Read back the first object stored in grs_model.pkl, closing the file when done.
# pickle.load executes code embedded in the file; only open files you trust.
with open('grs_model.pkl', 'rb') as model_file:
    grs_model = pickle.load(model_file)

In [None]:
# Append a second pickled object to grs_model.pkl, after whatever it already
# holds. The with-block guarantees the handle is flushed and closed.
# NOTE(review): `data_preds` is not defined in any visible cell.
with open('grs_model.pkl', 'ab+') as model_file:
    pickle.dump(data_preds, model_file)

In [None]:
def _read_pickle_stream(path):
    """Return, in order, every object pickled back-to-back in the file at ``path``."""
    loaded = []
    with open(path, 'rb') as stream:
        while True:
            try:
                loaded.append(pickle.load(stream))
            except EOFError:
                # Reached the end of the file: nothing more to read.
                return loaded


grs_data = _read_pickle_stream('./app/grs_model.pkl')
gcs_data = _read_pickle_stream('./app/gcs_model.pkl')

In [None]:
# Each list is unpacked into exactly two names, so each file must have yielded
# exactly two objects (otherwise this raises ValueError).
grs_model, grs_preds = grs_data
gcs_model, gcs_preds = gcs_data

In [None]:
# Figure size in pixels; converted to inches below using dpi=100.
graphWidth = 1500
graphHeight = graphWidth * 800 / 1000

# Series to draw: actual and predicted GRS against direct cost.
x_plot, y1_plot, y2_plot = (
    data_preds.DIRECT_COST,
    data_preds.GRS_ACTUAL,
    data_preds.GRS_PREDICTIONS,
)

f, axes = plt.subplots(
    figsize=(graphWidth / 100.0, graphHeight / 100.0),
    dpi=100,
)
axes.plot(x_plot, y1_plot, c='g', alpha=0.15)   # actuals, faint green
axes.plot(x_plot, y2_plot, alpha=0.15)          # predictions, default colour
# Overlay the individual point(s) as red diamonds.
axes.scatter(direct_cost, grs_cost, c='r', marker='D')
plt.show()