In [1]:
from os.path import join, dirname
from dotenv import load_dotenv
import os
import pickle
from snowflake import connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
from scipy.optimize import curve_fit
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import r2_score
# from sklearn.preprocessing import Normalizer, QuantileTransformer, RobustScaler, PolynomialFeatures
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import LinearRegression, Lasso, Ridge
# from sklearn.svm import SVR
# from sklearn.neural_network import MLPRegressor
# from sklearn.feature_selection import SelectKBest, f_classif
# from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, median_absolute_error, max_error, make_scorer


# Show floats with thousands separators and two decimals in DataFrame output.
pd.options.display.float_format = '{:,.2f}'.format

# get environment variables
# Build the .env location from path components rather than a hard-coded
# backslash string: backslashes are only separators on Windows, so the old
# dirname('streamlit_grs_fit\\app\\') collapsed to '' on other platforms and
# pointed at the wrong file. On Windows the resulting path is unchanged.
dotenv_path = join('streamlit_grs_fit', 'app', '.env')
load_dotenv(dotenv_path)

# Snowflake connection settings; os.getenv yields None for any unset variable.
SF_ACCOUNT = os.getenv('SF_ACCOUNT')
SF_USER = os.getenv('SF_USER')
SF_PASSWORD = os.getenv('SF_PASSWORD')
SF_ROLE = os.getenv('SF_ROLE')
SF_WAREHOUSE = os.getenv('SF_WAREHOUSE')
SF_DATABASE = os.getenv('SF_DATABASE')
SF_SCHEMA = os.getenv('SF_SCHEMA')

def load_data(query):
    """Run ``query`` against Snowflake and return the result as a DataFrame.

    Connection settings are taken from the module-level ``SF_*`` values that
    were read from the environment.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``cursor.execute``.

    Returns
    -------
    pandas.DataFrame
        The full result set, as produced by ``fetch_pandas_all()``.
    """
    conn = connector.connect(
        user=SF_USER,
        password=SF_PASSWORD,
        account=SF_ACCOUNT,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        schema=SF_SCHEMA,
        role=SF_ROLE,
    )
    try:
        cur = conn.cursor()
        try:
            return cur.execute(query).fetch_pandas_all()
        finally:
            cur.close()
    finally:
        # Release the session even when the query raises; previously every
        # call (and every failure) left an open connection behind.
        conn.close()



In [2]:
# Cost divisions present in the source table (the numbering is not contiguous).
division_codes = [
    '00', '01', '02', '03', '04', '05', '06', '07', '08', '09',
    '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
    '21', '22', '23', '26', '27', '28', '31', '32', '33', '34', '55',
]

# Column order matters: later cells pick features/targets by position, with
# GCS_COST and GRS_COST as the last two columns.
selected_columns = (
    ['JOB', 'DIRECT_COST']
    + ['DIV_' + code + '_DIRECT_COST' for code in division_codes]
    + ['GCS_COST', 'GRS_COST']
)
query = 'select ' + ','.join(selected_columns) + ' from sandbox.global.ml_grs_fit '

# Index by job and treat missing costs as zero.
df_data = pd.DataFrame(load_data(query).set_index('JOB')).fillna(0)

In [3]:
# Keep only jobs where both target costs are non-zero (missing values were
# filled with 0 upstream, so this also drops jobs lacking either target).
has_grs_cost = df_data.GRS_COST != 0
has_gcs_cost = df_data.GCS_COST != 0
df_working = df_data.loc[has_grs_cost & has_gcs_cost].copy()
df_working.describe()

Unnamed: 0,DIRECT_COST,DIV_00_DIRECT_COST,DIV_01_DIRECT_COST,DIV_02_DIRECT_COST,DIV_03_DIRECT_COST,DIV_04_DIRECT_COST,DIV_05_DIRECT_COST,DIV_06_DIRECT_COST,DIV_07_DIRECT_COST,DIV_08_DIRECT_COST,...,DIV_26_DIRECT_COST,DIV_27_DIRECT_COST,DIV_28_DIRECT_COST,DIV_31_DIRECT_COST,DIV_32_DIRECT_COST,DIV_33_DIRECT_COST,DIV_34_DIRECT_COST,DIV_55_DIRECT_COST,GCS_COST,GRS_COST
count,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,...,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0,3332.0
mean,2753305.18,15416.14,1095.3,120305.74,305087.19,25975.15,192784.27,83875.74,97562.86,262156.35,...,279623.43,22192.54,7159.77,84671.51,18546.22,19750.99,505.83,1710.05,159975.26,130762.61
std,18205353.59,247803.94,21856.71,835518.53,2456075.21,255679.11,1926522.1,479208.15,712596.43,2347850.24,...,2283423.57,355302.65,98050.42,852341.42,200217.24,315920.99,14397.44,98425.42,991159.86,893014.97
min,-3404153.0,-327264.32,0.0,-41282.56,-4315458.2,-164664.12,-1398362.17,-174550.72,-16197.24,-407656.41,...,-104.77,0.0,0.0,-2322.1,-11.5,-53330.53,-4.2,0.0,-3335087.0,-629785.0
25%,4410.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,554.25,305.75
50%,31937.0,0.0,0.0,889.22,0.0,0.0,0.0,353.55,0.0,37.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4763.0,2099.0
75%,279720.25,0.0,0.0,10486.36,1041.34,0.0,180.5,10992.53,44.95,9230.26,...,3893.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35369.5,16783.0
max,399570438.0,8618727.16,762087.69,26715320.64,49560386.64,9635075.36,51850542.49,9711287.61,19337427.69,56887683.67,...,57604860.12,14904687.98,2706150.04,19786153.9,6898015.6,10785799.69,607500.0,5681453.04,23749853.0,18182714.0


In [24]:
# Columns are selected by position: every column except the last two is a
# feature, the second-to-last is the GCS target and the last is the GRS target.
working_columns = df_working.columns
X = working_columns[:-2].tolist()
y_grs = working_columns[-1:].tolist()
y_gcs = working_columns[-2:-1].tolist()
y_gcs

['GCS_COST']

I've tried a number of estimators, and none has beaten the k-nearest neighbors regressor. We're going to change it up a bit now and use a genetic programming tool called TPOT to see how it does.

In [25]:
# 80/20 hold-out split for each target. Both calls use the same features and
# the same seed, so the GRS and GCS splits contain the same rows.
X_train, X_test, y_grs_train, y_grs_test = train_test_split(
    df_working[X],
    df_working[y_grs].values.ravel(),
    test_size=0.2,
    random_state=42,
)
X_gcs_train, X_gcs_test, y_gcs_train, y_gcs_test = train_test_split(
    df_working[X],
    df_working[y_gcs].values.ravel(),
    test_size=0.2,
    random_state=42,
)

In [12]:
# Baseline TPOT search for the GRS target: 5 generations of 50 pipelines.
# No `scoring` is passed, so TPOT's default regression metric applies.
tpot = TPOTRegressor(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_grs_train)

# Hold-out score of the best pipeline, then save that pipeline as a script.
grs_holdout_score = tpot.score(X_test, y_grs_test)
print(grs_holdout_score)
tpot.export('tpot_grs_pipeline.py')

                                                                              
Generation 1 - Current best internal CV score: -169620157638.27933
                                                                              
Generation 2 - Current best internal CV score: -169620157638.27933
                                                                              
Generation 3 - Current best internal CV score: -161401803546.80942
                                                                              
Generation 4 - Current best internal CV score: -156260656385.51758
                                                                              
Generation 5 - Current best internal CV score: -156260656385.51758
                                                                              
Best pipeline: GradientBoostingRegressor(MinMaxScaler(input_matrix), alpha=0.75, learning_rate=0.1, loss=huber, max_depth=6, max_features=0.8, min_samples_leaf=1, min_samples_split=19, n_esti

In [26]:
# Baseline TPOT search for the GCS target: 5 generations of 50 pipelines,
# using TPOT's default regression scoring.
tpot_gcs = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot_gcs.fit(X_gcs_train, y_gcs_train)
print(tpot_gcs.score(X_gcs_test, y_gcs_test))
# Export the GCS model fitted in this cell. The cell previously called
# tpot.export(...), which wrote the GRS pipeline under the GCS file name.
tpot_gcs.export('tpot_gcs_pipeline.py')

                                                                              
Generation 1 - Current best internal CV score: -454980662761.639
                                                                              
Generation 2 - Current best internal CV score: -448118718508.5226
                                                                              
Generation 3 - Current best internal CV score: -448118718508.5226
                                                                              
Generation 4 - Current best internal CV score: -448118718508.5226
                                                                              
Generation 5 - Current best internal CV score: -448118718508.5226
                                                                              
Best pipeline: XGBRegressor(ZeroCount(input_matrix), learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.45, verbosity=0)
-289694

In [None]:
# GRS search with a larger population (75) and repeated 10-fold CV (3 repeats)
# in place of TPOT's default cross-validation.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75 = TPOTRegressor(
    generations=5,
    population_size=75,
    verbosity=2,
    cv=cv,
    random_state=42,
    n_jobs=-2,
)
tpot_75.fit(X_train, y_grs_train)

# Hold-out score of the best pipeline, then save that pipeline as a script.
grs_75_holdout_score = tpot_75.score(X_test, y_grs_test)
print(grs_75_holdout_score)
tpot_75.export('tpot_75_grs_pipeline.py')

In [27]:
# GCS search with population 75 and repeated 10-fold CV (3 repeats), using
# TPOT's default regression scoring.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75_gcs = TPOTRegressor(generations=5, population_size=75, verbosity=2, cv=cv, random_state=42, n_jobs=-2)
tpot_75_gcs.fit(X_gcs_train, y_gcs_train)
print(tpot_75_gcs.score(X_gcs_test, y_gcs_test))
# Export the model fitted in this cell. The cell previously called
# tpot.export(...), which saved the unrelated baseline GRS pipeline.
tpot_75_gcs.export('tpot_75_gcs_pipeline.py')

                                                                               
Generation 1 - Current best internal CV score: -466665305372.89484
                                                                                
Generation 2 - Current best internal CV score: -466665305372.89484
                                                                              
Generation 3 - Current best internal CV score: -466665305372.89484
                                                                              
Generation 4 - Current best internal CV score: -465474810426.29193
                                                                              
Generation 5 - Current best internal CV score: -461777905481.6639
                                                             
Best pipeline: RandomForestRegressor(RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.55, min_samples_leaf=19, min_samples_split=14, n_estimators=100), bootstrap=False, max_features=0.1, m



In [22]:
# GRS search optimising R^2 directly: population 75, repeated 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75_r2 = TPOTRegressor(
    generations=5,
    population_size=75,
    scoring='r2',
    verbosity=2,
    cv=cv,
    random_state=42,
    n_jobs=-2,
)
tpot_75_r2.fit(X_train, y_grs_train)

# Hold-out R^2 of the best pipeline, then save that pipeline as a script.
grs_75_r2_holdout_score = tpot_75_r2.score(X_test, y_grs_test)
print(grs_75_r2_holdout_score)
tpot_75_r2.export('tpot_75_r2_grs_pipeline.py')

                                                                               
Generation 1 - Current best internal CV score: 0.7567428441911429
                                                                                
Generation 2 - Current best internal CV score: 0.7567428441911429
                                                                              
Generation 3 - Current best internal CV score: 0.7567428441911429
                                                                              
Generation 4 - Current best internal CV score: 0.7647658556084654
                                                                              
Generation 5 - Current best internal CV score: 0.7647658556084654
                                                                              
Best pipeline: ElasticNetCV(XGBRegressor(input_matrix, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.9500000000000001,

In [29]:
# GCS search optimising R^2 directly: population 75, repeated 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75_r2_gcs = TPOTRegressor(generations=5, population_size=75, scoring='r2', verbosity=2, cv=cv, random_state=42, n_jobs=-2)
tpot_75_r2_gcs.fit(X_gcs_train, y_gcs_train)
print(tpot_75_r2_gcs.score(X_gcs_test, y_gcs_test))
# Export the model fitted in this cell. The cell previously called
# tpot.export(...), which saved the unrelated baseline GRS pipeline.
tpot_75_r2_gcs.export('tpot_75_r2_gcs_pipeline.py')

                                                                               
Generation 1 - Current best internal CV score: 0.5945191543603052
                                                                              
Generation 2 - Current best internal CV score: 0.5986198913067237
                                                                              
Generation 3 - Current best internal CV score: 0.6008664219379748
                                                                              
Generation 4 - Current best internal CV score: 0.6008664219379748
                                                                              
Generation 5 - Current best internal CV score: 0.6008664219379748
                                                           
Best pipeline: RandomForestRegressor(SelectFwe(input_matrix, alpha=0.037), bootstrap=True, max_features=0.15000000000000002, min_samples_leaf=2, min_samples_split=3, n_estimators=100)
0.8551568838841415


In [23]:
# GRS search optimising negative *median* absolute error (the "negmae" in the
# variable and file names refers to that metric): population 75, repeated
# 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75_negmae = TPOTRegressor(generations=5, population_size=75, scoring='neg_median_absolute_error', verbosity=2, cv=cv, random_state=42, n_jobs=-2)
tpot_75_negmae.fit(X_train, y_grs_train)
print(tpot_75_negmae.score(X_test, y_grs_test))
# Export the model fitted in this cell. The cell previously called
# tpot_75_r2.export(...), which saved the R^2-optimised pipeline under the
# median-absolute-error file name.
tpot_75_negmae.export('tpot_75_negmae_grs_pipeline.py')

                                                                               
Generation 1 - Current best internal CV score: -2028.0356968607452
                                                                                
Generation 2 - Current best internal CV score: -1900.681771666977
                                                                              
Generation 3 - Current best internal CV score: -1900.681771666977
                                                                              
Generation 4 - Current best internal CV score: -1879.8812673367806
                                                                                
Generation 5 - Current best internal CV score: -1879.8812673367806
                                                             
Best pipeline: GradientBoostingRegressor(input_matrix, alpha=0.75, learning_rate=0.01, loss=huber, max_depth=5, max_features=0.55, min_samples_leaf=14, min_samples_split=17, n_estimators=100, subsample=0.3

In [30]:
# GCS search optimising negative *median* absolute error: population 75,
# repeated 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_75_negmae_gcs = TPOTRegressor(generations=5, population_size=75, scoring='neg_median_absolute_error', verbosity=2, cv=cv, random_state=42, n_jobs=-2)
tpot_75_negmae_gcs.fit(X_gcs_train, y_gcs_train)
print(tpot_75_negmae_gcs.score(X_gcs_test, y_gcs_test))
# Export the model fitted in this cell. The cell previously called
# tpot.export(...), which saved the unrelated baseline GRS pipeline.
tpot_75_negmae_gcs.export('tpot_75_negmae_gcs_pipeline.py')

                                                                               
Generation 1 - Current best internal CV score: -3682.918929821638
                                                                              
Generation 2 - Current best internal CV score: -3682.9075127652673
                                                                              
Generation 3 - Current best internal CV score: -3682.9075127652673
                                                                              
Generation 4 - Current best internal CV score: -3682.9075127652673
                                                                              
Generation 5 - Current best internal CV score: -3682.79982607396
                                                           
Best pipeline: LinearSVR(input_matrix, C=0.0001, dual=True, epsilon=0.01, loss=epsilon_insensitive, tol=0.01)
-3483.9933932198182


In [31]:
# Large-population searches (350 pipelines per generation) for both targets.
# Both use the same repeated 10-fold CV splitter and TPOT's default scoring.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
search_settings_350 = dict(
    generations=5,
    population_size=350,
    verbosity=2,
    cv=cv,
    random_state=42,
    n_jobs=-2,
)

# GRS target.
tpot_350 = TPOTRegressor(**search_settings_350)
tpot_350.fit(X_train, y_grs_train)
grs_350_holdout_score = tpot_350.score(X_test, y_grs_test)
print(grs_350_holdout_score)
tpot_350.export('tpot_350_grs_pipeline.py')

# GCS target.
tpot_350_gcs = TPOTRegressor(**search_settings_350)
tpot_350_gcs.fit(X_gcs_train, y_gcs_train)
gcs_350_holdout_score = tpot_350_gcs.score(X_gcs_test, y_gcs_test)
print(gcs_350_holdout_score)
tpot_350_gcs.export('tpot_350_gcs_pipeline.py')

                                                                                   
Generation 1 - Current best internal CV score: -165650662637.27472
                                                                                    
Generation 2 - Current best internal CV score: -162148449971.6318
                                                                                    
Generation 3 - Current best internal CV score: -160243932342.43372
                                                                                    
Generation 4 - Current best internal CV score: -157587980595.02185
                                                                                  
Generation 5 - Current best internal CV score: -153761057425.6203
                                                              
Best pipeline: ElasticNetCV(XGBRegressor(ZeroCount(input_matrix), learning_rate=0.5, max_depth=5, min_child_weight=4, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsampl

In [4]:
# Single-feature experiment: the first column is the only predictor and the
# last column is the GRS target (both chosen by position).
X_dir = df_working.columns[0:1].tolist()
y_grs = df_working.columns[-1:].tolist()
X_dir

['DIRECT_COST']

In [5]:
# 80/20 hold-out split for the single-feature GRS model.
# Note: this rebinds y_grs_train / y_grs_test.
X_dir_train, X_dir_test, y_grs_train, y_grs_test = train_test_split(
    df_working[X_dir],
    df_working[y_grs].values.ravel(),
    test_size=0.2,
    random_state=42,
)

In [7]:
# R^2-optimised GRS search on the single direct-cost feature. This run uses 50
# generations (the other searches use 5), with population 75 and repeated
# 10-fold CV.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
tpot_dir_75_r2 = TPOTRegressor(
    generations=50,
    population_size=75,
    scoring='r2',
    verbosity=2,
    cv=cv,
    random_state=42,
    n_jobs=-2,
)
tpot_dir_75_r2.fit(X_dir_train, y_grs_train)

# Hold-out R^2 of the best pipeline, then save that pipeline as a script.
dir_75_r2_holdout_score = tpot_dir_75_r2.score(X_dir_test, y_grs_test)
print(dir_75_r2_holdout_score)
tpot_dir_75_r2.export('tpot_dir_75_r2_grs_pipeline.py')

                                                                                 
Generation 1 - Current best internal CV score: 0.6924413345605032
                                                                                 
Generation 2 - Current best internal CV score: 0.6924413345605032
                                                                                 
Generation 3 - Current best internal CV score: 0.6924413345605032
                                                                                 
Generation 4 - Current best internal CV score: 0.6924413345605032
                                                                                 
Generation 5 - Current best internal CV score: 0.7051810612917506
                                                                                 
Generation 6 - Current best internal CV score: 0.7051810612917506
                                                                                 
Generation 7 - Current best in

