In [1]:
# Import our dependencies
import pandas as pd
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
import sys
import os
import random
sys.path.append('../data')
sys.path.append('..')
from app import db
from sqlalchemy import create_engine
import ml
from ml.training import ENC, encode_df, prep_data, evaluate_models, train_model, get_prediction
!set "DATABASE_URL=postgresql://postgres:postgres@127.0.0.1:5432/frog_projects_db"
engine = create_engine(os.environ['DATABASE_URL'])

In [2]:
projects_df = pd.read_sql_query('select * from "projects"',con=engine)
# projects_df.to_csv('projects.csv')


In [3]:
lumber_df = pd.read_sql_query('select * from lumber_prices', con=engine)
lumber_df.head(5)

Unnamed: 0,date,ticker,open,close,change
0,2021-11-02,LBS=F,575.0,579.5,-4.5
1,2021-11-03,LBS=F,579.799988,646.700012,-66.900024
2,2021-11-04,LBS=F,609.299988,604.0,5.299988
3,2021-11-05,LBS=F,605.299988,615.0,-9.700012
4,2021-11-06,LBS=F,605.299988,615.0,-9.700012


In [4]:
merged = projects_df.merge(lumber_df,left_on="sales_order_date", right_on="date",how='outer').dropna(axis=0,how='any')
# Create calcluated $/elevsqft value columns
# merged['sqft_wall_panels'] = merged['sqft_wall_panels_ext'] + merged['sqft_wall_panels_int']
# merged.to_csv('merged.csv')

In [5]:
features = ['wall_panels_cost_per_elev_sqft','sales_order_date','prototype_prefix','region','panel_vendor','sqft','sqft_wall_panels_ext', 'sqft_wall_panels_int','close']
# features = ['wall_panels_cost_per_elev_sqft','sales_order_date','prototype_prefix','region','panel_vendor','sqft','sqft_wall_panels_ext','close']

# Create new df containing only columns relevant to analyitics
analytical_df = merged[features].sort_values(by=['sales_order_date'],ascending=True) 
# Only consider P12 and P13
analytical_df = analytical_df[analytical_df['prototype_prefix'].str.startswith('P12') | analytical_df['prototype_prefix'].str.startswith('P13')]


# Sales order date only needed for merge - 
analytical_df = analytical_df.drop(columns=['sales_order_date']).reset_index().drop(axis=1,columns='index')
analytical_df
sample_prediction = pd.DataFrame(analytical_df.iloc[[random.randint(0,80)]]).drop(columns=["wall_panels_cost_per_elev_sqft"],axis=1)


In [6]:
X, y = prep_data()
X

Unnamed: 0,sqft,sqft_wall_panels_ext,sqft_wall_panels_int,close,panel_vendor_Golden State,panel_vendor_Mitsui,panel_vendor_RedBuilt,panel_vendor_SR Sloan,panel_vendor_Stark Truss,region_Atlantic Region,region_Midwest Region,region_Northeast Region,region_Southeast Region,region_Southwest Region,region_West Region,prototype_prefix_P12,prototype_prefix_P13
0,4880.0,6576.0,3199.0,543.200012,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,4880.0,7336.0,3013.0,410.700012,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,4880.0,5298.0,3157.0,337.399994,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,4844.0,5303.0,3401.0,323.600006,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,4859.0,5757.0,4109.0,396.899994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,5003.0,5452.0,0.0,606.500000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
77,4592.0,5364.0,0.0,505.100006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
78,5147.0,5618.0,0.0,505.100006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
79,5183.0,5604.0,0.0,505.100006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [7]:
prediction = prep_data(sample_prediction)
prediction

Unnamed: 0,sqft,sqft_wall_panels_ext,sqft_wall_panels_int,close,panel_vendor_Golden State,panel_vendor_Mitsui,panel_vendor_RedBuilt,panel_vendor_SR Sloan,panel_vendor_Stark Truss,region_Atlantic Region,region_Midwest Region,region_Northeast Region,region_Southeast Region,region_Southwest Region,region_West Region,prototype_prefix_P12,prototype_prefix_P13
80,4549.0,5711.0,2611.0,676.299988,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
rf_model = train_model(X,y,True,'RFR')

In [9]:
pred = get_prediction(rf_model,prediction)
pred

  f"X has feature names, but {self.__class__.__name__} was fitted without"


array([11.49446945])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43,test_size=0.20)

scaler = MinMaxScaler()

# Fit the StandardScaler on non-binary columns
# cols = ['sqft','sqft_wall_panels_ext','sqft_wall_panels_int','close']
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
X_train

Unnamed: 0,sqft,sqft_wall_panels_ext,sqft_wall_panels_int,close,vendor_Golden State,vendor_Mitsui,vendor_RedBuilt,vendor_SR Sloan,vendor_Stark Truss,region_Atlantic Region,region_Midwest Region,region_Northeast Region,region_Southeast Region,region_Southwest Region,region_West Region,prototype_prefix_P12,prototype_prefix_P13
63,4990.0,5833.0,0.0,973.500000,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8,4880.0,5572.0,2642.0,365.200012,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
39,4872.0,5482.0,2404.0,361.299988,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14,4869.0,6020.0,4028.0,396.299988,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37,4990.0,5991.0,2655.0,354.600006,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,4994.0,6081.0,3151.0,990.099976,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
21,5164.0,5777.0,2934.0,367.100006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
49,4992.0,6000.0,3619.0,928.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
64,4979.0,5898.0,0.0,1009.099976,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# RANDOM FOREST REGRESSOR

In [14]:
RFP={'n_estimators': 400,'min_samples_split': 10,'min_samples_leaf': 4,'max_features': 'auto','max_depth': 70,'bootstrap': True}

In [15]:
# Create a random forest model
rf_model = RandomForestRegressor(n_estimators=RFP['n_estimators'], min_samples_split=RFP['min_samples_split'], min_samples_leaf=RFP['min_samples_leaf'],max_features=RFP['max_features'],max_depth=RFP['max_depth'],bootstrap=RFP['bootstrap'], random_state=69) 
rf_model = rf_model.fit(X_train,y_train)

In [16]:
# Rank feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_,X.columns),reverse=True)

[(0.932200248795263, 'sqft_wall_panels_int'),
 (0.03969485124532873, 'close'),
 (0.008886551600879354, 'region_Northeast Region'),
 (0.005667496947088656, 'sqft'),
 (0.004697038165209793, 'vendor_Golden State'),
 (0.004457531782337986, 'sqft_wall_panels_ext'),
 (0.0011404021432223914, 'region_West Region'),
 (0.00107529775241429, 'prototype_prefix_P12'),
 (0.0008007847662553704, 'prototype_prefix_P13'),
 (0.0006553247097103025, 'region_Midwest Region'),
 (0.0003668870897884188, 'region_Southeast Region'),
 (0.00028702386481315015, 'vendor_RedBuilt'),
 (7.056113768848933e-05, 'vendor_SR Sloan'),
 (0.0, 'vendor_Stark Truss'),
 (0.0, 'vendor_Mitsui'),
 (0.0, 'region_Southwest Region'),
 (0.0, 'region_Atlantic Region')]

In [10]:
sample_prediction.values

array([['P12', 'Northeast Region', 'RedBuilt', 4872.0, 5482.0, 2404.0,
        361.299987792969]], dtype=object)

In [18]:
rf_model.score(X_test,y_test)

0.9226468510904

In [11]:
# Get prediction
X, y, prediction = prep_data(sample_prediction)
prediction

Unnamed: 0,sqft,sqft_wall_panels_ext,sqft_wall_panels_int,close,panel_vendor_Golden State,panel_vendor_Mitsui,panel_vendor_RedBuilt,panel_vendor_SR Sloan,panel_vendor_Stark Truss,region_Atlantic Region,region_Midwest Region,region_Northeast Region,region_Southeast Region,region_Southwest Region,region_West Region,prototype_prefix_P12,prototype_prefix_P13
80,4549.0,5711.0,2611.0,676.299988,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [13]:
scores = evaluate_models(X,y)
scores

{'RandomForestRegressor': 0.9589081809851683,
 'KNeighborsRegressor': 0.6436792417569732,
 'LinearRegression': 0.9312653328528012}

In [21]:
rf_model.predict(prediction)


Feature names unseen at fit time:
- panel_vendor_Golden State
- panel_vendor_Mitsui
- panel_vendor_RedBuilt
- panel_vendor_SR Sloan
- panel_vendor_Stark Truss
Feature names seen at fit time, yet now missing:
- vendor_Golden State
- vendor_Mitsui
- vendor_RedBuilt
- vendor_SR Sloan
- vendor_Stark Truss



array([12.05213765])

# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn_model = KNeighborsRegressor(n_neighbors=10,weights='uniform',algorithm='ball_tree')
knn_model = knn_model.fit(X_train,y_train)
knn_model.score(X_test,y_test)

  "X does not have valid feature names, but"


0.8766023107939747

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linear_model = LinearRegression()
linear_model = linear_model.fit(X_train,y_train)
linear_model.score(X_test,y_test)

0.7964850475697015

# Model Selection

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
# rf_random.fit(X_train, y_train)


In [None]:
# rf_random.best_params_

In [None]:

# Number of trees in knn
k_range=list(range(1,31))
param_grid = dict(n_neighbors=k_range,weights=['uniform','distance'],algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'])

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
kn = KNeighborsRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
kn_random = RandomizedSearchCV(estimator = kn, param_distributions = param_grid, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
kn_random.fit(X_train, y_train)




Fitting 3 folds for each of 240 candidates, totalling 720 fits


RandomizedSearchCV(cv=3, estimator=KNeighborsRegressor(), n_iter=300, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30],
                                        'weights': ['uniform', 'distance']},
                   random_state=42, verbose=2)

In [None]:
kn_random.best_params_

{'weights': 'distance', 'n_neighbors': 10, 'algorithm': 'ball_tree'}

# Neural Network - DNN Regressor

In [None]:
import tensorflow as tf

In [None]:
#Creating Feature Columns
feat_cols=[]
for cols in X.columns[:-1]:
    column=tf.feature_column.numeric_column(cols)
    feat_cols.append(column)
    
print(feat_cols)

[NumericColumn(key='sqft', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='sqft_wall_panels_ext', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='sqft_wall_panels_int', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='close', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Golden State', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Mitsui', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_RedBuilt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_SR Sloan', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Stark Truss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='region_Atlanti

In [None]:
# Create the Keras Sequential model
dnn_model = tf.compat.v1.estimator.DNNRegressor(hidden_units=[6,10,6],feature_columns=feat_cols)
input_func=tf.compat.v1.estimator.inputs.pandas_input_fn(X_train,y_train,batch_size=10,num_epochs=1000,shuffle=True)

#Training the model
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jbuccola\\AppData\\Local\\Temp\\tmpp_fqauil', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


ValueError: in user code:

    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py:238 call  *
        net = self._input_layer(features, training=is_training)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\keras\engine\base_layer_v1.py:765 __call__  **
        outputs = call_fn(cast_inputs, *args, **kwargs)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\keras\feature_column\dense_features.py:163 call  **
        with backend.name_scope(column.name):
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow\python\framework\ops.py:6729 __enter__
        scope_name = scope.__enter__()
    C:\Users\jbuccola\.conda\envs\PythonData\lib\contextlib.py:112 __enter__
        return next(self.gen)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow\python\framework\ops.py:4274 name_scope
        raise ValueError("'%s' is not a valid scope name" % name)

    ValueError: 'region_Atlantic Region' is not a valid scope name


In [None]:

#Evaluating the model
train_metrics=dnn_model.evaluate(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-11-08T10:55:45
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\jbuccola\AppData\Local\Temp\tmp2e8pbp3h\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Evaluation [300/1000]
INFO:tensorflow:Evaluation [400/1000]
INFO:tensorflow:Evaluation [500/1000]
INFO:tensorflow:Evaluation [600/1000]
INFO:tensorflow:Evaluation [700/1000]
INFO:tensorflow:Evaluation [800/1000]
INFO:tensorflow:Evaluation [900/1000]
INFO:tensorflow:Evaluation [1000/1000]
INFO:tensorflow:Inference Time : 1.25582s
INFO:tensorflow:Finished evaluation at 2021-11-08-10:55:46
INFO:tensorflow:Saving dict for global step 1000: average_loss = 90.0454, global_step = 1000, label/mean = 12.907051, loss = 900.454, prediction/mean = 6.07830

In [None]:
# Fit the model to the training data
# fit_model = nn_model.fit(X_train, y_train, epochs=)