In [610]:
# Import our dependencies
import pandas as pd
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
import sys
import os
sys.path.append('../data')
sys.path.append('..')
from app import db
from sqlalchemy import create_engine
!set "DATABASE_URL=postgresql://postgres:postgres@127.0.0.1:5432/frog_projects_db"
engine = create_engine(os.environ['DATABASE_URL'])

In [611]:
projects_df = pd.read_sql_query('select * from "projects"',con=engine)
# projects_df.to_csv('projects.csv')


In [612]:
lumber_df = pd.read_sql_query('select * from lumber_prices', con=engine)
lumber_df.head(5)

Unnamed: 0,date,ticker,open,close,change
0,2021-11-02,LBS=F,575.0,579.5,-4.5
1,2021-11-03,LBS=F,579.799988,646.700012,-66.900024
2,2021-11-04,LBS=F,609.299988,604.0,5.299988
3,2018-05-01,LBS=F,577.200012,591.799988,-14.599976
4,2018-05-02,LBS=F,595.099976,595.0,0.099976


In [613]:
merged = projects_df.merge(lumber_df,left_on="sales_order_date", right_on="date",how='outer').dropna(axis=0,how='any')
# Create calcluated $/elevsqft value columns
# merged['sqft_wall_panels'] = merged['sqft_wall_panels_ext'] + merged['sqft_wall_panels_int']
# merged.to_csv('merged.csv')

In [614]:
features = ['wall_panels_cost_per_elev_sqft','sales_order_date','prototype_prefix','region','panel_vendor','sqft','sqft_wall_panels_ext', 'sqft_wall_panels_int','close']
# features = ['wall_panels_cost_per_elev_sqft','sales_order_date','prototype_prefix','region','panel_vendor','sqft','sqft_wall_panels_ext','close']

# Create new df containing only columns relevant to analyitics
analytical_df = merged[features].sort_values(by=['sales_order_date'],ascending=True) 
# Only consider P12 and P13
analytical_df = analytical_df[analytical_df['prototype_prefix'].str.startswith('P12') | analytical_df['prototype_prefix'].str.startswith('P13')]


# Sales order date only needed for merge - 
analytical_df = analytical_df.drop(columns=['sales_order_date']).reset_index().drop(axis=1,columns='index')
analytical_df


Unnamed: 0,wall_panels_cost_per_elev_sqft,prototype_prefix,region,panel_vendor,sqft,sqft_wall_panels_ext,sqft_wall_panels_int,close
0,9.406957,P12,Northeast Region,Mitsui,4880.0,6576.0,3199.0,543.200012
1,9.288530,P12,Southwest Region,Mitsui,4880.0,7336.0,3013.0,410.700012
2,10.706682,P12,Southwest Region,Mitsui,4880.0,5298.0,3157.0,337.399994
3,9.214844,P12,West Region,Golden State,4844.0,5303.0,3401.0,323.600006
4,9.275289,P12,Southeast Region,Golden State,4859.0,5757.0,4109.0,396.899994
...,...,...,...,...,...,...,...,...
76,25.159391,P12,Northeast Region,RedBuilt,5003.0,5452.0,0.0,606.500000
77,22.694909,P13,Northeast Region,RedBuilt,5147.0,5618.0,0.0,505.100006
78,23.542133,P13,Northeast Region,RedBuilt,4592.0,5364.0,0.0,505.100006
79,22.939864,P13,Northeast Region,RedBuilt,5183.0,5604.0,0.0,505.100006


In [615]:
#ML Packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score


In [616]:
enc = OneHotEncoder(sparse=False)

# Create encoded prototype DF
prototype_enc_df = pd.DataFrame(enc.fit_transform(analytical_df['prototype_prefix'].values.reshape(-1,1))).sort_index()
prototype_enc_df.columns = enc.get_feature_names_out(["prototype_prefix"])

# Create encoded vendor DF
vendor_enc_df = pd.DataFrame(enc.fit_transform(analytical_df['panel_vendor'].values.reshape(-1,1))).sort_index()
vendor_enc_df.columns = enc.get_feature_names_out(["vendor"])

# # Create encoded region DF
region_enc_df = pd.DataFrame(enc.fit_transform(analytical_df['region'].values.reshape(-1,1))).sort_index()
region_enc_df.columns = enc.get_feature_names_out(["region"])


In [617]:
# vendor_enc_df.to_csv('vendors_enc.csv')
# analytical_df.to_csv('analytical_df.csv')

In [618]:
# Merge DFs back into original
analytical_df = analytical_df.merge(vendor_enc_df,left_index=True,right_index=True,how="outer").drop(columns="panel_vendor",axis=1)
analytical_df = analytical_df.merge(region_enc_df,left_index=True,right_index=True).drop(columns="region",axis=1)
analytical_df = analytical_df.merge(prototype_enc_df,left_index=True,right_index=True).drop(columns="prototype_prefix",axis=1)


In [619]:
# Features
X = analytical_df.drop(columns=["wall_panels_cost_per_elev_sqft"],axis=1)

# Target
y = analytical_df["wall_panels_cost_per_elev_sqft"]

In [620]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43,test_size=0.20)

scaler = MinMaxScaler()

# Fit the StandardScaler on non-binary columns
# cols = ['sqft','sqft_wall_panels_ext','sqft_wall_panels_int','close']
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# RANDOM FOREST REGRESSOR

In [621]:
# Create a random forest model
rf_model = RandomForestRegressor(n_estimators=500, random_state=69) 
rf_model = rf_model.fit(X_train_scaled,y_train)

In [622]:
# Rank feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_,X.columns),reverse=True)

[(0.8599523578828634, 'sqft_wall_panels_int'),
 (0.06001530073398924, 'close'),
 (0.024783404298534952, 'sqft_wall_panels_ext'),
 (0.01775078851734801, 'sqft'),
 (0.007911952619964404, 'region_Southwest Region'),
 (0.006270552089281993, 'region_Northeast Region'),
 (0.005293222052470969, 'region_West Region'),
 (0.003442451115035509, 'vendor_Golden State'),
 (0.00290352774580376, 'prototype_prefix_P13'),
 (0.0028862636437991525, 'vendor_Stark Truss'),
 (0.0026958010196223236, 'prototype_prefix_P12'),
 (0.0025508122791763985, 'region_Southeast Region'),
 (0.0017494607965527285, 'region_Midwest Region'),
 (0.0010199808447582494, 'vendor_RedBuilt'),
 (0.0007071047233827814, 'vendor_SR Sloan'),
 (6.70196374162831e-05, 'region_Atlantic Region'),
 (0.0, 'vendor_Mitsui')]

In [623]:
rf_model.score(X_test,y_test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


0.8770512102484236

# KNN

In [624]:
from sklearn.neighbors import KNeighborsRegressor

In [625]:
knn_model = KNeighborsRegressor(n_neighbors=7,weights='uniform')
knn_model = knn_model.fit(X_train,y_train)
knn_model.score(X_test,y_test)

  "X does not have valid feature names, but"


0.8890640147398948

# Linear Regression

In [626]:
from sklearn.linear_model import LinearRegression

In [627]:
linear_model = LinearRegression()
linear_model = linear_model.fit(X_train,y_train)
linear_model.score(X_test,y_test)

0.7964850475697015

# Neural Network - DNN Regressor

In [628]:
import tensorflow as tf

In [629]:
#Creating Feature Columns
feat_cols=[]
for cols in X.columns[:-1]:
    column=tf.feature_column.numeric_column(cols)
    feat_cols.append(column)
    
print(feat_cols)

[NumericColumn(key='sqft', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='sqft_wall_panels_ext', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='sqft_wall_panels_int', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='close', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Golden State', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Mitsui', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_RedBuilt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_SR Sloan', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='vendor_Stark Truss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='region_Atlanti

In [630]:
# Create the Keras Sequential model
dnn_model = tf.compat.v1.estimator.DNNRegressor(hidden_units=[6,10,6],feature_columns=feat_cols)
input_func=tf.compat.v1.estimator.inputs.pandas_input_fn(X_train,y_train,batch_size=10,num_epochs=1000,shuffle=True)

#Training the model
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jbuccola\\AppData\\Local\\Temp\\tmpa5n1cxsi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


ValueError: in user code:

    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py:238 call  *
        net = self._input_layer(features, training=is_training)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\keras\engine\base_layer_v1.py:765 __call__  **
        outputs = call_fn(cast_inputs, *args, **kwargs)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\keras\feature_column\dense_features.py:163 call  **
        with backend.name_scope(column.name):
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow\python\framework\ops.py:6729 __enter__
        scope_name = scope.__enter__()
    C:\Users\jbuccola\.conda\envs\PythonData\lib\contextlib.py:112 __enter__
        return next(self.gen)
    C:\Users\jbuccola\.conda\envs\PythonData\lib\site-packages\tensorflow\python\framework\ops.py:4274 name_scope
        raise ValueError("'%s' is not a valid scope name" % name)

    ValueError: 'region_Atlantic Region' is not a valid scope name


In [None]:

#Evaluating the model
train_metrics=dnn_model.evaluate(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-11-08T10:55:45
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\jbuccola\AppData\Local\Temp\tmp2e8pbp3h\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Evaluation [300/1000]
INFO:tensorflow:Evaluation [400/1000]
INFO:tensorflow:Evaluation [500/1000]
INFO:tensorflow:Evaluation [600/1000]
INFO:tensorflow:Evaluation [700/1000]
INFO:tensorflow:Evaluation [800/1000]
INFO:tensorflow:Evaluation [900/1000]
INFO:tensorflow:Evaluation [1000/1000]
INFO:tensorflow:Inference Time : 1.25582s
INFO:tensorflow:Finished evaluation at 2021-11-08-10:55:46
INFO:tensorflow:Saving dict for global step 1000: average_loss = 90.0454, global_step = 1000, label/mean = 12.907051, loss = 900.454, prediction/mean = 6.07830

In [None]:
# Fit the model to the training data
# fit_model = nn_model.fit(X_train, y_train, epochs=)