In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
import datetime

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Helper Functions

In [3]:
def rsi(df,column_name,n, ema = True):
    """
    Returns a pd.Series with the relative strength index.
    
    inputs:
    
    df: dataframe
    columns_name: str, columns label to calculat the rsi of
    n: rsi period length
    ema: Bool, whether or not to use exponantial moving average or simple moving average in calculation
    
    """
    delta = df[column_name].diff(1)

    # Make two series: one for lower closes and one for higher closes
    
    up = delta.clip(lower=0)
    down = -1 * delta.clip(upper=0)
    
    if ema == True:
        
        ma_up = up.ewm(com = n - 1, adjust=True, min_periods = n).mean()
        ma_down = down.ewm(com = n - 1, adjust=True, min_periods = n).mean()
        
    else:
        # Use simple moving average
        ma_up = up.rolling(window = n).mean()
        ma_down = down.rolling(window = n).mean()
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

In [4]:
def time_lag_merge(df_1, df_2,lag_dictionary = {},return_full = False):
    '''
    Parameters
    ----------
    df_1 : pandas Dataframe
        left dataframe that has a 'zip_code' and 'Time' column.
    df_2 : pandas Dataframe
        right dataframe that has a 'zip_code' and 'Time' column.
    lag_dictionary : dictionary
        keys are number of months you want to lag. 
        values are lists of columns that you want to have that lag.
    Returns
    -------
    df_1_ : pandas Dataframe
        dataframe to have new lagged columns.
    '''
    if lag_dictionary:
        df_1_ = df_1.copy()
        for lag in lag_dictionary.keys():
            df_2_ = df_2.copy()
            df_2_.loc[:,'Time'] = df_2_.loc[:,'Time'] + pd.DateOffset(months=lag)
            if return_full:
                df_1_ = df_1_.merge(
                    df_2_[lag_dictionary[lag]+['zip_code','Time']
                          ].add_suffix(f'_{lag}_month_shift').rename(
                      columns={f'Time_{lag}_month_shift':'Time',
                               f'zip_code_{lag}_month_shift':'zip_code'}), 
                    how = 'outer', 
                    on = ['zip_code','Time'])
            else: 
                df_1_ = df_1_.merge(
                    df_2_[lag_dictionary[lag]+['zip_code','Time']
                          ].add_suffix(f'_{lag}_month_shift').rename(
                      columns={f'Time_{lag}_month_shift':'Time',
                               f'zip_code_{lag}_month_shift':'zip_code'}), 
                    how = 'left', 
                    on = ['zip_code','Time'])
    else:
        df_1_ = df_1.merge(df_2, how = 'left', on = ['zip_code','Time'],
                          suffixes = (None,'_right'))
    return df_1_

# Load Data

In [148]:
# load data
texas_data = pd.read_csv(r'C:\Users\robla\Desktop\nycdsa\Capstone\Pipeline Data\merged_texas_data.csv', index_col = 0,
                   parse_dates = ['Time']
                  )

acs_data = pd.read_csv(r'C:\Users\robla\Desktop\nycdsa\Capstone\Pipeline Data\merged_acs_data.csv', index_col = 0,
                      parse_dates = ['Time'])

# engineered_acs = pd.read_csv(r'C:\Users\robla\Desktop\nycdsa\Capstone\data_dump\engineered_acs.csv', index_col = 0,
#                       parse_dates = ['Time'])


# acs_data = pd.merge(acs_data,engineered_acs, how = 'left', on = ['Time','zip_code'])

zri = pd.read_csv(r'C:\Users\robla\Desktop\nycdsa\Capstone\Pipeline Data\long_interpolated_target.csv', index_col = 0,
                   parse_dates=['Time']
                  )

In [149]:
texas_data.columns

Index(['Time', 'zip_code', 'mintempC', 'sales_tax_rate', 'total_sales_tax',
       'tx_is_abt_same', 'tx_is_better', 'tx_is_worse',
       'Single Family Building Permits TX',
       'Gross Value Natural Gas Production', 'sap_case_shiller_index',
       'taxpayer_count', 'taxpayer_cl_ratio', 'taxpayer_is_ratio',
       'taxpayer_foreign_ratio', 'mintempC_annual_pct_change',
       'sales_tax_rate_annual_pct_change', 'total_sales_tax_annual_pct_change',
       'tx_is_abt_same_annual_pct_change', 'tx_is_better_annual_pct_change',
       'tx_is_worse_annual_pct_change',
       'Single Family Building Permits TX_annual_pct_change',
       'Gross Value Natural Gas Production_annual_pct_change',
       'sap_case_shiller_index_annual_pct_change',
       'taxpayer_count_annual_pct_change',
       'taxpayer_cl_ratio_annual_pct_change',
       'taxpayer_is_ratio_annual_pct_change'],
      dtype='object')

In [150]:
texas_data.dtypes

Time                                                    datetime64[ns]
zip_code                                                         int64
mintempC                                                       float64
sales_tax_rate                                                 float64
total_sales_tax                                                float64
tx_is_abt_same                                                 float64
tx_is_better                                                   float64
tx_is_worse                                                    float64
Single Family Building Permits TX                              float64
Gross Value Natural Gas Production                             float64
sap_case_shiller_index                                         float64
taxpayer_count                                                 float64
taxpayer_cl_ratio                                              float64
taxpayer_is_ratio                                              float64
taxpay

In [151]:
acs_data.dtypes

zip_code                                                               float64
white_pop                                                              float64
black_pop                                                              float64
hispanic_pop                                                           float64
female_female_households                                               float64
children                                                               float64
high_school_diploma                                                    float64
armed_forces                                                           float64
Time                                                            datetime64[ns]
driving_alone_population                                                 int64
housing_units_built_1940_to_1949                                         int64
female_45_to_49                                                          int64
female_75_to_79                                     

In [152]:
zri.dtypes

Time        datetime64[ns]
zip_code             int64
zori_ssa           float64
dtype: object

In [357]:
# adding shift to zri
zri_shift = time_lag_merge(zri, zri, {
    12:['zori_ssa'],###############
    13:['zori_ssa'],
    18:['zori_ssa'],
    24:['zori_ssa']
},
                                          return_full = True
                                         )

# there should now be extra values after our target. 
# We are gonna remove the missing values that happen at the start of our inputs tho
zri_shift = zri_shift.sort_values('Time')
# msno.matrix(zri_shift)
# plt.show()
zri_shift = zri_shift.dropna(subset = ['zori_ssa_24_month_shift'],axis='index',
                             how = 'any').reset_index(drop = True)
# msno.matrix(zri_shift)
# plt.show()
# Adding the shift values
zri_shift.loc[:,'zori_ssa_1_diff_lag_12'] = (zri_shift.loc[:,'zori_ssa_12_month_shift'] -
                                             zri_shift.loc[:,'zori_ssa_13_month_shift'])
zri_shift.loc[:,'zori_ssa_6_diff_lag_12'] = (zri_shift.loc[:,'zori_ssa_12_month_shift'] -
                                             zri_shift.loc[:,'zori_ssa_18_month_shift'])
zri_shift.loc[:,'zori_ssa_12_diff_lag_12'] = (zri_shift.loc[:,'zori_ssa_12_month_shift'] -
                                             zri_shift.loc[:,'zori_ssa_24_month_shift'])
zri_shift['zori_ssa_12_diff_lag_12_per'] = (zri_shift['zori_ssa_12_diff_lag_12']/
                                           zri_shift['zori_ssa_12_month_shift'])

zri_shift = zri_shift[['Time','zip_code','zori_ssa',#new features
                       'zori_ssa_12_month_shift',
                       'zori_ssa_1_diff_lag_12', 
                       'zori_ssa_6_diff_lag_12',
                       'zori_ssa_12_diff_lag_12_per'
                      ]]


# merge non acs data 
extra_shift = ['Gross Value Natural Gas Production', 'sap_case_shiller_index']
merged_df = time_lag_merge(zri_shift, 
                                                    texas_data, {
    12:list(texas_data.drop(columns = ['Time','zip_code']+extra_shift
                            ).columns),
    13:extra_shift
},
                                          return_full = True
                                         )
# merge acs data
acs_1_cols = [
    'black_pop',
    'white_pop',
    'hispanic_pop',
    'high_school_diploma',
    'female_female_households',
    'armed_forces',
    'children',
    'black_pop_annual_pct_change',
    'white_pop_annual_pct_change',
    'hispanic_pop_annual_pct_change',
    'high_school_diploma_annual_pct_change',
    'children_annual_pct_change',
    ]
merged_df = time_lag_merge(merged_df, 
                                                    acs_data, {
    36:list(acs_data.drop(columns = ['Time','zip_code'] + acs_1_cols).columns),
    48:acs_1_cols                                              
},
                                          return_full = True
                                         )
# # visualize missing values. it should be that acs 2 does not have a single zipcode
# # then removing that line and checking to see that there are no more missing values.
merged_df = merged_df.loc[merged_df['Time']>datetime.datetime(2016,6,2),:
                          ].reset_index(drop=True)
merged_df = merged_df.loc[merged_df['Time']<datetime.datetime(2022,7,2),:
                          ].reset_index(drop=True)
merged_df = merged_df.sort_values('Time')
merged_df = merged_df.dropna(subset = ['single_women_36_month_shift'],axis='index',
                             how = 'any').reset_index(drop = True)

# this now should have no missing values except for the last year when 
# we are doing a forecast
# msno.matrix(merged_df)
# plt.show()



# creating list of variables to put into the model. initialy is all non index and target
X_vals = merged_df.drop(columns = ['Time','zip_code','zori_ssa']).columns.to_list()
# then remove variables that don't work for that given model.
bad_X_vals = []
X_vals = [X_val for X_val in X_vals if X_val not in bad_X_vals]
y_val = 'zori_ssa'

# split train and test based on a year in advance.
train = merged_df.loc[merged_df['Time']<datetime.datetime(2020,7,2),:].reset_index(drop=True)
post_train = merged_df.loc[merged_df['Time']>datetime.datetime(2020,7,2),:].reset_index(drop=True)
test = post_train.loc[post_train['Time']<datetime.datetime(2021,7,2),:].reset_index(drop=True)
forecast = post_train.loc[post_train['Time']>datetime.datetime(2021,7,2),:].reset_index(drop=True)

# set up x and y values with a scaler (for now ignoring scaler for non-linear methods)
# train first

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
X = train[X_vals]
#X = scaler.fit_transform(X)
y = train[y_val]
# test second
X_test = test[X_vals]
#X_test = scaler.transform(X_test)
y_test = test[y_val]
# forecasted values
X_forecast = forecast[X_vals]
#X_forecast = scaler.transform(X_forecast)

In [358]:
X_vals

['zori_ssa_12_month_shift',
 'zori_ssa_1_diff_lag_12',
 'zori_ssa_6_diff_lag_12',
 'zori_ssa_12_diff_lag_12_per',
 'mintempC_12_month_shift',
 'sales_tax_rate_12_month_shift',
 'total_sales_tax_12_month_shift',
 'tx_is_abt_same_12_month_shift',
 'tx_is_better_12_month_shift',
 'tx_is_worse_12_month_shift',
 'Single Family Building Permits TX_12_month_shift',
 'taxpayer_count_12_month_shift',
 'taxpayer_cl_ratio_12_month_shift',
 'taxpayer_is_ratio_12_month_shift',
 'taxpayer_foreign_ratio_12_month_shift',
 'mintempC_annual_pct_change_12_month_shift',
 'sales_tax_rate_annual_pct_change_12_month_shift',
 'total_sales_tax_annual_pct_change_12_month_shift',
 'tx_is_abt_same_annual_pct_change_12_month_shift',
 'tx_is_better_annual_pct_change_12_month_shift',
 'tx_is_worse_annual_pct_change_12_month_shift',
 'Single Family Building Permits TX_annual_pct_change_12_month_shift',
 'Gross Value Natural Gas Production_annual_pct_change_12_month_shift',
 'sap_case_shiller_index_annual_pct_change_1

# Random Forest XgBoost 


In [359]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
import time

In [388]:
xgb = XGBRegressor(objective = 'reg:squarederror')

grid = {'n_estimators':[10,15,20],
        'max_depth':np.arange(1,11,1),
        
}

grid_search_XGB = GridSearchCV(estimator = xgb, param_grid = grid, cv = 4)


start_time = time.time()

grid_search_XGB.fit(X,y)

print(time.time() - start_time)

33.2349956035614


In [390]:
cv_results = pd.DataFrame(grid_search_XGB.cv_results_)
cv_results.loc[:,'mean_train-mean_test'] = cv_results.loc[:,'mean_train_score'] - cv_results.loc[:,'mean_test_score']
#cv_results

In [391]:
fig = px.line(data_frame = cv_results, x = 'param_max_depth', y = ['mean_train-mean_test'], color = 'param_n_estimators')
fig.show()

In [392]:
fig = px.line(data_frame = cv_results, x = 'param_max_depth', y = ['mean_test_score'], color = 'param_n_estimators')
fig.show()

In [393]:
print("Best Parameters: ",grid_search_XGB.best_params_,"\n\n",
      "Mean cross-validated score of the best_estimator: ",grid_search_XGB.best_score_,"\n")

best_xgb = grid_search_XGB.best_estimator_

Best Parameters:  {'max_depth': 10, 'n_estimators': 20} 

 Mean cross-validated score of the best_estimator:  0.9895160467036421 



In [394]:
best_xgb.score(X,y)

0.9994855915174374

In [395]:
best_xgb.score(X_test,y_test)

0.9416662678749914

In [396]:
best_xgb_predictions = best_xgb.predict(X_test)

#calculate RMSE
print('RMSE: ',sqrt(mean_squared_error(y_test, best_xgb_predictions)))


residuals = y_test - best_xgb_predictions

residuals = pd.DataFrame(residuals)
residuals.rename(columns = {'zori_ssa':'residuals'}, inplace = True)

residuals.loc[:,'predicted_values'] = best_xgb_predictions

fig = px.scatter(data_frame = residuals, x = 'predicted_values',y = 'residuals' ,title = "Residual Plot")

fig.show()

RMSE:  61.221660818085766


In [378]:
feature_importance = [i for i in zip(X.columns,best_xgb.feature_importances_)]
feature_importance = sorted(feature_importance,key = lambda x: x[1],reverse = True)

FI = pd.DataFrame(feature_importance,columns = ['feature','importance'])

fig = px.bar(data_frame = FI.iloc[1:,:],
             x = 'feature',
             y = 'importance',
             title = "Feature Importance",
             color = 'feature',
            height = 1000,width = 2000)
fig.show()

# Linear Model XgBoost 

In [262]:
xgb = XGBRegressor(objective = 'reg:squarederror',booster = 'gblinear')

grid = {'reg_lambda':np.arange(.01,1,.005),
        'reg_alpha':np.arange(.01,.02,.005),
        
}

grid_search_XGB = GridSearchCV(estimator = xgb, param_grid = grid, cv = 3)


start_time = time.time()

grid_search_XGB.fit(X,y)

print(time.time() - start_time)

313.91293716430664


In [264]:
cv_results = pd.DataFrame(grid_search_XGB.cv_results_)
cv_results.loc[:,'mean_train-mean_test'] = cv_results.loc[:,'mean_train_score'] - cv_results.loc[:,'mean_test_score']
#cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_alpha,param_reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,mean_train-mean_test
0,0.220333,0.006649903,0.007334,0.0004710896,0.01,0.01,"{'reg_alpha': 0.01, 'reg_lambda': 0.01}",0.87979,0.895449,0.888729,0.887988,0.006414,65,0.915715,0.918755,0.927424,0.920631,0.004961,0.032643
1,0.227,0.01883105,0.007,1.561394e-06,0.01,0.015,"{'reg_alpha': 0.01, 'reg_lambda': 0.015}",0.886312,0.905106,0.886335,0.892584,0.008854,48,0.917985,0.922703,0.919671,0.92012,0.001952,0.027536
2,0.217333,0.005435621,0.007001,5.947204e-07,0.01,0.02,"{'reg_alpha': 0.01, 'reg_lambda': 0.0199999999...",0.895967,0.902083,0.897293,0.898448,0.002627,18,0.921164,0.920821,0.921495,0.92116,0.000275,0.022712
3,0.260997,0.02499416,0.007,7.867412e-07,0.01,0.025,"{'reg_alpha': 0.01, 'reg_lambda': 0.0249999999...",0.885195,0.914234,0.886125,0.895184,0.013475,39,0.920508,0.929307,0.920587,0.923467,0.004129,0.028283
4,0.217999,0.004966307,0.007335,0.0004717076,0.01,0.03,"{'reg_alpha': 0.01, 'reg_lambda': 0.03}",0.889859,0.90617,0.897235,0.897754,0.006669,23,0.918786,0.923625,0.921892,0.921434,0.002002,0.02368
5,0.210666,0.003090905,0.008002,1.730247e-06,0.01,0.035,"{'reg_alpha': 0.01, 'reg_lambda': 0.0349999999...",0.899058,0.91072,0.89686,0.902213,0.006082,1,0.923069,0.926443,0.924451,0.924654,0.001385,0.022442
6,0.216665,0.004643498,0.006667,0.0004709208,0.01,0.04,"{'reg_alpha': 0.01, 'reg_lambda': 0.0399999999...",0.896457,0.901838,0.896432,0.898242,0.002542,20,0.924169,0.921851,0.922048,0.922689,0.00105,0.024447
7,0.215667,0.01033766,0.012667,0.007319409,0.01,0.045,"{'reg_alpha': 0.01, 'reg_lambda': 0.045}",0.899602,0.893966,0.894622,0.896064,0.002517,34,0.920844,0.923881,0.921373,0.922033,0.001325,0.025969
8,0.226666,0.02163891,0.007,8.485379e-07,0.01,0.05,"{'reg_alpha': 0.01, 'reg_lambda': 0.0499999999...",0.894911,0.905929,0.89863,0.899823,0.004576,9,0.920051,0.924199,0.92393,0.922727,0.001895,0.022904
9,0.210333,0.002624479,0.007333,0.0004715394,0.01,0.055,"{'reg_alpha': 0.01, 'reg_lambda': 0.0549999999...",0.892525,0.911077,0.901898,0.901833,0.007574,2,0.916422,0.92686,0.926163,0.923148,0.004765,0.021315


In [265]:
fig = px.line(data_frame = cv_results, x = 'param_reg_lambda', y = ['mean_train-mean_test'], color = 'param_reg_alpha')
fig.show()

In [266]:
fig = px.line(data_frame = cv_results, x = 'param_reg_lambda', y = ['mean_test_score'], color = 'param_reg_alpha')
fig.show()

In [267]:
print("Best Parameters: ",grid_search_XGB.best_params_,"\n\n",
      "Mean cross-validated score of the best_estimator: ",grid_search_XGB.best_score_,"\n")

best_xgb = grid_search_XGB.best_estimator_

Best Parameters:  {'reg_alpha': 0.01, 'reg_lambda': 0.034999999999999996} 

 Mean cross-validated score of the best_estimator:  0.9022126031406033 



In [268]:
best_xgb.score(X,y)

0.9214809222983835

In [269]:
best_xgb.score(X_test,y_test)

0.8599927386233654

In [270]:
best_xgb_predictions = best_xgb.predict(X_test)

#calculate RMSE
print('RMSE: ',sqrt(mean_squared_error(y_test, best_xgb_predictions)))


residuals = y_test - best_xgb_predictions

residuals = pd.DataFrame(residuals)
residuals.rename(columns = {'zori_ssa':'residuals'}, inplace = True)

residuals.loc[:,'predicted_values'] = best_xgb_predictions

fig = px.scatter(data_frame = residuals, x = 'predicted_values',y = 'residuals' ,title = "Residual Plot")

fig.show()

RMSE:  94.8463245144735


# Random Forest

In [379]:
from sklearn.ensemble import RandomForestRegressor

In [397]:
randomForest = RandomForestRegressor()

da_grid = {'n_estimators':[10,15,20],
           'criterion':['mse'],
          'max_depth':[5,10,15,20],
          'min_samples_split':[4,6,8]}

grid_search_forest = GridSearchCV(estimator = randomForest, param_grid = da_grid, cv = 4)

start_time = time.time()

grid_search_forest.fit(X,y)

print(time.time() - start_time)

449.6273367404938


In [398]:
print("Best Parameters: ",grid_search_forest.best_params_,"\n\n",
      "Mean cross-validated score of the best_estimator: ",grid_search_forest.best_score_,"\n")

best_forest = grid_search_forest.best_estimator_
feature_importance = [i for i in zip(X.columns,best_forest.feature_importances_)]
feature_importance = sorted(feature_importance,key = lambda x: x[1],reverse = True)

FI3 = pd.DataFrame(feature_importance,columns = ['feature','importance'])

fig = px.bar(data_frame = FI3.iloc[1:31,:],
             x = 'feature',
             y = 'importance',
             title = "Feature Importance",
             color = 'feature',
            height = 800,width = 1000)
fig.show()

Best Parameters:  {'criterion': 'mse', 'max_depth': 15, 'min_samples_split': 4, 'n_estimators': 15} 

 Mean cross-validated score of the best_estimator:  0.9883906593678142 



In [399]:
best_forest.score(X,y)

0.9997484537340942

In [400]:
best_forest.score(X_test,y_test)

0.9463591913020061

In [401]:
cv_results = pd.DataFrame(grid_search_forest.cv_results_)
cv_results.loc[:,'mean_train-mean_test'] = cv_results.loc[:,'mean_train_score'] - cv_results.loc[:,'mean_test_score']
fig = px.line(data_frame = cv_results, x = 'param_max_depth', y = 'mean_test_score', color = 'param_n_estimators')
fig.show()
fig = px.line(data_frame = cv_results, x = 'param_max_depth', y = ['mean_train-mean_test'], color = 'param_n_estimators')
fig.show()


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('std_train_score'), which will not be available by defa

In [424]:
forest_predictions = best_forest.predict(X_test)

residuals = (y_test - forest_predictions)/y_test

residuals = pd.DataFrame(residuals)

residuals.loc[:,'predicted_values'] = forest_predictions


fig = px.scatter(data_frame = residuals, x = 'predicted_values',y = 'zori_ssa',title = "residual plot" )

#calculate RMSE
print('RMSE: ',sqrt(mean_squared_error(y_test, forest_predictions)))

fig.show()


RMSE:  58.707404974173116




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



# Backtest

In [471]:
city_zips = pd.read_csv(r'C:\Users\robla\Desktop\nycdsa\Capstone\data_dump\city_zip.csv', index_col = 0)
city_zips.columns = ['metro','zip_code']

In [449]:
Backtest.shape

(2568, 116)

In [450]:
city_zips.shape

(215, 2)

In [451]:
Backtest['zori_ssa_prediction'] = forest_predictions
Backtest['percent_error'] = (Backtest['zori_ssa'] - Backtest['zori_ssa_prediction'])/ Backtest['zori_ssa']
Backtest = pd.merge(Backtest,city_zips,how = 'left', on = 'zip_code')

In [472]:
Backtest.sort_values(by = 'percent_error',ascending = False)[['Time','metro',
                                                              'zip_code','zori_ssa',
                                                              'zori_ssa_prediction',
                                                              'percent_error']].iloc[:1000,:]['metro'].value_counts()

dfw            555
austin         180
houston        138
san_antonio    109
el_paso         18
Name: metro, dtype: int64

In [513]:
fig = px.line(data_frame = Backtest.loc[Backtest['metro'] == 'dfw'],x = 'Time',y = 'percent_error', color = 'zip_code',title = 'Dallas-Fort Worth ZORI Backtest',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [514]:
fig = px.box(data_frame = Backtest.loc[Backtest['metro'] == 'dfw'],x = 'Time',y = 'percent_error',title = 'Dallas-Fort Worth ZORI Backtest %error Boxplots vs. Time',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [511]:
fig = px.line(data_frame = Backtest.loc[Backtest['metro'] == 'austin'],x = 'Time',y = 'percent_error', color = 'zip_code',title = 'Austin ZORI Backtest',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [515]:
fig = px.box(data_frame = Backtest.loc[Backtest['metro'] == 'austin'],x = 'Time',y = 'percent_error',title = 'Austin ZORI Backtest %error Boxplots vs. Time',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [517]:
fig = px.line(data_frame = Backtest.loc[Backtest['metro'] == 'houston'],x = 'Time',y = 'percent_error', color = 'zip_code',title = 'Houston ZORI Backtest',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [518]:
fig = px.box(data_frame = Backtest.loc[Backtest['metro'] == 'houston'],x = 'Time',y = 'percent_error',title = 'Houston ZORI Backtest %error Boxplots vs. Time',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [485]:
fig = px.line(data_frame = Backtest.loc[Backtest['metro'] == 'el_paso'],x = 'Time',y = 'percent_error', color = 'zip_code',title = 'El Paso ZORI Backtest',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [519]:
fig = px.box(data_frame = Backtest.loc[Backtest['metro'] == 'el_paso'],x = 'Time',y = 'percent_error',title = 'EL Paso ZORI Backtest %error Boxplots vs. Time',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [505]:
fig = px.line(data_frame = Backtest.loc[Backtest['metro'] == 'san_antonio'],x = 'Time',y = 'percent_error', color = 'zip_code',title = 'San Antonio ZORI Backtest',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

In [520]:
fig = px.box(data_frame = Backtest.loc[Backtest['metro'] == 'san_antonio'],x = 'Time',y = 'percent_error',title = 'San Antonio ZORI Backtest %error Boxplots vs. Time',
             hover_data = ['metro','zip_code','percent_error'])
fig.show()

# Forecast

In [455]:
forest_forecast = best_forest.predict(X_forecast)

Outlook = post_train.loc[post_train['Time']>datetime.datetime(2021,7,2),:]

Outlook['zori_ssa'] = forest_forecast



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [415]:
fig = px.line(data_frame = Outlook,x = 'Time',y = 'zori_ssa', color = 'zip_code',title = 'ZORI Year Out Forecast')
fig.show()