In [84]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pmdarima as pm
from pmdarima.utils import diff
from sklearn.metrics import mean_squared_error
import pickle
import sklearn
import statsmodels.tsa.statespace as sm
from sklearn.preprocessing import StandardScaler

In [85]:
# Let pandas print up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [86]:
# Load the unscaled dataset and index it by observation date.
df = pd.read_csv('./../data/full_dataset_unscaled.csv')
# Restore leading zeros on ZIP codes. zfill pads to the full five characters,
# so a value read as 501 becomes '00501' (prepending a single '0' would leave
# it as the four-character '0501').
df['zip'] = df['zip'].astype(str).str.zfill(5)
# The 'datetime' column holds 'YYYY-MM-DD' strings; parse them in one
# vectorized call instead of row by row.
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d')
df = df.set_index('datetime')

In [87]:
# Columns removed from the dataframe before the collinearity analysis.
dropcols = [
    'percent_other_race',
    'has_bike_sharing',
    'bs_total_systems',
    'percent_40_64',
    'percent_0_17',
    'percent_18_39',
    'percent_65+',
    'percent_rental_units_vacant',
    'percent_not_us_citizen',
    'percent_less_highschool',
    'percent_buildings_less_10_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more',
    'percent_commute_less_30',
    'percent_graduate_deg',
    'percent_female',
    'gini_index',
    'percent_hispanic',
    'percent_black',
    'percent_bachelors',
    'percent_asian',
    'percent_new_city',
    'percent_new_unit',
]

In [88]:
# Remove every column named in dropcols (raises KeyError if one is missing).
df = df.drop(labels=dropcols, axis='columns')

In [89]:
# Keep only the rows whose 'year' is 2015 or later.
df = df.loc[df['year'].ge(2015)]

In [90]:
# Candidate predictor frame: df without the listed key/label columns.
# NOTE(review): 'zri' is presumably the modelling target — confirm.
x = df.drop(
    labels=['zip', 'year', 'month', 'City', 'State', 'Metro', 'CountyName', 'zri'],
    axis='columns',
)

In [91]:
# Feature-name groups. Every group is a set of column names so that groups can
# be combined or trimmed with ordinary set operations.
# NOTE(review): the prefixes (acs / bds / trends) presumably name the data
# source each column came from — confirm against the dataset build step.
acs_cols = {
    'percent_white',
    'percent_black',
    'percent_asian',
    'percent_hispanic',
    'percent_native_am',
    'percent_other_race',
    'percent_0_17',
    'percent_18_39',
    'percent_40_64',
    'percent_65+',
    'percent_rental_units_vacant',
    'percent_rental_units_occupied',
    'percent_graduate_deg',
    'percent_bachelors',
    'percent_associates',
    'percent_highschool',
    'percent_less_highschool',
    'percent_commute_public_transport',
    'percent_commute_less_30',
    'percent_buildings_less_10_units',
    'percent_buildings_10_19_units',
    'percent_buildings_20_49_units',
    'percent_buildings_50+_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more',
    'percent_new_city',
    'percent_new_unit',
    'percent_units_owner_occupied',
    'median_building_age',
    'income_per_capita',
    'poverty_rate',
    'total_pop',
    'percent_workforce_unemployed',
    'percent_work_from_home',
    'median_age',
    'percent_female',
    'gini_index',
    'percent_not_us_citizen',
}
bikeshare_cols = {
    'bs_total_stations',
    'bs_total_systems',
    'has_bike_sharing',
}
trends_cols = {
    'gun range',
    'gun control',
    'gun violence',
    'job opportunities',
    'unemployment',
    'retirement',
    'layoff',
    'lgbt',
    'same sex marriage',
    'they',
    'pronouns',
    'black lives matter',
    'political correctness',
    'make america great again',
    'euthanasia',
    'getaway',
    'places to go',
    'flight tickets',
    'twitter',
    'hashtag',
    'fake news',
    'hurricane',
    'wildfire',
    'flood',
    'fire',
    "trader joe's",
    'whole foods',
    'lululemon',
    'thrift',
    'condos for rent',
    'duplex apartments for rent',
    'townhomes for rent',
    'townhouses for rent',
    'home for rent',
    'house for rent',
    'townhome for rent',
    'townhouse for rent',
    'apartment for rent',
    'studio for rent',
    '1 bedroom for rent',
    '3 bedroom for rent',
    'starbucks',
}
bds_cols = {
    'total_firms',
    'job_creation_rate',
    'job_destruction_rate',
    'startup_firms',
}
# A parenthesised string without a trailing comma is just a str, so
# list(tax_cols) would yield single characters. Use a one-element set so this
# group behaves like the others.
tax_cols = {'state_local_perc'}

In [92]:
# Remove from every feature group the columns listed in dropcols, so each
# group only names columns that were not dropped.
acs_cols = acs_cols.difference(dropcols)
bds_cols = bds_cols.difference(dropcols)
trends_cols = trends_cols.difference(dropcols)
# This must be a set difference as well; a chained assignment here would
# replace the bikeshare group with the dropped-column names themselves.
bikeshare_cols = bikeshare_cols.difference(dropcols)

In [93]:
# Restrict the predictors to the columns named in acs_cols. Iteration order of
# a set of strings can differ between kernel sessions (string hashing is
# randomized), so sort the names to keep the column order of x2 — and of every
# table derived from it — reproducible.
x2 = x.loc[:, sorted(acs_cols)]

In [94]:
# One row per column of x2: the column name and its variance inflation factor
# computed against the remaining columns of x2.
# NOTE(review): x2 is passed as-is, with no intercept column appended here —
# confirm whether a constant should be added before computing VIFs.
vif_df2 = pd.DataFrame({
    'feature': x2.columns,
    'vif': [
        variance_inflation_factor(x2.values, col_idx)
        for col_idx, _ in enumerate(x2.columns)
    ],
})

In [95]:
# Display the per-feature VIF table.
vif_df2

Unnamed: 0,feature,vif
0,percent_highschool,26.605328
1,percent_rental_units_occupied,205.045894
2,percent_native_am,1.290787
3,percent_buildings_10_19_units,6.317504
4,percent_workforce_unemployed,17.142018
5,percent_buildings_50+_units,5.946151
6,percent_work_from_home,10.242374
7,median_age,115.802069
8,percent_white,11.645166
9,percent_buildings_20_49_units,5.382392


In [96]:
# Average VIF across all features in the table.
vif_df2['vif'].mean()

32.85423085910149