In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, Normalizer, MinMaxScaler

from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.feature_selection import RFECV

from datetime import datetime

pd.set_option('display.max_rows', 5000)

<H1>External feature integration

In [2]:
df=pd.read_csv('../data/zri_acs_bikeshare_merged.csv')

In [3]:
df['year-month']=df['datetime'].apply(lambda x: x[0:7])

In [4]:
df.columns

Index(['zip', 'City', 'State', 'Metro', 'CountyName', 'zri', 'year', 'month',
       'datetime', 'percent_white', 'percent_black', 'percent_asian',
       'percent_hispanic', 'percent_native_am', 'percent_other_race',
       'percent_0_17', 'percent_18_39', 'percent_40_64', 'percent_65+',
       'percent_rental_units_vacant', 'percent_rental_units_occupied',
       'percent_graduate_deg', 'percent_bachelors', 'percent_associates',
       'percent_highschool', 'percent_less_highschool',
       'percent_commute_public_transport', 'percent_commute_less_30',
       'percent_buildings_less_10_units', 'percent_buildings_10_19_units',
       'percent_buildings_20_49_units', 'percent_buildings_50+_units',
       'percent_commute_30_to_59', 'percent_commute_60_to_89',
       'percent_commute_90_more', 'percent_new_city', 'percent_new_unit',
       'percent_units_owner_occupied', 'median_building_age',
       'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed'

<H1>Standardization

In [5]:
#df=df.drop('zip', axis=1)

for zipcode in df['zip'].unique():
    globals()[f"minmax_{zipcode}"]=MinMaxScaler(copy=False)
    df_filtered=df[df['zip']==zipcode]
    df.loc[df['zip']==zipcode,'zri']=globals()[f"minmax_{zipcode}"].fit_transform(df_filtered[['zri']])

In [6]:
# for colval in ['percent_white', 'percent_black', 'percent_asian',
#        'percent_hispanic', 'percent_native_am', 'percent_other_race',
#        'percent_0_17', 'percent_18_39', 'percent_40_64', 'percent_65+',
#        'percent_rental_units_vacant', 'percent_rental_units_occupied',
#        'percent_graduate_deg', 'percent_bachelors', 'percent_associates',
#        'percent_highschool', 'percent_less_highschool',
#        'percent_commute_public_transport', 'percent_commute_less_30',
#        'percent_buildings_less_10_units', 'percent_buildings_10_19_units',
#        'percent_buildings_20_49_units', 'percent_buildings_50+_units',
#        'percent_commute_30_to_59', 'percent_commute_60_to_89',
#        'percent_commute_90_more', 'percent_new_city', 'percent_new_unit',
#        'percent_units_owner_occupied', 'median_building_age',
#        'income_per_capita', 'poverty_rate', 'total_pop',
#        'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
#        'percent_female', 'gini_index', 'percent_not_us_citizen',
#        'bs_total_stations', 'bs_total_systems', 'has_bike_sharing']:
#     for zipcode in df['zip'].unique():
#         globals()[f"minmax_{zipcode}_{colval}"]=MinMaxScaler(copy=False)
#         df_filtered=df[df['zip']==zipcode]
#         df.loc[df['zip']==zipcode,f'{colval}']=globals()[f"minmax_{zipcode}_{colval}"].fit_transform(df_filtered[[f'{colval}']])
        
        

In [7]:
# zri3['mean']=zri3['RegionName'].apply(lambda x: std_dict[x][0])
# zri3['std']=zri3['RegionName'].apply(lambda x: std_dict[x][1])

#zri3['ZRI_scaled']=(zri3['ZRI']-zri3['mean'])/zri3['std']
#zri3['ZRI_scaled']=zri3['zri']
#zri3=zri3.drop(['zri'],axis=1)
# zri3=zri3.drop(['mean','std'],axis=1)

<H1>Lag development

In [8]:
def laggenerator(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i]
    
    df.loc[:,f'{colname}_lag{i}']=df.loc[:,f'{colname}'].shift(i)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_lag{i}']=0
    return df

In [9]:
def laggenerator_diff(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+1}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+1)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+1}']=0
    return df

In [10]:
def laggenerator_diff6(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+6}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+6)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+6}']=0
    return df

In [11]:
def laggenerator_diff12(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+11]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+11}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+11)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+11}']=0
    return df

In [12]:
def laggenerator_MA(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+4]
    
    df.loc[:,f'{colname}_MA{i}']=df.loc[:,f'{colname}'].shift(i)-(df.loc[:,f'{colname}'].shift(i)+df.loc[:,f'{colname}'].shift(i+1)+df.loc[:,f'{colname}'].shift(i+2)+df.loc[:,f'{colname}'].shift(i+3)+df.loc[:,f'{colname}'].shift(i+4))/5
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_MA{i}']=0
    return df

In [13]:
def laggenerator_mean(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_mean{i}']=0
    for x in range(1,i):
        df.loc[:,f'{colname}_mean{i}']+=df.loc[:,f'{colname}'].shift(x)
    df.loc[:,f'{colname}_mean{i}']=df.loc[:,f'{colname}_mean{i}']/i
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_mean{i}']=0
    return df

In [14]:
def laggenerator_ewm(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_ewm{i}']=0
    weightsum=0
    for y in range(1,i):
        weight=i-y
        weightsum+=weight
    for x in range(1,i):
        weight=i-x
        df.loc[:,f'{colname}_ewm{i}']+=df.loc[:,f'{colname}'].shift(x)*weight/weightsum
    df.loc[:,f'{colname}_ewm{i}']=df.loc[:,f'{colname}_ewm{i}']
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_ewm{i}']=0
    return df

In [15]:
def laggenerator_min(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_min{i}']=999999
    for x in range(1,i):
        df.loc[:,f'{colname}_min{i}']=np.minimum(df.loc[:,f'{colname}'].shift(x),df.loc[:,f'{colname}_min{i}'])
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_min{i}']=0
    return df

In [16]:
def laggenerator_max(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_max{i}']=0
    for x in range(1,i):
        df.loc[:,f'{colname}_max{i}']=np.maximum(df.loc[:,f'{colname}'].shift(x),df.loc[:,f'{colname}_max{i}'])
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_max{i}']=0
    return df

In [17]:
def lag_generator_all(df, var):
    for i in range(1,6):   #14
        df=laggenerator(i, var,df)

    for i in range(1,6):  #11
        df=laggenerator_diff(i, var,df)

#     for i in range(1,6):
#         df=laggenerator_diff6(i, var,df)

    #for i in range(1,11):
     #   df=laggenerator_MA(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_mean(i, 'ZRI_scaled',df)

#     for i in range(1,2):
#         df=laggenerator_ewm(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_min(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_max(i, var,df)
    
    return df

def lag_generator_12M(df, var):
    df=laggenerator(12, var,df)
    return df

def lag_generator_dff_12M(df, var):
    df=laggenerator_diff12(1, var,df)
    return df


In [18]:
df.columns[5:43]

Index(['zri', 'year', 'month', 'datetime', 'percent_white', 'percent_black',
       'percent_asian', 'percent_hispanic', 'percent_native_am',
       'percent_other_race', 'percent_0_17', 'percent_18_39', 'percent_40_64',
       'percent_65+', 'percent_rental_units_vacant',
       'percent_rental_units_occupied', 'percent_graduate_deg',
       'percent_bachelors', 'percent_associates', 'percent_highschool',
       'percent_less_highschool', 'percent_commute_public_transport',
       'percent_commute_less_30', 'percent_buildings_less_10_units',
       'percent_buildings_10_19_units', 'percent_buildings_20_49_units',
       'percent_buildings_50+_units', 'percent_commute_30_to_59',
       'percent_commute_60_to_89', 'percent_commute_90_more',
       'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
       'median_building_age', 'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed'],
      dtype='object')

In [19]:
# for var in df.columns[5:43]:
#     df=lag_generator_all(df, var)



for feat in ['percent_white', 'percent_black',
        'percent_asian', 'percent_hispanic', 'percent_native_am',
        'percent_other_race', 'percent_0_17', 'percent_18_39', 'percent_40_64',
       'percent_65+', 'percent_rental_units_vacant',
       'percent_rental_units_occupied', 'percent_graduate_deg',
       'percent_bachelors', 'percent_associates', 'percent_highschool',
       'percent_less_highschool', 'percent_commute_public_transport',
       'percent_commute_less_30', 'percent_commute_30_to_59',
       'percent_commute_60_to_89', 'percent_commute_90_more',
       'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
       'median_building_age', 'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
       'percent_female', 'gini_index', 'percent_not_us_citizen',
       'bs_total_stations', 'bs_total_systems', 'has_bike_sharing','percent_buildings_less_10_units','percent_buildings_10_19_units','percent_buildings_20_49_units','percent_buildings_50+_units']:
    df=lag_generator_12M(df,feat)
    df=lag_generator_dff_12M(df,feat)

# df=df[['year-month','zip', 'City', 'State', 'Metro', 'CountyName', 'zri', 'year', 'month',
#        'datetime',
# 'percent_work_from_home','percent_white',
# 'percent_white_lag12',
# 'percent_native_am_lag12',
# 'percent_graduate_deg_lag12',
# 'percent_commute_60_to_89',
# 'percent_commute_less_30_lag12',
# 'percent_commute_less_30',
# 'percent_commute_public_transport',
# 'percent_commute_less_30_diff_lag1_lag12',
# 'percent_bachelors',
# 'percent_new_unit_lag12',
# 'percent_workforce_unemployed_diff_lag1_lag12']]   
    
#df=lag_generator_all(df, 'zri')
    
# df=df.drop(['percent_white', 'percent_black',
#         'percent_asian', 'percent_hispanic', 'percent_native_am',
#         'percent_other_race', 'percent_0_17', 'percent_18_39', 'percent_40_64',
#        'percent_65+', 'percent_rental_units_vacant',
#        'percent_rental_units_occupied', 'percent_graduate_deg',
#        'percent_bachelors', 'percent_associates', 'percent_highschool',
#        'percent_less_highschool', 'percent_commute_public_transport',
#        'percent_commute_less_30', 'percent_commute_30_to_59',
#        'percent_commute_60_to_89', 'percent_commute_90_more',
#        'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
#        'median_building_age', 'income_per_capita', 'poverty_rate', 'total_pop',
#        'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
#        'percent_female', 'gini_index', 'percent_not_us_citizen',
#        'bs_total_stations', 'bs_total_systems', 'has_bike_sharing','percent_buildings_less_10_units','percent_buildings_10_19_units','percent_buildings_20_49_units','percent_buildings_50+_units'],axis=1)

In [20]:
col_to_keep=['zip', 'City', 'State', 'Metro', 'CountyName', 'zri', 'year', 'month',
       'datetime','percent_white',
'zri_diff_lag1_lag2',
'zri_diff_lag3_lag4',
'zri_diff_lag4_lag5',
'percent_work_from_home',
'zri_diff_lag5_lag6',
'percent_white_lag12',
'percent_native_am_lag12',
'percent_graduate_deg_lag12',
'percent_commute_60_to_89',
'percent_commute_less_30_lag12',
'percent_commute_less_30',
'percent_commute_public_transport',
'percent_commute_less_30_diff_lag1_lag12',
'zri_lag1',
'percent_bachelors',
'percent_new_unit_lag12',
'percent_workforce_unemployed_diff_lag1_lag12',]



In [21]:
df.shape

(93672, 136)

In [22]:
train=df[(df['year-month']<='2018-12') & (df['year-month']>'2015-01')]

train=train.drop('year-month',axis=1)

Y_train=train['zri']
X_train=train.drop('zri',axis=1)


In [23]:
labelencoder = LabelEncoder()
labelencoder2 = LabelEncoder()
labelencoder3 = LabelEncoder()
labelencoder4 = LabelEncoder()
enc=OneHotEncoder()
enc2=OneHotEncoder()

# enc2_df = pd.DataFrame(enc2.fit_transform(X_train[['zip']]).toarray())
# X_train = X_train.join(enc2_df, rsuffix='1')
# X_train=X_train.fillna(0)

X_train=X_train.drop(['zip','year','month','datetime','City','State','Metro','CountyName'],axis=1)
cols=X_train.columns

#X_train = X_train.join(norm_df, rsuffix='1')
#X_train=X_train.fillna(0)



In [24]:
# fold=TimeSeriesSplit(n_splits=5)

# linmod=RandomForestRegressor(n_estimators=500)

# params=({
#      'min_samples_split':[20,30,40], 
#         'max_features':[0.9],
#     'min_samples_leaf':[15,20,30]
#  })

# grid=GridSearchCV(linmod, param_grid=params, cv=fold)
# grid.fit(X_train, Y_train)

# grid.best_params_

In [25]:
# fold=TimeSeriesSplit(n_splits=5)

# linmod=Lasso(max_iter=1000)

# params=({
#      'alpha':np.linspace(0.00000000001,0.000001,20), 
#  })

# grid=GridSearchCV(linmod, param_grid=params, cv=fold)
# grid.fit(X_train, Y_train)

# grid.best_params_

In [26]:
# fold=TimeSeriesSplit(n_splits=5)

# kr_model=Lasso(alpha=1e-06)

# rfe_grid=RFECV(estimator=kr_model, cv=fold, min_features_to_select=1, scoring='r2')
# rfe_grid.fit(X_train, Y_train)

In [27]:
# feature_ranking=pd.DataFrame({'Feature':cols,'Importance':rfe_grid.ranking_}).sort_values('Importance')
# feature_ranking



In [28]:
#kr_model=KernelRidge(kernel='chi2',gamma=0.0001)   #gamma=0.00000001

#{'degree': 3, 'gamma': 1.0000000230258512, 'kernel': 'linear'}
#kr_model=KernelRidge(kernel='linear',gamma=1, degree=3)   #gamma=0.00000001

#kr_model=KernelRidge(kernel='polynomial',gamma=1e-06,degree=3)

#kr_model=SVR(kernel='poly',gamma=0.000001,degree=3)
#kr_model=Lasso(alpha=100)   #alpha=10

#kr_model=RandomForestRegressor(min_samples_split=15, max_features=0.9, n_estimators=500, min_samples_leaf=30)
#8, 16

kr_model=Lasso(alpha=1e-06)

%time kr_model.fit(X_train, Y_train)

CPU times: user 24.2 s, sys: 369 ms, total: 24.6 s
Wall time: 3.45 s


  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=1e-06)

In [29]:
#RF: 2 features: 15 seconds

In [30]:
kr_model.score(X_train, Y_train)

0.23862363875912462

In [31]:
feature_importances=pd.DataFrame({'Feature':cols,'Importance':kr_model.coef_}).sort_values('Importance', ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
53,percent_other_race_diff_lag1_lag12,2.071819
45,percent_black_diff_lag1_lag12,1.24358
25,percent_commute_90_more,1.214872
83,percent_commute_60_to_89_diff_lag1_lag12,1.155684
89,percent_new_unit_diff_lag1_lag12,1.069379
100,percent_workforce_unemployed_lag12,0.9149661
87,percent_new_city_diff_lag1_lag12,0.8773566
51,percent_native_am_diff_lag1_lag12,0.7659121
61,percent_65+_diff_lag1_lag12,0.7483893
24,percent_commute_60_to_89,0.6081697


In [32]:
# feature_importances=pd.DataFrame({'Feature':cols,'Importance':kr_model.feature_importances_}).sort_values('Importance', ascending=False)
# feature_importances

In [33]:
#pd.DataFrame({'Feature':X_train.columns,'Coefs':kr_model.coef_}).sort_values('Coefs')

In [34]:
Y_test=df[(df['year-month']>'2018-12') & (df['year-month']<='2019-12')][['zri']]
predictor_table=df.copy()
X_test=predictor_table[predictor_table['year-month']>'2018-12']

#loop through all month in 2019
for month in list(X_test['year-month'].drop_duplicates()):
    
    #run prediction for one month
    X_test=X_test[X_test['year-month']==month]
    
#     enc2_df2 = pd.DataFrame(enc2.transform(X_test[['zip']]).toarray())
#     X_test = X_test.join(enc2_df2, rsuffix='1')
#     X_test=X_test.fillna(0)    
    
    X_test=X_test.drop(['zip','year','month','datetime','City','State','Metro','CountyName','year-month','zri'],axis=1)
    #X_test = pd.DataFrame(normal.transform(X_test))
   
    val=kr_model.predict(X_test)

    #write current month prediction into predictor_table

    predictor_table.loc[predictor_table['year-month']==month,'zri']=val
        
    #predictor_table=lag_generator_all(predictor_table, 'zri')
#     for var in predictor_table.columns[5:43]:
#         predictor_table=lag_generator_all(predictor_table, var)
        
    X_test=predictor_table[predictor_table['year-month']>'2018-12']



In [35]:
#Y_test['ZRI_Yest_scaled']=Y_test['ZRI_scaled']
rstable=X_test[['zip','year','month','datetime','City','State','Metro','CountyName','year-month','zri']].merge(Y_test[['zri']], left_index=True, right_index=True)


In [36]:
rstable

Unnamed: 0,zip,year,month,datetime,City,State,Metro,CountyName,year-month,zri_x,zri_y
60,1013,2019,1,2019-01-01,Chicopee,MA,Springfield,Hampden County,2019-01,0.620039,0.692308
61,1013,2019,2,2019-02-01,Chicopee,MA,Springfield,Hampden County,2019-02,0.613276,0.688259
62,1013,2019,3,2019-03-01,Chicopee,MA,Springfield,Hampden County,2019-03,0.606513,0.720648
63,1013,2019,4,2019-04-01,Chicopee,MA,Springfield,Hampden County,2019-04,0.599749,0.753036
64,1013,2019,5,2019-05-01,Chicopee,MA,Springfield,Hampden County,2019-05,0.592986,0.761134
...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,2019,8,2019-08-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-08,0.440765,0.491713
93668,99654,2019,9,2019-09-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-09,0.446373,0.458564
93669,99654,2019,10,2019-10-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-10,0.451980,0.613260
93670,99654,2019,11,2019-11-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-11,0.457587,0.406077


In [37]:
for zipcode in rstable['zip'].unique():
    rstable_filtered=rstable[rstable['zip']==zipcode]
    rstable.loc[rstable['zip']==zipcode,'zri_x']=globals()[f"minmax_{zipcode}"].inverse_transform(rstable_filtered[['zri_x']])
    rstable.loc[rstable['zip']==zipcode,'zri_y']=globals()[f"minmax_{zipcode}"].inverse_transform(rstable_filtered[['zri_y']])   
    

In [38]:
RSME=(sum((rstable['zri_x']-rstable['zri_y'])**2)/len(rstable))**0.5
RSME

96.31187923536882

In [39]:
#51.29194574107712 (max with autoregressive)(random forest)
#51.532600992379024 (best with autoregressive, high leaf sizes)(random forest)
#52.... (RF with race (WIP))
#--------------------------------------
#56.2454620290386 (autoregressive Lasso)
#56.20530931549105 (race Lasso)
#--------------------------------------
#101.8813259385975 (Lasso, best, non-auto)
#96.31187923536882 (Lasso, all, non-auto))


In [47]:
rstable['residual']=rstable['zri_x']-rstable['zri_y']

In [48]:
rstable['residual_squared']=rstable['residual'].map(lambda x: x**2)

In [49]:
rstable2=rstable.groupby('State').agg('mean').sort_values('residual_squared')
rstable2=rstable2.reset_index()

In [50]:
rstable

Unnamed: 0,zip,year,month,datetime,City,State,Metro,CountyName,year-month,zri_x,zri_y,residual,residual_squared
60,1013,2019,1,2019-01-01,Chicopee,MA,Springfield,Hampden County,2019-01,1081.149740,1099.0,-17.850260,318.631765
61,1013,2019,2,2019-02-01,Chicopee,MA,Springfield,Hampden County,2019-02,1079.479178,1098.0,-18.520822,343.020854
62,1013,2019,3,2019-03-01,Chicopee,MA,Springfield,Hampden County,2019-03,1077.808615,1106.0,-28.191385,794.754178
63,1013,2019,4,2019-04-01,Chicopee,MA,Springfield,Hampden County,2019-04,1076.138053,1114.0,-37.861947,1433.527066
64,1013,2019,5,2019-05-01,Chicopee,MA,Springfield,Hampden County,2019-05,1074.467490,1116.0,-41.532510,1724.949396
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,2019,8,2019-08-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-08,1267.778497,1277.0,-9.221503,85.036120
93668,99654,2019,9,2019-09-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-09,1268.793440,1271.0,-2.206560,4.868908
93669,99654,2019,10,2019-10-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-10,1269.808383,1299.0,-29.191617,852.150520
93670,99654,2019,11,2019-11-01,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-11,1270.823326,1261.5,9.323326,86.924401


In [51]:
fig = px.choropleth(rstable2,
                    locations='State',
                    color='residual_squared',
                    color_continuous_scale='Viridis',
                    hover_name='State',
                    locationmode='USA-states',
                    scope='usa')

fig.show()

In [53]:
# filename='Lasso-nonauto-allfeat'
# feature_importances.to_csv(f'models/features_{filename}.csv')
# rstable.to_csv(f'models/residuals_{filename}.csv')

In [46]:
testmonths=list(zri3[(zri3['year-month']>'2018-12') & (zri3['year-month']<='2019-12')]['year-month'].drop_duplicates())

R2_test=zri3[(zri3['year-month']>'2018-12') & (zri3['year-month']<='2019-12')][['ZRI_scaled','year-month']]
R2_test['ZRI_result']=0

for i,month in enumerate(testmonths):
    R2_test.loc[R2_test['year-month']==month,'ZRI_result']=list(zri3.loc[zri3['year-month']=='2018-12','ZRI_scaled'].apply(lambda x: x*(1+i*0.002)))

RSME_null=(sum((R2_test['ZRI_scaled']-R2_test['ZRI_result'])**2)/len(R2_test))**0.5
RSME_null


NameError: name 'zri3' is not defined

In [None]:
zips=rstable['RegionName'].unique()[0:10]

In [None]:
px.line(rstable[rstable['RegionName'].isin(zips)], y=['ZRI_scaled_x','ZRI_scaled_y'],x='year-month', color='RegionName')

In [None]:
zips=zri3['RegionName'].unique()[0:5]
px.line(zri3[zri3['RegionName'].isin(zips)], y=['ZRI_scaled'],x='year-month', color='RegionName')