In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load main frame

In [2]:
# Read the combined variables table. The first CSV column becomes the row
# index and the 'Time' column is parsed into datetime values.
all_vars = pd.read_csv(
    'all_vars.csv',
    index_col=0,
    parse_dates=['Time'],
)

In [3]:
## Configuration: name of the column in `all_vars` that the model predicts.
target_name: str = 'zori_ssa'

In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
all_vars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19780 entries, 0 to 19779
Data columns (total 80 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Time                                    19780 non-null  datetime64[ns]
 1   zip_code                                19780 non-null  int64         
 2   zori_ssa                                19780 non-null  float64       
 3   mintempC                                19780 non-null  float64       
 4   maxtempC                                19780 non-null  float64       
 5   precipMM                                19780 non-null  float64       
 6   env_violation_count                     19780 non-null  float64       
 7   num_daycare_permit                      19780 non-null  float64       
 8   daycare_childcare_subsidies_ratio       19780 non-null  float64       
 9   daycare_corrective_action_ratio         19780 non-

In [5]:
## Split the main frame into the target series and two dtype-based views.
# NOTE: the two filters are not complementary. Datetime columns are selected
# into `numerical_features` and are also not excluded from
# `categorical_features`, so 'Time' appears in both views.
# `numerical_features` also still contains the target column itself.
target = all_vars[target_name]
numerical_features = all_vars.select_dtypes(include=['datetime', 'float64', 'int64'])
categorical_features = all_vars.select_dtypes(exclude=['float64', 'int64'])

In [6]:
# List the columns that were not selected as float64/int64 (includes the datetime 'Time' column).
categorical_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19780 entries, 0 to 19779
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    19780 non-null  datetime64[ns]
 1   State   19780 non-null  object        
 2   metro   19780 non-null  object        
 3   county  19780 non-null  object        
 4   city    19780 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 927.2+ KB


In [7]:
# List the datetime/float64/int64 columns selected as numerical features.
numerical_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19780 entries, 0 to 19779
Data columns (total 76 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Time                                    19780 non-null  datetime64[ns]
 1   zip_code                                19780 non-null  int64         
 2   zori_ssa                                19780 non-null  float64       
 3   mintempC                                19780 non-null  float64       
 4   maxtempC                                19780 non-null  float64       
 5   precipMM                                19780 non-null  float64       
 6   env_violation_count                     19780 non-null  float64       
 7   num_daycare_permit                      19780 non-null  float64       
 8   daycare_childcare_subsidies_ratio       19780 non-null  float64       
 9   daycare_corrective_action_ratio         19780 non-

## First multiple linear regression model

In [8]:

# housing_df_standard_scale=pd.DataFrame(StandardScaler().fit_transform(housing_df))

In [10]:
# Remove the columns that must not be used as predictors (timestamp and zip
# code). errors='ignore' keeps this cell idempotent: it reassigns
# `numerical_features`, so without it a second run of the cell would raise
# KeyError because the columns are already gone.
numerical_features = numerical_features.drop(columns=['Time', 'zip_code'], errors='ignore')

# Standardise every remaining column (the target included) to zero mean and
# unit variance.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test-set statistics influence the training features.
# Fitting the scaler on the training partition only would avoid that leakage.
scaler = StandardScaler()
scaled_frame = pd.DataFrame(
    scaler.fit_transform(numerical_features),
    columns=numerical_features.columns,
    # fit_transform returns a bare array; carry the original row labels over
    # so the scaled rows stay aligned with `all_vars`.
    index=numerical_features.index,
)

In [11]:
# Display the standardised frame (the rendered output elides the middle rows and columns).
scaled_frame

Unnamed: 0,zori_ssa,mintempC,maxtempC,precipMM,env_violation_count,num_daycare_permit,daycare_childcare_subsidies_ratio,daycare_corrective_action_ratio,daycare_adverse_action_ratio,Consumer Confidence Index TX,...,taxpayer_org_type_is,taxpayer_org_type_foreign,payer_outlet_same_zipcode,liquor_permit_start,sales_tax_rate,total_sales_tax,total_sales_tax_last_year,per_diff_total_sales_tax,sap_case_shiller_index,delta_sap_case_shiller_index
0,0.205330,-1.545399,-1.358940,-0.939263,0.932785,7.982252,8.049586,-0.00711,-0.010056,-1.223551,...,0.020580,-0.126624,0.895862,-0.401273,-0.771966,-0.934484,-0.918745,-0.451103,-1.093466,0.649609
1,0.193462,-1.205614,-1.157713,-0.100263,-0.041066,-0.170852,-0.127831,-0.00711,-0.010056,-0.798766,...,-0.603763,-0.126624,-0.051993,-0.401273,-0.771966,-0.910030,-0.901654,0.517107,-1.077216,0.594480
2,0.181594,-0.843778,-0.755874,-0.444835,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,-0.979481,...,-0.603763,-0.126624,-0.431135,-0.401273,-0.771966,-0.939245,-0.923486,-0.468218,-1.034719,0.359392
3,0.169727,-0.202341,-0.041097,-0.631768,0.251089,-0.170852,-0.127831,-0.00711,-0.010056,-0.312321,...,-0.291591,-0.126624,0.327149,5.306133,-0.771966,-0.940555,-0.925830,-0.267730,-0.977326,0.164699
4,0.161815,0.292848,0.424413,0.272503,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,0.336428,...,-0.603763,-0.126624,0.327149,1.641719,-0.771966,-0.927831,-0.912313,-0.403834,-0.922061,-0.021915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19775,0.818497,-0.478935,-0.355266,-0.997085,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,0.003287,...,0.332751,-0.126624,0.137578,-0.401273,-0.143126,0.633219,0.670012,-0.440508,1.632886,2.802160
19776,0.858056,-0.163674,0.110244,-0.218712,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,0.120179,...,0.644922,-0.126624,0.327149,-0.401273,-0.143126,0.414296,0.564036,-1.060717,1.815834,3.374199
19777,0.897616,0.475269,0.389952,1.289012,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,0.353962,...,1.581436,-0.126624,0.327149,-0.401273,-0.143126,1.346282,0.810101,2.246521,1.999201,4.075030
19778,0.937175,0.957672,0.960424,-0.087088,-0.138451,-0.170852,-0.127831,-0.00711,-0.010056,0.447476,...,-0.291591,-0.126624,-0.620706,-0.401273,-0.143126,0.928245,0.490089,2.253856,2.178731,4.747029


In [12]:
# Separate the standardised target column from the predictors.
# The predictor matrix drops the column named by `target_name` rather than a
# hard-coded label, so changing the target in the configuration cell cannot
# leave the new target inside X (which would leak the answer to the model).
y = scaled_frame[target_name]
X = scaled_frame.drop(columns=[target_name])

In [13]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# fit() returns the estimator itself, so construction and fitting are chained.
lm = LinearRegression().fit(X_train, y_train)

# score() reports the coefficient of determination (R^2) on each partition.
r2_train = lm.score(X_train, y_train)
r2_test = lm.score(X_test, y_test)
print(f'train_score: {r2_train}')
print(f'test_score: {r2_test}')

train_score: 0.23589181089194566
test_score: 0.21079698925917212


In [16]:
# Rank the fitted coefficients by magnitude. Since the predictors were
# standardised, the absolute value is used as the ordering key.
coef_table = (
    pd.DataFrame({'Feature': X.columns, 'Coef': lm.coef_})
    .assign(AbsVal=lambda frame: frame['Coef'].abs())
    .sort_values('AbsVal', ascending=False)
)

# Keep and display the ten largest-magnitude coefficients.
top10 = coef_table.iloc[:10]
top10

Unnamed: 0,Feature,Coef,AbsVal
48,tx_is_worse,-1441243000000.0,1441243000000.0
44,tx_is_a_little_worse,961408000000.0,961408000000.0
45,tx_is_a_lot_worse,594705900000.0,594705900000.0
37,monthly_avg_gas_price,18.51607,18.51607
24,Retail Gasoline Price TX,-18.38947,18.38947
29,Nonfarm Employment Texas,-12.12122,12.12122
13,Nonfarm Employment TX,11.8454,11.8454
49,fatalities,-0.7999325,0.7999325
50,caseCount,0.7862346,0.7862346
1,maxtempC,-0.668868,0.668868


#### With all numerical features included, the train score (R²) is 23.5892% and the test score (R²) is 21.0797%