# Regressions


In [1]:
import copy
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import datasets, models, transforms
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
from scipy import stats

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

import util

In [2]:
# Load the merged tract-level table and keep only its first `size` rows.
size = 18491
df_ = pd.read_csv("data_process/df_merged_tract_large.csv")
df = df_.iloc[:size]

In [3]:
# linear regression
def Linear_eval(x_train_, x_test_, y_train_, y_test_):
    """Fit an unregularized OLS model on the training split and score both splits.

    Parameters
    ----------
    x_train_, x_test_
        Design matrices, passed unchanged to ``sm.OLS`` and ``predict``.
    y_train_, y_test_
        Targets for the corresponding splits.

    Returns
    -------
    tuple
        ``(fitted_results, train_mse, test_mse, train_r2, test_r2)``.
    """
    fitted = sm.OLS(y_train_, x_train_).fit()

    # Predict once per split and reuse the predictions for both metrics.
    pred_train = fitted.predict(x_train_)
    pred_test = fitted.predict(x_test_)

    return (
        fitted,
        mean_squared_error(y_train_, pred_train),
        mean_squared_error(y_test_, pred_test),
        r2_score(y_train_, pred_train),
        r2_score(y_test_, pred_test),
    )

In [4]:
# linear regression with regularization
def Linear_reg_eval(x_train_, x_test_, y_train_, y_test_, method, alpha, L1_wt):
    """Fit a regularized OLS model on the training split and score both splits.

    Parameters
    ----------
    x_train_, x_test_
        Design matrices, passed unchanged to ``sm.OLS`` and ``predict``.
    y_train_, y_test_
        Targets for the corresponding splits.
    method, alpha, L1_wt
        Forwarded unchanged to ``OLS.fit_regularized``.

    Returns
    -------
    tuple
        ``(fitted_results, train_mse, test_mse, train_r2, test_r2)``.
    """
    fitted = sm.OLS(y_train_, x_train_).fit_regularized(
        method=method, alpha=alpha, L1_wt=L1_wt
    )

    # Predict once per split and reuse the predictions for both metrics.
    pred_train = fitted.predict(x_train_)
    pred_test = fitted.predict(x_test_)

    return (
        fitted,
        mean_squared_error(y_train_, pred_train),
        mean_squared_error(y_test_, pred_test),
        r2_score(y_train_, pred_train),
        r2_score(y_test_, pred_test),
    )

In [10]:
def initialize_data_linear_reg(df, BE_var, output_var, input_var, size, input_structure):
    """Build shuffled train/test design matrices and targets for one regression.

    The first ``size`` row positions are shuffled with a fixed seed and split
    80/20 into train and test. A constant column is added to the regressors
    with ``sm.add_constant``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the output column and the candidate input columns.
    BE_var : list of str
        Columns used as regressors when ``input_structure`` is
        ``'BE (NHTS) linear'``.
    output_var : str
        Name of the target column.
    input_var : list of str
        Columns used as regressors when ``input_structure`` is
        ``'SD (NHTS) linear'``.
    size : int
        Number of leading rows of ``df`` to shuffle and split.
    input_structure : str
        ``'BE (NHTS) linear'`` or ``'SD (NHTS) linear'``.

    Returns
    -------
    tuple
        ``(x_train_, x_test_, y_train_, y_test_)``, all cast to float32 before
        the constant column is added to the regressors.

    Raises
    ------
    ValueError
        If ``input_structure`` is not one of the two supported values.
    """
    # Map each supported structure to the column list it regresses on.
    columns_by_structure = {
        'BE (NHTS) linear': BE_var,
        'SD (NHTS) linear': input_var,
    }
    # Fail early with a clear message instead of an UnboundLocalError at the
    # return statement when the structure name is unknown.
    if input_structure not in columns_by_structure:
        raise ValueError(
            "Unknown input_structure {!r}; expected one of {}".format(
                input_structure, sorted(columns_by_structure)
            )
        )

    y = copy.deepcopy(df[output_var].values)
    features = df[columns_by_structure[input_structure]].values

    # randomization.
    shuffle_idx = np.arange(size)
    # Keep this seed consistent across scripts. Note that this reseeds NumPy's
    # global random state as a side effect.
    np.random.seed(0)
    np.random.shuffle(shuffle_idx)
    train_ratio = 0.8  # Keep this consistent across scripts.

    # train test.
    n_train = int(train_ratio * size)
    train_idx = shuffle_idx[:n_train]
    test_idx = shuffle_idx[n_train:]

    y_train_ = y[train_idx].astype("float32")
    y_test_ = y[test_idx].astype("float32")
    x_train_ = sm.add_constant(features[train_idx].astype("float32"))
    x_test_ = sm.add_constant(features[test_idx].astype("float32"))

    return x_train_, x_test_, y_train_, y_test_
    
# # test
# output_var = 'HHVEHCNT_mean'
# input_var = ['R_AGE_IMP_mean', 'HHSIZE_mean', 'HHFAMINC_mean', 'HBHTNRNT_mean', 'HBPPOPDN_mean', 'HBRESDN_mean', 
#       'R_SEX_IMP_2_mean', 'EDUC_2_mean', 'HH_RACE_2_mean', 'HOMEOWN_1_mean', 'HOMEOWN_2_mean',
#       'HBHUR_R_mean', 'HBHUR_S_mean', 'HBHUR_T_mean','HBHUR_U_mean']
# input_structure = 'BE and NHTS quadratic'
# x_train_, x_test_, y_train_, y_test_ = initialize_data_linear_reg(df, BE, output_var, input_var, size, input_structure)
# print(x_train_)
# print(y_train_)
# print(x_train_.shape)
# print(y_train_.shape)

In [11]:
# set up: target columns, regressor column groups, and the input structures to fit.
output_var_list = [
    'HHVEHCNT_mean_norm',
    'HHVEHCNT_P_CAP_mean_norm',
    'TRPTRANS_1_mean_norm',
    'TRPTRANS_2_mean_norm',
    'TRPTRANS_3_mean_norm',
]

# SD NHTS
input_var = [
    'R_AGE_IMP_mean',
    'HHSIZE_mean',
    'HHFAMINC_mean',
    'R_SEX_IMP_2_mean',
    'EDUC_2_mean',
    'HH_RACE_2_mean',
    'HOMEOWN_1_mean',
    'HOMEOWN_2_mean',
]

# BE NHTS
BE_var = [
    'HBHTNRNT_mean',
    'HBPPOPDN_mean',
    'HBRESDN_mean',
    'HBHUR_R_mean',
    'HBHUR_S_mean',
    'HBHUR_T_mean',
    'HBHUR_U_mean',
]

input_structure_list = [
    'BE (NHTS) linear',
    'SD (NHTS) linear',
]


In [12]:
# One worked example: vehicle-count target regressed on the BE columns.
output_var = "HHVEHCNT_mean_norm"
input_structure = 'BE (NHTS) linear'
x_train_, x_test_, y_train_, y_test_ = initialize_data_linear_reg(
    df=df,
    BE_var=BE_var,
    output_var=output_var,
    input_var=input_var,
    size=size,
    input_structure=input_structure,
)
linear_mod_res, train_mse, test_mse, train_r2, test_r2 = Linear_eval(
    x_train_, x_test_, y_train_, y_test_
)
linear_mod_res.summary()
# Note: in the saved output most BE coefficients are highly significant (p < 0.001);
# x5 and x7 are not significant at the 5% level.

0,1,2,3
Dep. Variable:,y,R-squared:,0.178
Model:,OLS,Adj. R-squared:,0.178
Method:,Least Squares,F-statistic:,457.5
Date:,"Tue, 14 Jul 2020",Prob (F-statistic):,0.0
Time:,20:15:33,Log-Likelihood:,-19600.0
No. Observations:,14792,AIC:,39220.0
Df Residuals:,14784,BIC:,39280.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3122,0.026,11.784,0.000,0.260,0.364
x1,-1.3117,0.047,-27.658,0.000,-1.405,-1.219
x2,1.824e-05,3.28e-06,5.559,0.000,1.18e-05,2.47e-05
x3,-5.66e-05,4.14e-06,-13.666,0.000,-6.47e-05,-4.85e-05
x4,0.3884,0.028,13.726,0.000,0.333,0.444
x5,0.0493,0.026,1.923,0.055,-0.001,0.100
x6,0.1460,0.028,5.254,0.000,0.092,0.200
x7,-0.0355,0.029,-1.228,0.220,-0.092,0.021

0,1,2,3
Omnibus:,6148.267,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54873.794
Skew:,1.761,Prob(JB):,0.0
Kurtosis:,11.753,Cond. No.,62200.0


In [14]:
# Fit one OLS model per (output variable, input structure) pair, save the fitted
# model under models/, and collect its train/test metrics.
performance_handcrafted = {}

for output_var in output_var_list:
    print("-----")
    print(output_var)
    performance_handcrafted[output_var] = {}

    for input_structure in input_structure_list:
        print(input_structure)
        x_train_, x_test_, y_train_, y_test_ = initialize_data_linear_reg(
            df, BE_var, output_var, input_var, size, input_structure
        )
        linear_mod_res, train_mse, test_mse, train_r2, test_r2 = Linear_eval(
            x_train_, x_test_, y_train_, y_test_
        )
        # save models
        linear_mod_res.save(f"models/{output_var}_{input_structure}.pickle")
        # save metrics for this combination in one step
        performance_handcrafted[output_var][input_structure] = {
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_r2': train_r2,
            'test_r2': test_r2,
        }

-----
HHVEHCNT_mean_norm
BE (NHTS) linear
SD (NHTS) linear
-----
HHVEHCNT_P_CAP_mean_norm
BE (NHTS) linear
SD (NHTS) linear
-----
TRPTRANS_1_mean_norm
BE (NHTS) linear
SD (NHTS) linear
-----
TRPTRANS_2_mean_norm
BE (NHTS) linear
SD (NHTS) linear
-----
TRPTRANS_3_mean_norm
BE (NHTS) linear
SD (NHTS) linear


In [15]:
# Persist the handcrafted-feature metrics dictionary to disk.
# `pickle` is imported with the other imports in the first cell.
with open('outputs/performance_handcrafted_BESD.pickle', 'wb') as h:
    pickle.dump(performance_handcrafted, h, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Pull out only the test R^2 per (output variable, input structure) for analysis.
performance_handcrafted_r2_test = {
    output_var_key: {
        input_structure_key: metrics['test_r2']
        for input_structure_key, metrics in per_structure.items()
    }
    for output_var_key, per_structure in performance_handcrafted.items()
}

# Columns are output variables; rows are input structures.
r2_test_table = pd.DataFrame(performance_handcrafted_r2_test)
r2_test_table

Unnamed: 0,HHVEHCNT_mean_norm,HHVEHCNT_P_CAP_mean_norm,TRPTRANS_1_mean_norm,TRPTRANS_2_mean_norm,TRPTRANS_3_mean_norm
BE (NHTS) linear,0.17155,0.113491,0.249168,0.324251,0.168073
SD (NHTS) linear,0.271584,0.248023,0.099562,0.120024,0.062766


In [17]:
# Pull out only the train R^2 per (output variable, input structure) for analysis.
performance_handcrafted_r2_train = {
    output_var_key: {
        input_structure_key: metrics['train_r2']
        for input_structure_key, metrics in per_structure.items()
    }
    for output_var_key, per_structure in performance_handcrafted.items()
}

# Columns are output variables; rows are input structures.
r2_train_table = pd.DataFrame(performance_handcrafted_r2_train)
r2_train_table

Unnamed: 0,HHVEHCNT_mean_norm,HHVEHCNT_P_CAP_mean_norm,TRPTRANS_1_mean_norm,TRPTRANS_2_mean_norm,TRPTRANS_3_mean_norm
BE (NHTS) linear,0.178054,0.128429,0.249678,0.312323,0.178396
SD (NHTS) linear,0.266214,0.255648,0.099706,0.12259,0.069613


## Combine extracted ResNet layers with NHTS data sets

In [None]:
# Train two other input structures.
# total models: 5 * 2 * 5 * 5 = 250 models.
# This part needs to be refined. Ideally we still need train/val/testing sets.
# NOTE(review): the best (alpha, L1_wt) pair is chosen by test R^2 below, so the
# reported test metrics are selected on the test split itself.
method = 'elastic_net'

alpha_list = [10.0, 1.0, 0.1, 0.01, 0.001]
L1_wt_list = [0.01, 0.1, 0.5, 0.9, 0.99]
# NOTE(review): this rebinds `input_structure_list` from the earlier set-up cell.
input_structure_list = ['CNN and NHTS linear', 'CNN BE NHTS linear']

performance_cnn_combined = {}
hyper_param_dic = {}

for output_var in output_var_list:
    print("-----")
    print(output_var)
    performance_cnn_combined[output_var] = {}
    hyper_param_dic[output_var] = {}

    for input_structure in input_structure_list:
        print(input_structure)

        performance_cnn_combined[output_var][input_structure] = {}
        hyper_param_dic[output_var][input_structure] = {}

        # NOTE(review): this call passes eight positional arguments, including
        # `last_layer_dic_train` / `last_layer_dic_test`, while the
        # `initialize_data_linear_reg` defined earlier in this notebook takes six
        # parameters and only handles the BE/SD structures. Confirm which version
        # of the helper (and where the last-layer dictionaries come from) this
        # cell expects before running it.
        x_train_, x_test_, y_train_, y_test_ = initialize_data_linear_reg(df, BE_var, output_var, input_var, last_layer_dic_train, last_layer_dic_test, size, input_structure)

        # search a bit. It takes a while...
        best_train_mse = 0.0
        best_test_mse = 0.0
        best_train_r2 = 0.0
        # Start from -inf rather than 0.0 so the best candidate is still recorded
        # when every model has a non-positive test R^2.
        best_test_r2 = -np.inf
        hyper_param_dic[output_var][input_structure]['alpha'] = 0.0
        hyper_param_dic[output_var][input_structure]['L1_wt'] = 0.0

        # search 5*5=25 models
        for alpha in alpha_list:
            for L1_wt in L1_wt_list:
                linear_mod_res, train_mse, test_mse, train_r2, test_r2 = Linear_reg_eval(x_train_, x_test_, y_train_, y_test_, method, alpha, L1_wt)

                # Keep the metrics and hyper-parameters of the best test R^2 so far.
                if test_r2 > best_test_r2:
                    best_train_mse = train_mse
                    best_test_mse = test_mse
                    best_train_r2 = train_r2
                    best_test_r2 = test_r2
                    hyper_param_dic[output_var][input_structure]['alpha'] = alpha
                    hyper_param_dic[output_var][input_structure]['L1_wt'] = L1_wt

        performance_cnn_combined[output_var][input_structure]['train_mse'] = best_train_mse
        performance_cnn_combined[output_var][input_structure]['test_mse'] = best_test_mse
        performance_cnn_combined[output_var][input_structure]['train_r2'] = best_train_r2
        performance_cnn_combined[output_var][input_structure]['test_r2'] = best_test_r2
                

In [None]:
# Show the selected alpha / L1_wt for every output variable and input structure.
print(hyper_param_dic)

In [None]:
# Persist the CNN-combined metrics dictionary to disk.
# `pickle` is imported with the other imports in the first cell.
with open('outputs/performance_cnn_combined.pickle', 'wb') as h:
    pickle.dump(performance_cnn_combined, h, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Pull out only the test R^2 per (output variable, input structure) for analysis.
performance_cnn_combined_r2_test = {
    output_var_key: {
        input_structure_key: metrics['test_r2']
        for input_structure_key, metrics in per_structure.items()
    }
    for output_var_key, per_structure in performance_cnn_combined.items()
}

# Columns are output variables; rows are input structures.
# Note: this rebinds `r2_test_table`, which an earlier cell also assigns.
r2_test_table = pd.DataFrame(performance_cnn_combined_r2_test)
r2_test_table