In [1]:
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
import pickle
import copy
from sklearn.preprocessing import normalize

In [2]:
# Load the pickled state -> data mapping from disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open('../../statesToData.pickle', mode='rb') as pickle_file:
    statesToData = pickle.load(pickle_file)

In [3]:
# Collect the per-state entries of statesToData into a list, in iteration order.
# statesToData is iterated by key and indexed by that key; each value is
# presumably one DataFrame per state (they are concatenated later) -- TODO confirm.
# A comprehension avoids the unused enumerate index and does not leak loop
# temporaries (i, state, df) into the notebook namespace.
all_data_list = [statesToData[state] for state in statesToData]



In [4]:
# Stack all collected frames row-wise into one table (original row labels are kept).
all_data = pd.concat(objs=all_data_list, axis="index")

In [5]:
# Preview the first five rows of the combined table.
all_data.head(n=5)

Unnamed: 0,CDIVMSAR,CENSUS_D,CENSUS_R,DRIVER,DRVRCNT,ENDTIME,EDUC,GASPRICE,HBHTNRNT,HBHUR,...,PTRANS_AGREE,SPHONE_HIGH,TAB_HIGH,TAXI_HIGH,WALK_HIGH,WALK2SAVE_AGREE,WEBUSE17_HIGH,HHCNTYFP,HHCT,CENSUS_ID
0,53,5,3,1,3,1015,3,228.4,20,T,...,0,1,1,0,0,0,1,191,1301,37191001301
1,53,5,3,1,3,1530,3,228.4,20,T,...,0,1,1,0,0,0,1,191,1301,37191001301
2,53,5,3,1,3,900,3,228.4,20,T,...,0,1,1,0,0,0,1,191,1301,37191001301
3,53,5,3,1,3,2030,3,228.4,20,T,...,0,1,1,0,0,0,1,191,1301,37191001301
4,53,5,3,1,3,900,2,228.4,20,T,...,0,1,1,0,0,0,1,191,1301,37191001301


In [6]:
# Columns that will be expanded into one-hot indicator columns.
categorical_variables = ["TRIPPURP", "TRPTRANS"]
# Every column kept in the label table: grouping key, numeric count, categoricals.
all_variables = ["CENSUS_ID", "HHVEHCNT", "TRIPPURP", "TRPTRANS"]
# Select the needed columns first and copy only those, instead of deep-copying
# the entire wide frame just to throw most of it away. The result is still an
# independent frame, so later edits to `labels` cannot touch `all_data`.
labels = all_data[all_variables].copy(deep=True)

In [7]:
# Preview the first five rows of the label table.
labels.head(n=5)


Unnamed: 0,CENSUS_ID,HHVEHCNT,TRIPPURP,TRPTRANS
0,37191001301,3,0,2
1,37191001301,3,0,2
2,37191001301,3,3,4
3,37191001301,3,3,4
4,37191001301,3,0,2


In [8]:
# Show the (rows, columns) dimensions of the label table.
labels.shape

(781811, 4)

In [9]:
# Derive the canonical, sorted list of output columns for the one-hot encoding.
# A separate copy of the label table is used as the input for this step.
getEncoderList = labels.copy(deep=True)
# Expand the categorical columns into indicators and keep only the column names.
dummy_columns = pd.get_dummies(
    data=getEncoderList[all_variables],
    columns=categorical_variables,
).columns
# De-duplicate the names and order them lexicographically.
encoderListAll = sorted(set(dummy_columns))
print(len(encoderListAll))
print(encoderListAll)

13
['CENSUS_ID', 'HHVEHCNT', 'TRIPPURP_0', 'TRIPPURP_1', 'TRIPPURP_2', 'TRIPPURP_3', 'TRIPPURP_4', 'TRPTRANS_1', 'TRPTRANS_2', 'TRPTRANS_3', 'TRPTRANS_4', 'TRPTRANS_5', 'TRPTRANS_6']


In [10]:
def one_hot_encode(variables, data_frame, columns=None):
    """One-hot encode selected columns and align the result to a fixed layout.

    Parameters
    ----------
    variables : list of str
        Names of the columns in ``data_frame`` to expand into indicator columns.
    data_frame : pandas.DataFrame
        Input frame. It is not modified.
    columns : list of str, optional
        Exact column layout of the returned frame. When omitted, the
        module-level ``encoderListAll`` is used, which matches the original
        two-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        A frame containing exactly ``columns``, in that order. Listed columns
        that the encoding did not produce are filled with 0; produced columns
        that are not listed are dropped.
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-level layout.
        columns = encoderListAll
    # pd.get_dummies returns a new frame, so no defensive copy of the input is needed.
    indicators = pd.get_dummies(data=data_frame, columns=variables)
    # Force a stable column set/order regardless of which categories are present.
    return indicators.reindex(columns=columns, fill_value=0)

In [11]:
# One-hot encode the categorical label columns, then report the resulting dimensions.
encoded_labels = one_hot_encode(
    variables=categorical_variables,
    data_frame=labels,
)
print(encoded_labels.shape)

(781811, 13)


In [12]:
# Preview the first five rows of the one-hot encoded labels.
encoded_labels.head(n=5)

Unnamed: 0,CENSUS_ID,HHVEHCNT,TRIPPURP_0,TRIPPURP_1,TRIPPURP_2,TRIPPURP_3,TRIPPURP_4,TRPTRANS_1,TRPTRANS_2,TRPTRANS_3,TRPTRANS_4,TRPTRANS_5,TRPTRANS_6
0,37191001301,3,1,0,0,0,0,0,1,0,0,0,0
1,37191001301,3,1,0,0,0,0,0,1,0,0,0,0
2,37191001301,3,0,0,0,1,0,0,0,0,1,0,0
3,37191001301,3,0,0,0,1,0,0,0,0,1,0,0
4,37191001301,3,1,0,0,0,0,0,1,0,0,0,0


In [13]:
# Average every remaining column over the rows that share a CENSUS_ID.
# CENSUS_ID becomes the index of the result; for the indicator columns the
# mean is the fraction of rows in the group that fall in that category.
grouped_by_census_id = encoded_labels.groupby(by="CENSUS_ID")
encoded_avg_labels = grouped_by_census_id.mean()

In [14]:
# Show the (rows, columns) dimensions of the per-CENSUS_ID averages.
encoded_avg_labels.shape

(32681, 12)

In [15]:
# Preview the first five rows of the per-CENSUS_ID averages.
encoded_avg_labels.head(n=5)

Unnamed: 0_level_0,HHVEHCNT,TRIPPURP_0,TRIPPURP_1,TRIPPURP_2,TRIPPURP_3,TRIPPURP_4,TRPTRANS_1,TRPTRANS_2,TRPTRANS_3,TRPTRANS_4,TRPTRANS_5,TRPTRANS_6
CENSUS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001020400,3.0,0.333333,0.222222,0.0,0.111111,0.333333,0.0,0.222222,0.666667,0.0,0.111111,0.0
1001020600,2.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0
1001020802,2.0,0.047619,0.0,0.0,0.142857,0.809524,0.0,0.666667,0.333333,0.0,0.0,0.0
1001020900,3.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.666667,0.0,0.0
1003010300,2.0,0.176471,0.411765,0.117647,0.0,0.294118,0.0,0.0,0.882353,0.117647,0.0,0.0


In [16]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled split (and the CSV files written from it) is
# identical on every Restart & Run All; without it each run produces a
# different train/test partition.
SPLIT_SEED = 42

# Hold out one sixth of the rows for testing; the remaining five sixths are for training.
encoded_avg_labels_train, encoded_avg_labels_test = train_test_split(
    encoded_avg_labels,
    train_size=5/6,
    test_size=1/6,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [17]:
# Report the training split's dimensions, then preview its first five rows.
train_shape = encoded_avg_labels_train.shape
print(train_shape)
encoded_avg_labels_train.head(n=5)

(27234, 12)


Unnamed: 0_level_0,HHVEHCNT,TRIPPURP_0,TRIPPURP_1,TRIPPURP_2,TRIPPURP_3,TRIPPURP_4,TRPTRANS_1,TRPTRANS_2,TRPTRANS_3,TRPTRANS_4,TRPTRANS_5,TRPTRANS_6
CENSUS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21089040100,2.0,0.0,0.0,0.6,0.2,0.2,0.0,1.0,0.0,0.0,0.0,0.0
48189950300,1.956522,0.26087,0.130435,0.130435,0.043478,0.434783,0.0,0.826087,0.0,0.173913,0.0,0.0
48091310603,1.903226,0.258065,0.225806,0.032258,0.096774,0.387097,0.064516,0.483871,0.0,0.387097,0.0,0.064516
53061051912,2.0,0.384615,0.076923,0.0,0.153846,0.384615,0.153846,0.307692,0.538462,0.0,0.0,0.0
6065031401,3.0,0.266667,0.066667,0.133333,0.133333,0.4,0.133333,0.733333,0.0,0.133333,0.0,0.0


In [18]:
# Report the test split's dimensions, then preview its first five rows.
test_shape = encoded_avg_labels_test.shape
print(test_shape)
encoded_avg_labels_test.head(n=5)

(5447, 12)


Unnamed: 0_level_0,HHVEHCNT,TRIPPURP_0,TRIPPURP_1,TRIPPURP_2,TRIPPURP_3,TRIPPURP_4,TRPTRANS_1,TRPTRANS_2,TRPTRANS_3,TRPTRANS_4,TRPTRANS_5,TRPTRANS_6
CENSUS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12127082802,2.0,0.25,0.0,0.5,0.0,0.25,0.5,0.0,0.5,0.0,0.0,0.0
25003900400,0.0,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.0
55009002002,2.052174,0.226087,0.217391,0.113043,0.156522,0.286957,0.0,0.382609,0.347826,0.217391,0.0,0.052174
48201250301,2.333333,0.144928,0.289855,0.072464,0.173913,0.318841,0.028986,0.217391,0.449275,0.173913,0.028986,0.101449
48201541300,1.75,0.178571,0.071429,0.0,0.357143,0.392857,0.0,0.5,0.214286,0.285714,0.0,0.0


In [19]:
# Write the training split to CSV in the working directory (the index is written too).
encoded_avg_labels_train.to_csv(path_or_buf="encoded_avg_labels_train.csv")

In [20]:
# Write the test split to CSV in the working directory (the index is written too).
encoded_avg_labels_test.to_csv(path_or_buf="encoded_avg_labels_test.csv")