In [1]:
#import packages
import numpy as np
import pandas as pd
import os

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import impute

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
BASE_DIR = "/Users/Avinash/Documents/Kanchanah/Springboard/Data_Science_Track/Capstone_Two_Other_Material/"

data = "Opposite/Data/data.csv"

## Import Dataset

In [3]:
#import dataset
data = pd.read_csv(os.path.expanduser(os.path.join(BASE_DIR, data)),index_col=0)

In [4]:
#Dimension and first few few columns of X_train
data.shape

(200564, 52)

In [5]:
#review first few rows
data.head()

Unnamed: 0,HOUSEID,PERSONID,VEHID,CAR,CARSHARE,CAR_label,DRVRCNT,EDUC,EDUC_label,GT1JBLWK,...,VEHOWNMO,WKFTPT,WKFTPT_label,WKRMHM,WKRMHM_label,WRKCOUNT,YEARMILE,YOUNGCHILD,fueltype_numeric,fueltype_numeric_label
49977,30210010,2,3,5.0,0.0,DAILY,3.0,3.0,SOME COLLEGE OR ASSOCIATES DEGREE,2.0,...,0.0,1.0,FULL-TIME,2.0,NO,3.0,25000.0,0.0,1,1
152617,40136586,1,1,5.0,0.0,DAILY,1.0,3.0,SOME COLLEGE OR ASSOCIATES DEGREE,,...,,,,,,0.0,1000.0,0.0,1,1
219521,40588561,4,4,5.0,0.0,DAILY,3.0,2.0,HIGH SCHOOL GRADUATE OR GED,2.0,...,,2.0,PART-TIME,2.0,NO,2.0,5000.0,0.0,1,1
207962,40509371,1,1,3.0,10.0,A FEW TIMES A MONTH,1.0,4.0,BACHELOR'S DEGREE,2.0,...,3.0,1.0,FULL-TIME,,,1.0,500.0,0.0,1,1
108057,30455166,1,2,5.0,0.0,DAILY,2.0,5.0,GRADUATE DEGREE OR PROFESSIONAL DEGREE,,...,,,,,,2.0,10000.0,0.0,1,1


In [6]:
#Look at info
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200564 entries, 49977 to 38915
Data columns (total 52 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   HOUSEID                 200564 non-null  int64  
 1   PERSONID                200564 non-null  int64  
 2   VEHID                   200564 non-null  int64  
 3   CAR                     198663 non-null  float64
 4   CARSHARE                200271 non-null  float64
 5   CAR_label               198663 non-null  object 
 6   DRVRCNT                 200564 non-null  float64
 7   EDUC                    200424 non-null  float64
 8   EDUC_label              200424 non-null  object 
 9   GT1JBLWK                114467 non-null  float64
 10  GT1JBLWK_label          114467 non-null  object 
 11  HBHUR                   200392 non-null  object 
 12  HBHUR_label             200392 non-null  object 
 13  HBPPOPDN                200392 non-null  float64
 14  HBPPOPDN_label   

## Split Datasets

In [7]:
#drop ordinal variable labels
ord_label = ["CAR_label",
"EDUC_label",
"HBPPOPDN_label",
"HBRESDN_label",
"HHFAMINC_label",
"PLACE_label",
"PRICE_label"]

In [8]:
data.drop(columns=ord_label,inplace=True)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200564 entries, 49977 to 38915
Data columns (total 45 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   HOUSEID                 200564 non-null  int64  
 1   PERSONID                200564 non-null  int64  
 2   VEHID                   200564 non-null  int64  
 3   CAR                     198663 non-null  float64
 4   CARSHARE                200271 non-null  float64
 5   DRVRCNT                 200564 non-null  float64
 6   EDUC                    200424 non-null  float64
 7   GT1JBLWK                114467 non-null  float64
 8   GT1JBLWK_label          114467 non-null  object 
 9   HBHUR                   200392 non-null  object 
 10  HBHUR_label             200392 non-null  object 
 11  HBPPOPDN                200392 non-null  float64
 12  HBRESDN                 200392 non-null  float64
 13  HHFAMINC                194349 non-null  float64
 14  HHSIZE           

In [10]:
#remove variables without _label
var_label = [a.replace("_label","") for a in data.columns if "_label" in a]

In [11]:
#drop irrelevant variables
X = data.drop(columns=['fueltype_numeric','fueltype_numeric_label'] + var_label)

In [12]:
#output variable
y = data['fueltype_numeric']

In [13]:
#train test split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3, random_state = 21)

In [14]:
#check first few rows
X_train.head()

Unnamed: 0,HOUSEID,PERSONID,VEHID,CAR,CARSHARE,DRVRCNT,EDUC,GT1JBLWK_label,HBHUR_label,HBPPOPDN,...,R_SEX_label,TIMETOWK,VEHAGE,VEHOWNED_label,VEHOWNMO,WKFTPT_label,WKRMHM_label,WRKCOUNT,YEARMILE,YOUNGCHILD
165952,40226077,1,1,5.0,0.0,1.0,4.0,NO,NOT RURAL,6.0,...,FEMALE,,1.0,YES,,FULL-TIME,,1.0,,0.0
161420,40196601,1,1,5.0,0.0,3.0,2.0,,NOT RURAL,5.0,...,MALE,,7.0,YES,,,,1.0,,0.0
231467,40667671,2,1,5.0,0.0,2.0,4.0,,NOT RURAL,2.0,...,FEMALE,,1.0,YES,,,,0.0,,0.0
109716,30461749,1,1,4.0,0.0,1.0,1.0,,NOT RURAL,6.0,...,MALE,,10.0,YES,,,,0.0,1000.0,0.0
187414,40370084,2,2,5.0,0.0,2.0,3.0,YES,NOT RURAL,5.0,...,MALE,60.0,14.0,NO,4.0,FULL-TIME,NO,2.0,,0.0


In [15]:
#check dimension
X_train.shape

(140394, 32)

In [16]:
#check info
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140394 entries, 165952 to 97650
Data columns (total 32 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   HOUSEID         140394 non-null  int64  
 1   PERSONID        140394 non-null  int64  
 2   VEHID           140394 non-null  int64  
 3   CAR             139080 non-null  float64
 4   CARSHARE        140186 non-null  float64
 5   DRVRCNT         140394 non-null  float64
 6   EDUC            140294 non-null  float64
 7   GT1JBLWK_label  80283 non-null   object 
 8   HBHUR_label     140273 non-null  object 
 9   HBPPOPDN        140273 non-null  float64
 10  HBRESDN         140273 non-null  float64
 11  HHFAMINC        136027 non-null  float64
 12  HHSIZE          140394 non-null  float64
 13  HHSTATE_label   140394 non-null  object 
 14  HHVEHCNT        140394 non-null  float64
 15  HOMEOWN_label   140359 non-null  object 
 16  OCCAT_label     80251 non-null   object 
 17  PLACE 

In [17]:
#check first few rows
X_test.head()

Unnamed: 0,HOUSEID,PERSONID,VEHID,CAR,CARSHARE,DRVRCNT,EDUC,GT1JBLWK_label,HBHUR_label,HBPPOPDN,...,R_SEX_label,TIMETOWK,VEHAGE,VEHOWNED_label,VEHOWNMO,WKFTPT_label,WKRMHM_label,WRKCOUNT,YEARMILE,YOUNGCHILD
205300,40491553,2,2,5.0,0.0,2.0,4.0,NO,NOT RURAL,6.0,...,FEMALE,15.0,4.0,YES,,FULL-TIME,YES,2.0,13000.0,0.0
150902,40125081,2,1,5.0,0.0,2.0,1.0,,RURAL,1.0,...,MALE,,4.0,NO,4.0,,,0.0,3000.0,0.0
70631,30296300,1,1,5.0,0.0,1.0,2.0,,NOT RURAL,2.0,...,FEMALE,,16.0,NO,10.0,,,1.0,,1.0
53212,30223835,2,1,5.0,0.0,2.0,3.0,NO,NOT RURAL,7.0,...,MALE,45.0,17.0,YES,,PART-TIME,NO,2.0,5000.0,0.0
197639,40437650,1,1,5.0,0.0,2.0,4.0,NO,NOT RURAL,5.0,...,MALE,15.0,2.0,YES,,FULL-TIME,YES,1.0,18000.0,2.0


In [18]:
#check dimension
X_test.shape

(60170, 32)

In [19]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60170 entries, 205300 to 239295
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HOUSEID         60170 non-null  int64  
 1   PERSONID        60170 non-null  int64  
 2   VEHID           60170 non-null  int64  
 3   CAR             59583 non-null  float64
 4   CARSHARE        60085 non-null  float64
 5   DRVRCNT         60170 non-null  float64
 6   EDUC            60130 non-null  float64
 7   GT1JBLWK_label  34184 non-null  object 
 8   HBHUR_label     60119 non-null  object 
 9   HBPPOPDN        60119 non-null  float64
 10  HBRESDN         60119 non-null  float64
 11  HHFAMINC        58322 non-null  float64
 12  HHSIZE          60170 non-null  float64
 13  HHSTATE_label   60170 non-null  object 
 14  HHVEHCNT        60170 non-null  float64
 15  HOMEOWN_label   60163 non-null  object 
 16  OCCAT_label     34168 non-null  object 
 17  PLACE           59034 non

## Impute Datasets

In [20]:
# median imputer
imp_median = impute.SimpleImputer(missing_values=np.nan, strategy="median")

# categorical imputer
imp_cat = impute.SimpleImputer(missing_values=np.nan, strategy="most_frequent")

In [21]:
#split data types
X_train_cat = X_train.select_dtypes('object')
X_train_num = X_train.select_dtypes('float64')
X_test_cat = X_test.select_dtypes('object')
X_test_num = X_test.select_dtypes('float64')

In [22]:
print("The shape of the X_train categorical features is {0}".format(X_train_cat.shape))
print("The shape of the X_train numerical features is {0}".format(X_train_num.shape))
print("The shape of the X_test categorical features is {0}".format(X_test_cat.shape))
print("The shape of the X_test numerical features is {0}".format(X_test_num.shape))

The shape of the X_train categorical features is (140394, 11)
The shape of the X_train numerical features is (140394, 18)
The shape of the X_test categorical features is (60170, 11)
The shape of the X_test numerical features is (60170, 18)


In [23]:
#fit training
# impute categorical data
X_train_cat_data_impute = imp_cat.fit(X_train_cat)

In [24]:
#fit training
# impute numerical data
X_train_num_data_impute = imp_median.fit(X_train_num)

In [25]:
#transform training
X_train_cat_fit = X_train_cat_data_impute.transform(X_train_cat)

In [26]:
#transform test
X_test_cat_fit = X_train_cat_data_impute.transform(X_test_cat)

In [27]:
#transform training
X_train_num_fit = X_train_num_data_impute.transform(X_train_num)

In [28]:

#transform test
X_test_num_fit = X_train_num_data_impute.transform(X_test_num)

In [29]:
print("The shape of the X_train categorical fit features is {0}".format(X_train_cat_fit.shape))
print("The shape of the X_train numerical fit features is {0}".format(X_train_num_fit.shape))
print("The shape of the X_test categorical fit features is {0}".format(X_test_cat_fit.shape))
print("The shape of the X_test numerical fit features is {0}".format(X_test_num_fit.shape))

The shape of the X_train categorical fit features is (140394, 11)
The shape of the X_train numerical fit features is (140394, 18)
The shape of the X_test categorical fit features is (60170, 11)
The shape of the X_test numerical fit features is (60170, 18)


## Scale and One Hot Code Datasets

In [30]:
#scale dataset

sc = MinMaxScaler()

X_train_num_fit_sc = sc.fit_transform(X_train_num_fit)

X_test_num_fit_sc = sc.transform(X_test_num_fit)

In [31]:
#convert to data frame
X_train_done_n = pd.DataFrame(X_train_num_fit_sc,columns=X_train_num.columns)
X_test_done_n= pd.DataFrame(X_test_num_fit_sc,columns=X_test_num.columns)

In [32]:
#convert to data frame
X_train_done_c = pd.DataFrame(X_train_cat_fit,columns=X_train_cat.columns)
X_test_done_c = pd.DataFrame(X_test_cat_fit ,columns=X_test_cat.columns)

In [33]:
#do dummies
X_train_done_cat = pd.get_dummies(X_train_done_c,prefix=X_train_done_c.columns.tolist(), columns = X_train_done_c.columns.tolist(), drop_first=True)

In [34]:
#do dummies
X_test_done_cat = pd.get_dummies(X_test_done_c,prefix=X_test_done_c.columns.tolist(), columns = X_test_done_c.columns.tolist(), drop_first=True)

In [35]:
#merge train and test
X_train = pd.concat([X_train_done_cat, X_train_done_n], axis=1)
X_test = pd.concat([X_test_done_cat, X_test_done_n], axis=1)

In [36]:
X_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140394 entries, 0 to 140393
Data columns (total 47 columns):
 #   Column                                                            Non-Null Count   Dtype  
---  ------                                                            --------------   -----  
 0   GT1JBLWK_label_YES                                                140394 non-null  uint8  
 1   HBHUR_label_RURAL                                                 140394 non-null  uint8  
 2   HHSTATE_label_EAST SOUTH CENTRAL                                  140394 non-null  uint8  
 3   HHSTATE_label_MIDDLE ATLANTIC                                     140394 non-null  uint8  
 4   HHSTATE_label_MOUNTAIN                                            140394 non-null  uint8  
 5   HHSTATE_label_NEW ENGLAND                                         140394 non-null  uint8  
 6   HHSTATE_label_PACIFIC                                             140394 non-null  uint8  
 7   HHSTATE_label_SOUTH 

In [37]:
X_test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60170 entries, 0 to 60169
Data columns (total 47 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   GT1JBLWK_label_YES                                                60170 non-null  uint8  
 1   HBHUR_label_RURAL                                                 60170 non-null  uint8  
 2   HHSTATE_label_EAST SOUTH CENTRAL                                  60170 non-null  uint8  
 3   HHSTATE_label_MIDDLE ATLANTIC                                     60170 non-null  uint8  
 4   HHSTATE_label_MOUNTAIN                                            60170 non-null  uint8  
 5   HHSTATE_label_NEW ENGLAND                                         60170 non-null  uint8  
 6   HHSTATE_label_PACIFIC                                             60170 non-null  uint8  
 7   HHSTATE_label_SOUTH ATLANTIC   

In [38]:
X_train.columns = X_train.columns.str.replace(' ','_').str.replace('[,\-,\',$]','')

In [39]:
X_test.columns = X_test.columns.str.replace(' ','_').str.replace('[,\-,\',$]','')

In [40]:
X_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140394 entries, 0 to 140393
Data columns (total 47 columns):
 #   Column                                                         Non-Null Count   Dtype  
---  ------                                                         --------------   -----  
 0   GT1JBLWK_label_YES                                             140394 non-null  uint8  
 1   HBHUR_label_RURAL                                              140394 non-null  uint8  
 2   HHSTATE_label_EAST_SOUTH_CENTRAL                               140394 non-null  uint8  
 3   HHSTATE_label_MIDDLE_ATLANTIC                                  140394 non-null  uint8  
 4   HHSTATE_label_MOUNTAIN                                         140394 non-null  uint8  
 5   HHSTATE_label_NEW_ENGLAND                                      140394 non-null  uint8  
 6   HHSTATE_label_PACIFIC                                          140394 non-null  uint8  
 7   HHSTATE_label_SOUTH_ATLANTIC                   

In [41]:
X_test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60170 entries, 0 to 60169
Data columns (total 47 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   GT1JBLWK_label_YES                                             60170 non-null  uint8  
 1   HBHUR_label_RURAL                                              60170 non-null  uint8  
 2   HHSTATE_label_EAST_SOUTH_CENTRAL                               60170 non-null  uint8  
 3   HHSTATE_label_MIDDLE_ATLANTIC                                  60170 non-null  uint8  
 4   HHSTATE_label_MOUNTAIN                                         60170 non-null  uint8  
 5   HHSTATE_label_NEW_ENGLAND                                      60170 non-null  uint8  
 6   HHSTATE_label_PACIFIC                                          60170 non-null  uint8  
 7   HHSTATE_label_SOUTH_ATLANTIC                              

In [42]:
print("The shape of the X_train features is {0}".format(X_train.shape))
print("The shape of the X_test features is {0}".format(X_test.shape))

The shape of the X_train features is (140394, 47)
The shape of the X_test features is (60170, 47)


In [43]:
#Get mean and variance for train and test data
print("The mean of y_train is {0}".format(np.mean(y_train)))
print("The variance of y_train is {0}".format(np.var(y_train)))

The mean of y_train is 0.9702978759776059
The variance of y_train is 0.02881990785086755


Could try approximating the model with Poisson...

### Export Datasets

In [44]:
X_train.to_csv(os.path.expanduser(os.path.join(BASE_DIR, "Opposite/Data/X_train.csv"))) 

In [45]:
X_test.to_csv(os.path.expanduser(os.path.join(BASE_DIR, "Opposite/Data/X_test.csv"))) 

In [46]:
y_train.to_csv(os.path.expanduser(os.path.join(BASE_DIR, "Opposite/Data/y_train.csv"))) 

In [47]:
y_test.to_csv(os.path.expanduser(os.path.join(BASE_DIR, "Opposite/Data/y_test.csv"))) 