## Team # 4
### Team Member: Jagrati Joshi
    Task: Source data from 2007 and 2013 NRS csv files and prepare for merge and analysis

In [9]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
from uszipcode import SearchEngine

ModuleNotFoundError: No module named 'uszipcode'

In [10]:
# Reading raw csv files
data07_path = "Resource/nrs_2007_final.csv"
data13_path = "Resource/nrs_2013_combined.csv"

# Setting output data file names
output07_data_file = "output/nrs_2007_CleanedData.csv"
output13_data_file = "output/nrs_2013_CleanedData.csv"
# Storing data in raw format in a Dataframe
data07_raw_df = pd.read_csv(data07_path)
print(f"2007 dataframe size is {data07_raw_df.shape}")
data13_raw_df = pd.read_csv(data13_path)
print(f"2013 dataframe size is {data13_raw_df.shape}")

2007 dataframe size is (11120, 313)
2013 dataframe size is (11322, 995)


In [11]:
# Getting selected columns for the 2007 data
col_list = ["STATE", "NRS_ZIPCODE", "DATE", "session", "NRS_RACE" ,"NRS_DSEX", "NRS_AGE", "NRS_SCHOOL"
            ,"NRS_EMPLOY", "NRS_VTYPE", "NRS_FROM", "NRS_HEADED", "NRS_BETMI", "NRS_TODRK", "resulbac2"]
data07_df = pd.read_csv(data07_path, low_memory=False, usecols=col_list)

# Checking list of columns imported from CSV to ensure matchh with sort listed columns
col_list_check = data07_df.columns.values.tolist()
col_list_check

['session',
 'STATE',
 'DATE',
 'NRS_FROM',
 'NRS_HEADED',
 'NRS_BETMI',
 'NRS_TODRK',
 'NRS_AGE',
 'NRS_ZIPCODE',
 'NRS_SCHOOL',
 'NRS_EMPLOY',
 'NRS_RACE',
 'NRS_VTYPE',
 'NRS_DSEX',
 'resulbac2']

In [12]:
# Column index 38 has an issue in import hence checking name of the column at index 38
col38_name = data07_raw_df.columns[38]
col38_name

'NRS_ZIPCODE'

In [13]:
# Checking datatypes of the 2007 dataframe
data07_df.dtypes

session          int64
STATE           object
DATE            object
NRS_FROM        object
NRS_HEADED      object
NRS_BETMI       object
NRS_TODRK       object
NRS_AGE         object
NRS_ZIPCODE     object
NRS_SCHOOL      object
NRS_EMPLOY      object
NRS_RACE        object
NRS_VTYPE       object
NRS_DSEX       float64
resulbac2       object
dtype: object

In [14]:
# Displaying 2007 data
data07_df

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,NRS_EMPLOY,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2
0,3,TN,11/3/2007,,,,,#NULL!,,,,,,,0
1,1,IL,,,,,,#NULL!,,,,,,,#NULL!
2,1,IN,,,,,,#NULL!,,,,,,,0
3,1,MI,10/12/2007,,,,,#NULL!,,,,,,,0
4,5,IA,,,,,,#NULL!,,,,,,,#NULL!
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11115,2,IN,,Sport or rec facility / park,Home (own home),0 - 5,No,48,46373,Some college,Employed/self-employed,White,Car,1.0,0
11116,3,PA,,Restaurant / eating place,Home (own home),6 - 10,Yes,39,19038,College graduate,Employed/self-employed,White,Car,2.0,#NULL!
11117,5,FL,,Other,Home (own home),11 - 20,No,19,33065,Some college,Student,White,Car,2.0,#NULL!
11118,1,PA,,Restaurant / eating place,Home (own home),16 - 20,,#NULL!,19312,College graduate,Retired,White,Car,2.0,#NULL!


In [15]:
# Checking count of NA or NaN values in the dataframe
data07_df.isna().sum()

session           0
STATE             0
DATE           7646
NRS_FROM       2147
NRS_HEADED     2148
NRS_BETMI      2147
NRS_TODRK      5074
NRS_AGE           0
NRS_ZIPCODE    2202
NRS_SCHOOL     2174
NRS_EMPLOY     2177
NRS_RACE       2179
NRS_VTYPE        73
NRS_DSEX         73
resulbac2         0
dtype: int64

In [16]:
# Removing all NA or NaN values from the dataframe
data07_df = data07_df
data07_df

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,NRS_EMPLOY,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2
0,3,TN,11/3/2007,,,,,#NULL!,,,,,,,0
1,1,IL,,,,,,#NULL!,,,,,,,#NULL!
2,1,IN,,,,,,#NULL!,,,,,,,0
3,1,MI,10/12/2007,,,,,#NULL!,,,,,,,0
4,5,IA,,,,,,#NULL!,,,,,,,#NULL!
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11115,2,IN,,Sport or rec facility / park,Home (own home),0 - 5,No,48,46373,Some college,Employed/self-employed,White,Car,1.0,0
11116,3,PA,,Restaurant / eating place,Home (own home),6 - 10,Yes,39,19038,College graduate,Employed/self-employed,White,Car,2.0,#NULL!
11117,5,FL,,Other,Home (own home),11 - 20,No,19,33065,Some college,Student,White,Car,2.0,#NULL!
11118,1,PA,,Restaurant / eating place,Home (own home),16 - 20,,#NULL!,19312,College graduate,Retired,White,Car,2.0,#NULL!


In [17]:
# Checking count of NA or NaN values in the dataframe after cleanup
data07_df.isna().sum()

session           0
STATE             0
DATE           7646
NRS_FROM       2147
NRS_HEADED     2148
NRS_BETMI      2147
NRS_TODRK      5074
NRS_AGE           0
NRS_ZIPCODE    2202
NRS_SCHOOL     2174
NRS_EMPLOY     2177
NRS_RACE       2179
NRS_VTYPE        73
NRS_DSEX         73
resulbac2         0
dtype: int64

In [18]:
# Checking datatypes
data07_df.dtypes

session          int64
STATE           object
DATE            object
NRS_FROM        object
NRS_HEADED      object
NRS_BETMI       object
NRS_TODRK       object
NRS_AGE         object
NRS_ZIPCODE     object
NRS_SCHOOL      object
NRS_EMPLOY      object
NRS_RACE        object
NRS_VTYPE       object
NRS_DSEX       float64
resulbac2       object
dtype: object

In [19]:
# Changing Zipcode to int64 and RACE to string
data07_df = data07_df.astype({"NRS_ZIPCODE": "int64", "NRS_RACE": "string"})
data07_df.dtypes

ValueError: cannot convert float NaN to integer

In [12]:
search = SearchEngine(simple_zipcode=True)
for index, row in data07_df.iterrows():
    try:
        zipcode = row["NRS_ZIPCODE"]
        getCityData = search.by_zipcode(zipcode)
        data07_df.loc[index, "CityName"] = getCityData.major_city
    except ValueError:
        print(f"Skipping {index} due to invalid zipcode data")

data07_df.head()

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,NRS_EMPLOY,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,Employed/self-employed,Hispanic,Car,1.0,0.0,Westchester
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,Employed/self-employed,White,Car,2.0,0.043,San Pedro
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,Employed/self-employed,White,Minivan,2.0,0.0,Torrance
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,Employed/self-employed,White,Car,2.0,0.0,Delafield
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,Employed/self-employed,Asian,Car,2.0,0.0,Chicago


In [13]:
# Renaming NRS_EMPLOY to AreYouEmployed to match data with 2013
data07_df = data07_df.rename(columns={"NRS_EMPLOY": "AreYouEmployed"})
data07_df.head()


Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,AreYouEmployed,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,Employed/self-employed,Hispanic,Car,1.0,0.0,Westchester
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,Employed/self-employed,White,Car,2.0,0.043,San Pedro
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,Employed/self-employed,White,Minivan,2.0,0.0,Torrance
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,Employed/self-employed,White,Car,2.0,0.0,Delafield
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,Employed/self-employed,Asian,Car,2.0,0.0,Chicago


In [14]:
# Creating a copy of AreYouEmployed column to match with 2013 AreYouStudent column
data07_df["AreYouStudent"] = data07_df["AreYouEmployed"]
data07_df.head()

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,AreYouEmployed,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName,AreYouStudent
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,Employed/self-employed,Hispanic,Car,1.0,0.0,Westchester,Employed/self-employed
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,Employed/self-employed,White,Car,2.0,0.043,San Pedro,Employed/self-employed
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,Employed/self-employed,White,Minivan,2.0,0.0,Torrance,Employed/self-employed
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,Employed/self-employed,White,Car,2.0,0.0,Delafield,Employed/self-employed
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,Employed/self-employed,Asian,Car,2.0,0.0,Chicago,Employed/self-employed


In [15]:
# Checking unique values in AreYouEmployed column
data07_df["AreYouEmployed"].unique()

array(['Employed/self-employed', 'Student', 'On Disability', 'Retired',
       'Unemployed', 'Homemaker', 'Other'], dtype=object)

In [16]:
# Converting AreYouEmployed to binary
data07_df["AreYouEmployed"] = data07_df["AreYouEmployed"].apply(lambda x: 1 if x == "Employed/self-employed" else 0)
data07_df

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,AreYouEmployed,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName,AreYouStudent
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,1,Hispanic,Car,1.0,0,Westchester,Employed/self-employed
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,1,White,Car,2.0,0.043,San Pedro,Employed/self-employed
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,1,White,Minivan,2.0,0,Torrance,Employed/self-employed
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,1,White,Car,2.0,0,Delafield,Employed/self-employed
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,1,Asian,Car,2.0,0,Chicago,Employed/self-employed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11091,4,IL,7/22/2022,Other,Someone else's home,6 - 10,Yes,46,60077,High school graduate,1,White,SUV,2.0,0,Skokie,Employed/self-employed
11095,5,NE,8/5/2007,Someone else's home,Someone else's home,0 - 5,Yes,25,68154,College graduate,1,White,SUV,2.0,0,Omaha,Employed/self-employed
11097,2,AL,7/21/2007,Home (own home),Restaurant / eating place,More than 20,No,21,35150,High school graduate,1,Black or African American,SUV,1.0,0,Sylacauga,Employed/self-employed
11101,2,CA,7/28/2007,Someone else's home,Home (own home),More than 20,No,28,90810,Some college,1,Black or African American,Car,2.0,0.033,Long Beach,Employed/self-employed


In [17]:
# Converting AreYouStudent to binary
data07_df["AreYouStudent"] = data07_df["AreYouStudent"].apply(lambda x: 1 if x == "Student" else 0)
data07_df

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,AreYouEmployed,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName,AreYouStudent
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,1,Hispanic,Car,1.0,0,Westchester,0
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,1,White,Car,2.0,0.043,San Pedro,0
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,1,White,Minivan,2.0,0,Torrance,0
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,1,White,Car,2.0,0,Delafield,0
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,1,Asian,Car,2.0,0,Chicago,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11091,4,IL,7/22/2022,Other,Someone else's home,6 - 10,Yes,46,60077,High school graduate,1,White,SUV,2.0,0,Skokie,0
11095,5,NE,8/5/2007,Someone else's home,Someone else's home,0 - 5,Yes,25,68154,College graduate,1,White,SUV,2.0,0,Omaha,0
11097,2,AL,7/21/2007,Home (own home),Restaurant / eating place,More than 20,No,21,35150,High school graduate,1,Black or African American,SUV,1.0,0,Sylacauga,0
11101,2,CA,7/28/2007,Someone else's home,Home (own home),More than 20,No,28,90810,Some college,1,Black or African American,Car,2.0,0.033,Long Beach,0


In [18]:
# Checking Unique values for AreYouEmployed and AreYouStudent
print(data07_df["AreYouEmployed"].unique())
print(data07_df["AreYouStudent"].unique())

[1 0]
[0 1]


In [19]:
# Checking Unique values for gender
data07_df["NRS_DSEX"].unique()

array([1., 2.])

In [20]:
# Converting gender from numeric to string
data07_df["NRS_DSEX"] = data07_df["NRS_DSEX"].apply(lambda x: "Male" if x == 1.0 else "Female")
data07_df

Unnamed: 0,session,STATE,DATE,NRS_FROM,NRS_HEADED,NRS_BETMI,NRS_TODRK,NRS_AGE,NRS_ZIPCODE,NRS_SCHOOL,AreYouEmployed,NRS_RACE,NRS_VTYPE,NRS_DSEX,resulbac2,CityName,AreYouStudent
1968,4,IL,7/22/2007,Work,Home (own home),0 - 5,No,20,60154,Some college,1,Hispanic,Car,Male,0,Westchester,0
1990,3,CA,7/28/2022,Other,Home (own home),6 - 10,Yes,23,90732,High school graduate,1,White,Car,Female,0.043,San Pedro,0
1992,1,CA,7/27/2007,Home (own home),Restaurant / eating place,0 - 5,No,31,90503,Some college,1,White,Minivan,Female,0,Torrance,0
1993,3,WI,8/4/2007,Work,Someone else's home,6 - 10,No,46,53018,Some college,1,White,Car,Female,0,Delafield,0
1994,2,IL,7/21/2007,Work,Someone else's home,0 - 5,No,21,60613,College graduate,1,Asian,Car,Female,0,Chicago,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11091,4,IL,7/22/2022,Other,Someone else's home,6 - 10,Yes,46,60077,High school graduate,1,White,SUV,Female,0,Skokie,0
11095,5,NE,8/5/2007,Someone else's home,Someone else's home,0 - 5,Yes,25,68154,College graduate,1,White,SUV,Female,0,Omaha,0
11097,2,AL,7/21/2007,Home (own home),Restaurant / eating place,More than 20,No,21,35150,High school graduate,1,Black or African American,SUV,Male,0,Sylacauga,0
11101,2,CA,7/28/2007,Someone else's home,Home (own home),More than 20,No,28,90810,Some college,1,Black or African American,Car,Female,0.033,Long Beach,0


In [21]:
# Exporting Final cleaned data to csv
# Naming the index of the dataframe for export to csv
data07_df.index.names = ["nrs_id"]

# Exporting the data to CSV file
data07_df.to_csv(output07_data_file)

In [22]:
# Getting selected columns for the 2013 data
col_list = ["State", "MyZipCodeIs", "Time_Stamp", "data_session_original", "race_n", "Gender", "DriverAgeYears"
            ,"EducationLevel", "AreYouStudent", "AreYouEmployed", "VehicleType", "FromWhere", "ToWhere"
            ,"MilesTraveling", "AlcoholToday", "bac_from_blood"]
data13_df = pd.read_csv(data13_path, low_memory=False, usecols=col_list)

# Checking list of columns imported from CSV to ensure matchh with sort listed columns
col_list_check = data13_df.columns.values.tolist()
col_list_check

['data_session_original',
 'State',
 'Time_Stamp',
 'race_n',
 'Gender',
 'VehicleType',
 'FromWhere',
 'ToWhere',
 'MilesTraveling',
 'AlcoholToday',
 'DriverAgeYears',
 'MyZipCodeIs',
 'EducationLevel',
 'AreYouStudent',
 'AreYouEmployed',
 'bac_from_blood']

In [23]:
# Checking datatypes of the 2013 dataframe
data13_df.dtypes

data_session_original      int64
State                     object
Time_Stamp                object
race_n                    object
Gender                    object
VehicleType               object
FromWhere                 object
ToWhere                   object
MilesTraveling            object
AlcoholToday              object
DriverAgeYears           float64
MyZipCodeIs              float64
EducationLevel            object
AreYouStudent             object
AreYouEmployed            object
bac_from_blood            object
dtype: object

In [24]:
# Displaying 2013 data
data13_df

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood
0,3,CA,12/07/2013 01:36:27,8,Male,Car,Other,Other,0-5,No,43.0,93003.0,High school graduate,No,Employed Full-time,#NULL!
1,3,CA,12/07/2013 01:56:35,1,Male,Car,Other,Own home,More than 20,No,21.0,93033.0,Some college - no degree,No,Employed Full-time,#NULL!
2,3,CA,12/07/2013 02:17:15,8,Male,Car,Other,Own home,0-5,Yes,23.0,93035.0,Some college - no degree,Other/ Technical or Trade Program,Other __________________________,#NULL!
3,3,CA,12/07/2013 02:34:15,8,Male,Car,Someone else's home,Own home,0-5,No,19.0,93030.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0
4,3,CA,12/07/2013 02:41:16,#NULL!,Male,SUV/ Crossover,,,,,,,,,,#NULL!
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11317,4,UT,02/08/2014 23:46:43,1,Female,Car,Work,Own home,11-20,,56.0,84010.0,Associate's degree,No,Employed Full-time,0
11318,5,UT,02/09/2014 01:29:47,1,Female,SUV/ Crossover,Someone else's home,Own home,11-20,,18.0,84054.0,High school graduate,College/ Grad or Law School,Employed Part-time,0
11319,5,UT,02/09/2014 01:43:25,1,Female,Car,Someone else's home,Own home,11-20,,22.0,84037.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0
11320,5,UT,02/09/2014 02:03:32,1,Female,Car,Store or gas station,Other,6-10,No,20.0,84606.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0


In [25]:
# Checking count of NA or NaN values in the dataframe
data13_df.isna().sum()

data_session_original       0
State                       0
Time_Stamp                  0
race_n                      0
Gender                    275
VehicleType               253
FromWhere                2521
ToWhere                  2522
MilesTraveling           2526
AlcoholToday             5189
DriverAgeYears           2562
MyZipCodeIs              2573
EducationLevel           2541
AreYouStudent            2542
AreYouEmployed           2544
bac_from_blood              0
dtype: int64

In [26]:
# Removing all NA or NaN values from the dataframe
data13_df = data13_df.dropna()
data13_df

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood
0,3,CA,12/07/2013 01:36:27,8,Male,Car,Other,Other,0-5,No,43.0,93003.0,High school graduate,No,Employed Full-time,#NULL!
1,3,CA,12/07/2013 01:56:35,1,Male,Car,Other,Own home,More than 20,No,21.0,93033.0,Some college - no degree,No,Employed Full-time,#NULL!
2,3,CA,12/07/2013 02:17:15,8,Male,Car,Other,Own home,0-5,Yes,23.0,93035.0,Some college - no degree,Other/ Technical or Trade Program,Other __________________________,#NULL!
3,3,CA,12/07/2013 02:34:15,8,Male,Car,Someone else's home,Own home,0-5,No,19.0,93030.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0
5,3,CA,12/07/2013 13:28:52,1,Male,Car,Other,Other,More than 20,No,23.0,93033.0,Some college - no degree,No,Employed Full-time,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11301,2,UT,02/07/2014 23:25:54,1,Female,Car,Work,Own home,11-20,No,21.0,84041.0,High school graduate,No,Employed Part-time,#NULL!
11302,2,UT,02/07/2014 23:43:53,1,Male,Pickup,Own home,Someone else's home,0-5,No,19.0,84010.0,High school graduate,High School,Employed Full-time,#NULL!
11306,3,UT,02/08/2014 02:37:35,1,Male,Car,Own home,School/church,More than 20,No,23.0,84321.0,Associate's degree,College/ Grad or Law School,Employed Full-time,#NULL!
11314,4,UT,02/08/2014 23:01:07,5,Female,Car,Restaurant/eating place,Own home,0-5,No,40.0,84010.0,High school graduate,No,Employed Full-time,0


In [27]:
# Checking count of NA or NaN values in the dataframe after cleanup
data13_df.isna().sum()

data_session_original    0
State                    0
Time_Stamp               0
race_n                   0
Gender                   0
VehicleType              0
FromWhere                0
ToWhere                  0
MilesTraveling           0
AlcoholToday             0
DriverAgeYears           0
MyZipCodeIs              0
EducationLevel           0
AreYouStudent            0
AreYouEmployed           0
bac_from_blood           0
dtype: int64

In [28]:
# Checking data after cleanup
data13_df

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood
0,3,CA,12/07/2013 01:36:27,8,Male,Car,Other,Other,0-5,No,43.0,93003.0,High school graduate,No,Employed Full-time,#NULL!
1,3,CA,12/07/2013 01:56:35,1,Male,Car,Other,Own home,More than 20,No,21.0,93033.0,Some college - no degree,No,Employed Full-time,#NULL!
2,3,CA,12/07/2013 02:17:15,8,Male,Car,Other,Own home,0-5,Yes,23.0,93035.0,Some college - no degree,Other/ Technical or Trade Program,Other __________________________,#NULL!
3,3,CA,12/07/2013 02:34:15,8,Male,Car,Someone else's home,Own home,0-5,No,19.0,93030.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0
5,3,CA,12/07/2013 13:28:52,1,Male,Car,Other,Other,More than 20,No,23.0,93033.0,Some college - no degree,No,Employed Full-time,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11301,2,UT,02/07/2014 23:25:54,1,Female,Car,Work,Own home,11-20,No,21.0,84041.0,High school graduate,No,Employed Part-time,#NULL!
11302,2,UT,02/07/2014 23:43:53,1,Male,Pickup,Own home,Someone else's home,0-5,No,19.0,84010.0,High school graduate,High School,Employed Full-time,#NULL!
11306,3,UT,02/08/2014 02:37:35,1,Male,Car,Own home,School/church,More than 20,No,23.0,84321.0,Associate's degree,College/ Grad or Law School,Employed Full-time,#NULL!
11314,4,UT,02/08/2014 23:01:07,5,Female,Car,Restaurant/eating place,Own home,0-5,No,40.0,84010.0,High school graduate,No,Employed Full-time,0


In [29]:
# Removing #NULL! values from bac_from_blood
data13_df = data13_df[data13_df["bac_from_blood"] != "#NULL!"]
data13_df

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood
3,3,CA,12/07/2013 02:34:15,8,Male,Car,Someone else's home,Own home,0-5,No,19.0,93030.0,Some college - no degree,College/ Grad or Law School,Employed Part-time,0
5,3,CA,12/07/2013 13:28:52,1,Male,Car,Other,Other,More than 20,No,23.0,93033.0,Some college - no degree,No,Employed Full-time,0
9,1,FL,06/28/2013 10:54:21,1,Male,SUV/ Crossover,Own home,Other,6-10,No,46.0,33125.0,9th - 11th grade,No,Employed Full-time,0
15,4,FL,06/29/2013 23:13:17,1,Male,Car,Own home,Own home,11-20,Yes,25.0,33146.0,Professional degree,No,Employed Full-time,0
22,5,FL,06/30/2013 02:30:54,6,Female,Car,Someone else's home,Own home,0-5,No,23.0,33137.0,Associate's degree,College/ Grad or Law School,Employed Full-time,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11287,4,OK,09/07/2013 23:42:59,1,Male,Pickup,Someone else's home,Own home,6-10,No,27.0,73159.0,High school graduate,No,Employed Full-time,0
11292,5,OK,09/08/2013 03:08:10,1,Male,Pickup,Restaurant/eating place,Own home,0-5,Did not answer,22.0,73728.0,Professional degree,College/ Grad or Law School,Other __________________________,0
11299,1,UT,02/07/2014 16:37:07,1,Male,Car,Store or gas station,Own home,0-5,No,22.0,84010.0,High school graduate,No,Unemployed,0
11314,4,UT,02/08/2014 23:01:07,5,Female,Car,Restaurant/eating place,Own home,0-5,No,40.0,84010.0,High school graduate,No,Employed Full-time,0


In [30]:
data13_df["DriverAgeYears"].unique()

array([  19.,   23.,   46.,   25.,   43.,   33.,   26.,   20.,   22.,
         60.,   69.,   59.,   51.,   45.,   74.,   36.,   64.,   52.,
         24.,   56.,   30.,   49.,   53.,   18.,   41.,   21.,   27.,
         44.,   47.,   50.,   31.,   76.,   73.,   48.,   32.,   55.,
         65.,   63.,   42.,   37.,   28.,   29.,   34.,   70.,   38.,
         54.,   72.,   62.,   35.,   57.,   58.,   75.,   40.,   39.,
         79.,   66., 4516.,   71.,   61.,   67.,   82.,   81.,   16.,
         80.,   78.,   68.,   83.,   77.,   17.,    0.,   99., 7216.])

In [31]:
# Changing Zipcode and race_n to int64
data13_df = data13_df.astype({"MyZipCodeIs": "int64"
                              ,"race_n": "int64"
                              ,"DriverAgeYears": "int64"
                             })
data13_df.dtypes

data_session_original     int64
State                    object
Time_Stamp               object
race_n                    int64
Gender                   object
VehicleType              object
FromWhere                object
ToWhere                  object
MilesTraveling           object
AlcoholToday             object
DriverAgeYears            int64
MyZipCodeIs               int64
EducationLevel           object
AreYouStudent            object
AreYouEmployed           object
bac_from_blood           object
dtype: object

In [32]:
# Getting city name based on zip code in data
search = SearchEngine(simple_zipcode=True)
for index, row in data13_df.iterrows():
    try:
        zipcode = row["MyZipCodeIs"]
        getCityData = search.by_zipcode(zipcode)
        data13_df.loc[index, "CityName"] = getCityData.major_city
    except ValueError:
        print(f"Skipping {index} due to invalid zipcode data")

data13_df.head()

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood,CityName
3,3,CA,12/07/2013 02:34:15,8,Male,Car,Someone else's home,Own home,0-5,No,19,93030,Some college - no degree,College/ Grad or Law School,Employed Part-time,0,Oxnard
5,3,CA,12/07/2013 13:28:52,1,Male,Car,Other,Other,More than 20,No,23,93033,Some college - no degree,No,Employed Full-time,0,Oxnard
9,1,FL,06/28/2013 10:54:21,1,Male,SUV/ Crossover,Own home,Other,6-10,No,46,33125,9th - 11th grade,No,Employed Full-time,0,Miami
15,4,FL,06/29/2013 23:13:17,1,Male,Car,Own home,Own home,11-20,Yes,25,33146,Professional degree,No,Employed Full-time,0,Miami
22,5,FL,06/30/2013 02:30:54,6,Female,Car,Someone else's home,Own home,0-5,No,23,33137,Associate's degree,College/ Grad or Law School,Employed Full-time,0,Miami


In [33]:
# Checking Unique values in AreYouEmployed
data13_df["AreYouEmployed"].unique()

array(['Employed Part-time', 'Employed Full-time', 'Unemployed',
       'Retired', 'On Disability', 'Homemaker',
       'Other __________________________', 'Did not answer'], dtype=object)

In [34]:
# Converting AreYouEmployed to binary
data13_df["AreYouEmployed"] = data13_df["AreYouEmployed"].apply(lambda x: 1 if (x == "Employed Part-time") | 
                                                                (x == "Employed Full-time") 
                                                                else 0)
data13_df["AreYouEmployed"].unique()

array([1, 0], dtype=int64)

In [35]:
data13_df["AreYouStudent"].unique()

array(['College/ Grad or Law School', 'No',
       'Other/ Technical or Trade Program', 'High School',
       'Did not answer'], dtype=object)

In [36]:
# Converting AreYouStudent to binary
data13_df["AreYouStudent"] = data13_df["AreYouStudent"].apply(lambda x: 1 if (x != "Did not answer") & 
                                                                (x != "No") 
                                                                else 0)
data13_df["AreYouStudent"].unique()

array([1, 0], dtype=int64)

In [37]:
# Check unique race values
data13_df["race_n"].unique()

array([8, 1, 6, 9, 7, 3, 2, 5, 4], dtype=int64)

In [38]:
# Replacing numeric race values with string
race_dict = {1: "White"
             ,2: "Black/African American"
             ,3: "Asian"
             ,4: "Native American/Alaskan"
             ,5: "Native Hawaiian/Other Pacific Islander"
             ,6: "Unknown"
             ,7: "Other"
             ,8: "More than one"
             ,9: "no response"
            }
data13_df["race_n"] = data13_df["race_n"].map(lambda x: race_dict.get(x) if x in race_dict else x)
data13_df

Unnamed: 0,data_session_original,State,Time_Stamp,race_n,Gender,VehicleType,FromWhere,ToWhere,MilesTraveling,AlcoholToday,DriverAgeYears,MyZipCodeIs,EducationLevel,AreYouStudent,AreYouEmployed,bac_from_blood,CityName
3,3,CA,12/07/2013 02:34:15,More than one,Male,Car,Someone else's home,Own home,0-5,No,19,93030,Some college - no degree,1,1,0,Oxnard
5,3,CA,12/07/2013 13:28:52,White,Male,Car,Other,Other,More than 20,No,23,93033,Some college - no degree,0,1,0,Oxnard
9,1,FL,06/28/2013 10:54:21,White,Male,SUV/ Crossover,Own home,Other,6-10,No,46,33125,9th - 11th grade,0,1,0,Miami
15,4,FL,06/29/2013 23:13:17,White,Male,Car,Own home,Own home,11-20,Yes,25,33146,Professional degree,0,1,0,Miami
22,5,FL,06/30/2013 02:30:54,Unknown,Female,Car,Someone else's home,Own home,0-5,No,23,33137,Associate's degree,1,1,0,Miami
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11287,4,OK,09/07/2013 23:42:59,White,Male,Pickup,Someone else's home,Own home,6-10,No,27,73159,High school graduate,0,1,0,Oklahoma City
11292,5,OK,09/08/2013 03:08:10,White,Male,Pickup,Restaurant/eating place,Own home,0-5,Did not answer,22,73728,Professional degree,1,0,0,Cherokee
11299,1,UT,02/07/2014 16:37:07,White,Male,Car,Store or gas station,Own home,0-5,No,22,84010,High school graduate,0,0,0,Bountiful
11314,4,UT,02/08/2014 23:01:07,Native Hawaiian/Other Pacific Islander,Female,Car,Restaurant/eating place,Own home,0-5,No,40,84010,High school graduate,0,1,0,Bountiful


#### Team Member: Jagrati Joshi
    End of Task of sourcing data from csv files, cleaning up data and saving in 2 dataframes - data07_df and data13_df

In [39]:
# Exporting Final cleaned data to csv
# Naming the index of the dataframe for export to csv
data13_df.index.names = ["nrs_id"]

# Exporting the data to CSV file
data13_df.to_csv(output13_data_file)