In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
from google.colab import drive
import glob
# Mount Google Drive so the data folder is reachable from the Colab runtime
drive.mount('/content/drive')

# Folder where the CSV files are stored -- replace with your own folder path.
# This assignment must stay active: every later cell builds its file path from
# folder_path, and a fresh kernel raises NameError if it is commented out.
folder_path = '/content/drive/MyDrive/CE252DATA/2017'


# List all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))


Mounted at /content/drive


In [None]:
# Household-level analysis: load hhpub.csv from the data folder.
# NOTE(review): the saved output shows a warning that points at this read_csv
# call -- possibly a mixed-dtype warning; confirm and pin dtypes if so.
file_path = os.path.join(folder_path, 'hhpub.csv')

df_original = pd.read_csv(filepath_or_buffer=file_path)
print(f"Shape: {df_original.shape}")

# Preview the first five households
display(df_original.head())


Shape: (129696, 58)


  df_original = pd.read_csv(file_path)


Unnamed: 0,HOUSEID,TRAVDAY,SAMPSTRAT,HOMEOWN,HHSIZE,HHVEHCNT,HHFAMINC,PC,SPHONE,TAB,...,SMPLSRCE,WTHHFIN,HBHUR,HTHTNRNT,HTPPOPDN,HTRESDN,HTEEMPDN,HBHTNRNT,HBPPOPDN,HBRESDN
0,30000007,2,3,1,3,5,7,2,1,2,...,2,187.31432,T,50,1500,750,750,20,750,300
1,30000008,5,2,1,2,4,8,1,1,2,...,2,69.513032,R,5,300,300,150,5,300,300
2,30000012,5,3,1,1,2,10,1,1,3,...,2,79.419586,C,80,17000,17000,5000,60,17000,7000
3,30000019,5,3,1,2,2,3,1,5,5,...,2,279.143588,S,40,300,300,150,50,750,300
4,30000029,3,3,1,2,2,5,2,5,1,...,2,103.240304,S,40,1500,750,750,40,1500,750


In [None]:
# Work on a separate (deep) copy so df_original keeps the data exactly as loaded
df_working = df_original.copy(deep=True)

print(f"Shape: {df_working.shape}")

Shape: (129696, 58)


In [None]:
# Keep only rows where HHSTATE is 'CA'.
# The explicit .copy() makes df_working an independent frame rather than a
# slice of the previous one, so the column assignments in later cells do not
# raise SettingWithCopyWarning.
df_working = df_working[df_working['HHSTATE'] == 'CA'].copy()

# Count how many rows remain
row_count = len(df_working)
print(f"Number of rows where HHSTATE = 'CA': {row_count}")

# Preview first 5 rows
display(df_working.head(5))

Number of rows where HHSTATE = 'CA': 26099


Unnamed: 0,HOUSEID,TRAVDAY,SAMPSTRAT,HOMEOWN,HHSIZE,HHVEHCNT,HHFAMINC,PC,SPHONE,TAB,...,SMPLSRCE,WTHHFIN,HBHUR,HTHTNRNT,HTPPOPDN,HTRESDN,HTEEMPDN,HBHTNRNT,HBPPOPDN,HBRESDN
6,30000041,4,3,1,2,2,11,1,1,1,...,2,788.61424,T,40,300,50,25,40,300,300
9,30000085,1,2,1,1,2,9,1,1,4,...,2,190.669041,U,50,17000,17000,5000,50,30000,17000
11,30000094,3,3,2,1,1,4,1,1,5,...,2,163.382292,T,20,300,300,150,20,300,300
19,30000155,1,1,1,1,2,-7,1,5,1,...,2,120.772451,C,30,3000,1500,1500,30,7000,1500
23,30000227,1,2,1,2,2,6,1,5,1,...,2,62.01579,S,5,3000,750,1500,5,750,300


In [None]:
# Normalise HHFAMINC to two-character string codes: cast to str, trim
# whitespace, drop any leading apostrophe, then left-pad with zeros.
df_working['HHFAMINC'] = (
    df_working['HHFAMINC']
    .astype(str)
    .str.strip()
    .str.lstrip("'")
    .str.zfill(2)
)

# List the distinct codes that remain after cleaning
print("Unique cleaned HHFAMINC values:")
print(sorted(df_working['HHFAMINC'].unique()))


Unique cleaned HHFAMINC values:
['-7', '-8', '-9', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']


In [None]:
# Income brackets: HHFAMINC codes collapsed into four numbered groups, each
# carrying the label shown in the result tables.
# NOTE(review): adjacent label bounds do not line up (35,000 / 35,001,
# 74,999 / 75,000, 125,000 / 125,001) -- confirm the intended bracket edges
# against the survey codebook.
group_mapping = {
    1: {'codes': ['01', '02', '03', '04'], 'desc': '$0 - 35,000'},
    2: {'codes': ['05', '06'], 'desc': '$35,001 - 74,999'},
    3: {'codes': ['07', '08'], 'desc': '$75,000 - 125,000'},
    4: {'codes': ['09', '10', '11'], 'desc': '$125,001 and above'},
}


def assign_group_desc(code):
    """Return the income-group label for one HHFAMINC code.

    The code is converted to a string and left-padded to two characters
    (so 1 becomes '01') before being looked up in ``group_mapping``.

    Returns:
        ``"<group number> (<description>)"`` for the first group whose code
        list contains the padded code, or ``'Unknown'`` when no group does.
    """
    padded = str(code).zfill(2)
    labels = (
        f"{number} ({entry['desc']})"
        for number, entry in group_mapping.items()
        if padded in entry['codes']
    )
    # First matching group wins; fall back to 'Unknown' for any other code
    return next(labels, 'Unknown')

# Label every household row with its income group
df_working['HHFAMINC_GROUP_DESC'] = df_working['HHFAMINC'].map(assign_group_desc)

# Tally households per group, ordered by group label
group_counts = (
    df_working['HHFAMINC_GROUP_DESC']
    .value_counts()
    .sort_index()
)
print("Number of households per income group:")
display(group_counts)


Number of households per income group:


Unnamed: 0_level_0,count
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1
"1 ($0 - 35,000)",6532
"2 ($35,001 - 74,999)",6860
"3 ($75,000 - 125,000)",6214
"4 ($125,001 and above)",5692
Unknown,801


In [None]:
# Normalise URBRUR to a zero-padded two-character string code ('01' / '02')
df_working['URBRUR'] = (
    df_working['URBRUR']
    .astype(str)
    .str.strip()
    .str.zfill(2)
)

# Readable label for each URBRUR code
urb_rur_mapping = {'01': 'Urban', '02': 'Rural'}
df_working['URBRUR_DESC'] = df_working['URBRUR'].map(urb_rur_mapping)

# Household counts: income group (rows) by urban/rural (columns)
grouped_counts = (
    df_working
    .groupby(['HHFAMINC_GROUP_DESC', 'URBRUR_DESC'])
    .size()
    .unstack(fill_value=0)
)

print("Number of households per income group by Urban/Rural:")
display(grouped_counts)


Number of households per income group by Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",865,5667
"2 ($35,001 - 74,999)",930,5930
"3 ($75,000 - 125,000)",774,5440
"4 ($125,001 and above)",554,5138
Unknown,88,713


In [None]:
# Mean HHVEHCNT for every income group (rows) by urban/rural (columns) cell
avg_vehicles = (
    df_working
    .groupby(['HHFAMINC_GROUP_DESC', 'URBRUR_DESC'])['HHVEHCNT']
    .mean()
    .unstack(fill_value=0)
)

print("Average number of vehicles per household by income group and Urban/Rural:")
display(avg_vehicles.round(2))  # two decimals for readability


Average number of vehicles per household by income group and Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",1.93,1.3
"2 ($35,001 - 74,999)",2.55,1.83
"3 ($75,000 - 125,000)",2.83,2.21
"4 ($125,001 and above)",3.25,2.43
Unknown,2.35,1.77


In [None]:
# Vehicle-level analysis: build the path to vehpub.csv in the data folder
file_path = os.path.join(folder_path, 'vehpub.csv')

# Parse the file into a DataFrame
df_vehpub = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# List the column names of the vehicle file
df_vehpub.columns

Index(['HOUSEID', 'VEHID', 'VEHYEAR', 'VEHAGE', 'MAKE', 'MODEL', 'FUELTYPE',
       'VEHTYPE', 'WHOMAIN', 'OD_READ', 'HFUEL', 'VEHOWNED', 'VEHOWNMO',
       'ANNMILES', 'HYBRID', 'PERSONID', 'TRAVDAY', 'HOMEOWN', 'HHSIZE',
       'HHVEHCNT', 'HHFAMINC', 'DRVRCNT', 'HHSTATE', 'HHSTFIPS', 'NUMADLT',
       'WRKCOUNT', 'TDAYDATE', 'LIF_CYC', 'MSACAT', 'MSASIZE', 'RAIL', 'URBAN',
       'URBANSIZE', 'URBRUR', 'CENSUS_D', 'CENSUS_R', 'CDIVMSAR', 'HH_RACE',
       'HH_HISP', 'HH_CBSA', 'SMPLSRCE', 'WTHHFIN', 'BESTMILE', 'BEST_FLG',
       'BEST_EDT', 'BEST_OUT', 'HBHUR', 'HTHTNRNT', 'HTPPOPDN', 'HTRESDN',
       'HTEEMPDN', 'HBHTNRNT', 'HBPPOPDN', 'HBRESDN', 'GSYRGAL', 'GSTOTCST',
       'FEGEMPG', 'FEGEMPGA', 'GSCOST', 'FEGEMPGF'],
      dtype='object')

In [None]:
# Work on a separate (deep) copy so df_vehpub keeps the data exactly as loaded
df_vehpub_working = df_vehpub.copy(deep=True)

print(f"Shape: {df_vehpub_working.shape}")

Shape: (256115, 60)


In [None]:
# Restrict the vehicle records to rows whose HHSTATE is 'CA'
df_vehpub_working = df_vehpub_working.loc[df_vehpub_working['HHSTATE'] == 'CA']

# Report how many vehicle rows are left
row_count = len(df_vehpub_working.index)
print(f"Number of rows where HHSTATE = 'CA': {row_count}")

# Preview the first five rows
display(df_vehpub_working.head())

Number of rows where HHSTATE = 'CA': 52215


Unnamed: 0,HOUSEID,VEHID,VEHYEAR,VEHAGE,MAKE,MODEL,FUELTYPE,VEHTYPE,WHOMAIN,OD_READ,...,HTEEMPDN,HBHTNRNT,HBPPOPDN,HBRESDN,GSYRGAL,GSTOTCST,FEGEMPG,FEGEMPGA,GSCOST,FEGEMPGF
17,30000041,1,2016,1,49,49046,3,1,2,1532,...,25,40,300,300,331.251446,947.02028,50,-9,2.858917,1
18,30000041,2,2011,6,49,49046,3,1,1,40560,...,25,40,300,300,207.269346,592.565787,48,-9,2.858917,1
27,30000085,1,2004,13,49,49482,1,4,1,145000,...,5000,50,30000,17000,902.362298,2579.778614,15,-9,2.858917,1
28,30000085,2,2004,13,2,2407,1,3,1,175000,...,5000,50,30000,17000,459.193033,1312.794615,23,-9,2.858917,1
31,30000094,1,2009,8,37,37031,1,1,1,-88,...,150,20,300,300,229.671414,656.611434,27,-9,2.858917,1


In [None]:
# Drop rows whose FUELTYPE is -8 or -7 (invalid/uncertain input)
df_vehpub_working = df_vehpub_working.loc[
    ~df_vehpub_working['FUELTYPE'].isin([-8, -7])
]

# Report the remaining shape
print(f"Shape after removing FUELTYPE = -8 and -7: {df_vehpub_working.shape}")

# Confirm which FUELTYPE codes are still present
print("Unique values in FUELTYPE after removal:")
print(sorted(df_vehpub_working['FUELTYPE'].unique()))

Shape after removing FUELTYPE = -8 and -7: (52153, 60)
Unique values in FUELTYPE after removal:
[np.int64(1), np.int64(2), np.int64(3), np.int64(97)]


In [None]:
# Remove rows where ANNMILES == -9 (invalid/uncertain input).
# The explicit .copy() makes the result an independent frame, so the HHFAMINC
# assignment in the next cell does not raise SettingWithCopyWarning.
df_vehpub_working = df_vehpub_working[df_vehpub_working['ANNMILES'] != -9].copy()

# Check the result
print("Shape after removing ANNMILES = -9:", df_vehpub_working.shape)

# Optional: check unique values in ANNMILES
print("Unique values in ANNMILES after removal:")
print(sorted(df_vehpub_working['ANNMILES'].unique()))

Shape after removing ANNMILES = -9: (38719, 60)
Unique values in ANNMILES after removal:
[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(10), np.int64(12), np.int64(14), np.int64(15), np.int64(16), np.int64(18), np.int64(20), np.int64(22), np.int64(24), np.int64(25), np.int64(26), np.int64(28), np.int64(29), np.int64(30), np.int64(32), np.int64(34), np.int64(35), np.int64(39), np.int64(40), np.int64(43), np.int64(44), np.int64(45), np.int64(48), np.int64(50), np.int64(54), np.int64(55), np.int64(60), np.int64(65), np.int64(66), np.int64(68), np.int64(70), np.int64(72), np.int64(75), np.int64(80), np.int64(84), np.int64(85), np.int64(87), np.int64(90), np.int64(92), np.int64(96), np.int64(100), np.int64(106), np.int64(109), np.int64(110), np.int64(113), np.int64(120), np.int64(121), np.int64(125), np.int64(129), np.int64(130), np.int64(133), np.int64(135), np.int64(140), np.int64(144), np.int64(145), np.int64

In [None]:
# Normalise HHFAMINC to two-character string codes: cast to str, trim
# whitespace, drop any leading apostrophe, then left-pad with zeros.
df_vehpub_working['HHFAMINC'] = (
    df_vehpub_working['HHFAMINC']
    .astype(str)
    .str.strip()
    .str.lstrip("'")
    .str.zfill(2)
)

# List the distinct codes that remain after cleaning
print("Unique cleaned HHFAMINC values:")
print(sorted(df_vehpub_working['HHFAMINC'].unique()))

Unique cleaned HHFAMINC values:
['-7', '-8', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']


In [None]:
# Remove rows where HHFAMINC is '-7' or '-8'.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df_vehpub_working = df_vehpub_working[~df_vehpub_working['HHFAMINC'].isin(['-7', '-8'])].copy()

# Check the shape after removal
print("Shape after removing HHFAMINC= -7 or -8:", df_vehpub_working.shape)

# Optional: check unique values remaining
print("Unique HHFAMINC values after removal:")
print(sorted(df_vehpub_working['HHFAMINC'].unique()))

Shape after removing HHFAMINC= -7 or -8: (37854, 60)
Unique HHFAMINC values after removal:
['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11']


In [None]:
# Income grouping reuses group_mapping and assign_group_desc from the household
# section earlier in this notebook. They are deliberately not redefined here:
# a second, identical definition would silently shadow the first and let the
# two analyses drift apart if only one copy were ever edited.
df_vehpub_working['HHFAMINC_GROUP_DESC'] = df_vehpub_working['HHFAMINC'].apply(assign_group_desc)

# Check counts per group
group_counts = df_vehpub_working['HHFAMINC_GROUP_DESC'].value_counts().sort_index()
print("Number of vehicles per income group:")
display(group_counts)


Number of vehicles per income group:


Unnamed: 0_level_0,count
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1
"1 ($0 - 35,000)",6097
"2 ($35,001 - 74,999)",9689
"3 ($75,000 - 125,000)",10719
"4 ($125,001 and above)",11349


In [None]:
# Normalise URBRUR to a zero-padded two-character string code ('01' / '02')
df_vehpub_working['URBRUR'] = (
    df_vehpub_working['URBRUR']
    .astype(str)
    .str.strip()
    .str.zfill(2)
)

# Readable label for each URBRUR code
urb_rur_mapping = {'01': 'Urban', '02': 'Rural'}
df_vehpub_working['URBRUR_DESC'] = df_vehpub_working['URBRUR'].map(urb_rur_mapping)

# Vehicle counts: income group (rows) by urban/rural (columns)
grouped_counts = (
    df_vehpub_working
    .groupby(['HHFAMINC_GROUP_DESC', 'URBRUR_DESC'])
    .size()
    .unstack(fill_value=0)
)

print("Number of vehicles per income group by Urban/Rural:")
display(grouped_counts)


Number of vehicles per income group by Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",1253,4844
"2 ($35,001 - 74,999)",1825,7864
"3 ($75,000 - 125,000)",1690,9029
"4 ($125,001 and above)",1437,9912


In [None]:
# Ensure vehicle age column is numeric (values that cannot be parsed become NaN)
df_vehpub_working['VEHAGE'] = pd.to_numeric(df_vehpub_working['VEHAGE'], errors='coerce')

# Remove rows where VEHAGE is -7 or -8, using isin like the other code filters.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_vehpub_working = df_vehpub_working[~df_vehpub_working['VEHAGE'].isin([-7, -8])].copy()

# Group by income group and Urban/Rural, calculate average vehicle age
avg_vehicle_age = df_vehpub_working.groupby(['HHFAMINC_GROUP_DESC', 'URBRUR_DESC'])['VEHAGE'].mean().unstack(fill_value=0)

print("Average vehicle age by income group and Urban/Rural:")
display(avg_vehicle_age.round(1))  # round to 1 decimal place


Average vehicle age by income group and Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",17.0,13.8
"2 ($35,001 - 74,999)",14.8,12.1
"3 ($75,000 - 125,000)",12.8,10.9
"4 ($125,001 and above)",11.2,9.5


In [None]:
# Coerce ANNMILES to a numeric dtype (values that cannot be parsed become NaN)
df_vehpub_working['ANNMILES'] = pd.to_numeric(
    df_vehpub_working['ANNMILES'],
    errors='coerce',
)

# Mean ANNMILES for every income group (rows) by urban/rural (columns) cell
avg_annual_miles = (
    df_vehpub_working
    .groupby(['HHFAMINC_GROUP_DESC', 'URBRUR_DESC'])['ANNMILES']
    .mean()
    .unstack(fill_value=0)
)

print("Average annual miles per vehicle by income group and Urban/Rural:")
display(avg_annual_miles.round(0))  # whole miles


Average annual miles per vehicle by income group and Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",6017.0,7628.0
"2 ($35,001 - 74,999)",7758.0,8141.0
"3 ($75,000 - 125,000)",8664.0,8947.0
"4 ($125,001 and above)",9848.0,9341.0


In [None]:
# Ensure Fuel Economy column is numeric (values that cannot be parsed become NaN)
df_vehpub_working['FEGEMPG'] = pd.to_numeric(df_vehpub_working['FEGEMPG'], errors='coerce')

# Keep only rows with a usable MPG figure. Without this filter the "valid" row
# count below is just the size of the whole frame, and NaN or non-positive
# placeholder values would be counted and pulled into the averages.
# NOTE(review): assumes FEGEMPG marks missing data with negative codes, like
# the -9 visible in FEGEMPGA -- confirm against the survey codebook.
df_vehpub_mpg_valid = df_vehpub_working[df_vehpub_working['FEGEMPG'] > 0]

# Count total valid rows for MPG
total_mpg_rows = len(df_vehpub_mpg_valid)
print(f"Total rows with valid FEGEMPG: {total_mpg_rows}")

# Group by income group and Urban/Rural, calculate average MPG
avg_mpg = df_vehpub_mpg_valid.groupby(
    ['HHFAMINC_GROUP_DESC', 'URBRUR_DESC']
)['FEGEMPG'].mean().unstack(fill_value=0)

print("Average miles per gallon by income group and Urban/Rural:")
display(avg_mpg.round(1))  # round to 1 decimal place

Total rows with valid FEGEMPG: 37749
Average miles per gallon by income group and Urban/Rural:


URBRUR_DESC,Rural,Urban
HHFAMINC_GROUP_DESC,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 ($0 - 35,000)",21.4,23.1
"2 ($35,001 - 74,999)",21.5,23.5
"3 ($75,000 - 125,000)",22.3,24.2
"4 ($125,001 and above)",23.4,25.4


In [None]:
# Show the cleaned vehicle frame, including the derived HHFAMINC_GROUP_DESC and
# URBRUR_DESC columns (the saved output shows only the first and last rows)
df_vehpub_working

Unnamed: 0,HOUSEID,VEHID,VEHYEAR,VEHAGE,MAKE,MODEL,FUELTYPE,VEHTYPE,WHOMAIN,OD_READ,...,HBPPOPDN,HBRESDN,GSYRGAL,GSTOTCST,FEGEMPG,FEGEMPGA,GSCOST,FEGEMPGF,HHFAMINC_GROUP_DESC,URBRUR_DESC
17,30000041,1,2016,1,49,49046,3,1,2,1532,...,300,300,331.251446,947.020280,50,-9,2.858917,1,"4 ($125,001 and above)",Urban
18,30000041,2,2011,6,49,49046,3,1,1,40560,...,300,300,207.269346,592.565787,48,-9,2.858917,1,"4 ($125,001 and above)",Urban
27,30000085,1,2004,13,49,49482,1,4,1,145000,...,30000,17000,902.362298,2579.778614,15,-9,2.858917,1,"4 ($125,001 and above)",Urban
28,30000085,2,2004,13,02,2407,1,3,1,175000,...,30000,17000,459.193033,1312.794615,23,-9,2.858917,1,"4 ($125,001 and above)",Urban
52,30000227,2,2004,13,07,7472,1,4,1,111710,...,750,300,399.788118,1142.960912,16,-9,2.858917,1,"2 ($35,001 - 74,999)",Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256090,40794179,1,2004,13,07,7482,2,4,1,253178,...,17000,3000,1338.281935,3729.903277,12,-9,2.787083,2,"2 ($35,001 - 74,999)",Urban
256094,40794233,1,2010,7,49,49482,1,4,1,47000,...,750,300,38.812640,110.962105,16,-9,2.858917,1,"3 ($75,000 - 125,000)",Urban
256095,40794233,2,2005,12,30,30047,1,1,2,-88,...,750,300,487.093044,1392.558421,23,-9,2.858917,1,"3 ($75,000 - 125,000)",Urban
256096,40794233,3,1997,20,34,34035,1,1,1,136000,...,750,300,25.141115,71.876353,13,-9,2.858917,1,"3 ($75,000 - 125,000)",Urban
