In [1]:
import pandas as pd 
import os

In [13]:
# Define the directory containing the CSV files
csv_directory = "./PassengerVehicle_Stats"

# Get all CSV file paths in the directory.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# make the row order of the combined DataFrame reproducible across runs.
csv_files = sorted(
    os.path.join(csv_directory, file)
    for file in os.listdir(csv_directory)
    if file.endswith('.csv')
)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" ValueError when nothing was found.
if not csv_files:
    raise ValueError(f"No CSV files found in {csv_directory!r}")

# Load all CSV files into a list (one DataFrame per file, in sorted path order)
dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
vehicles_df = pd.concat(dataframes, ignore_index=True)

In [15]:
# Preview the first five rows of the combined DataFrame
first_rows = vehicles_df.head(n=5)
print(first_rows)

   Unnamed: 0  Public Vehicle Number    Status Vehicle Make Vehicle Model  \
0        1286                  12009  RESERVED    CHEVROLET       EXPRESS   
1        2095                  12248  INACTIVE     MERCEDES      SPRINTER   
2        7950                  13527  INACTIVE     VAN HOOL         TD925   
3        8700                  12248  INACTIVE     MERCEDES      SPRINTER   
4        9359                  13528  INACTIVE     VAN HOOL         TD925   

   Vehicle Model Year Vehicle Color Vehicle Fuel Source Wheelchair Accessible  \
0              2014.0         BLACK          Bio-Diesel                     N   
1              2010.0        SILVER          Bio-Diesel                     N   
2              2008.0           RED          Bio-Diesel                     N   
3              2010.0        SILVER          Bio-Diesel                     N   
4              2008.0           RED          Bio-Diesel                     N   

                              Company Name        

In [21]:
# Count the missing values in each column
missing_per_column = vehicles_df.isna().sum()
print(missing_per_column)


Unnamed: 0                                0
Public Vehicle Number                     0
Status                                    0
Vehicle Make                           7668
Vehicle Model                          7852
Vehicle Model Year                     7768
Vehicle Color                          7944
Vehicle Fuel Source                       0
Wheelchair Accessible                     0
Company Name                              0
Address                                7144
City                                   7144
State                                  7144
ZIP Code                               7144
Taxi Affiliation                      37016
Taxi Medallion License Management     37124
Record ID                                 0
dtype: int64


In [22]:
# Count the rows that exactly repeat an earlier row
duplicate_mask = vehicles_df.duplicated()
print(duplicate_mask.sum())

49806


In [25]:
# Identify the columns that contain at least one missing value
has_missing = vehicles_df.isna().any()
null_columns = vehicles_df.columns[has_missing]

# Report the missing-value count for just those columns
missing_counts = vehicles_df[null_columns].isna().sum()
print(missing_counts)


Vehicle Model                          7852
Vehicle Color                          7944
Address                                7144
City                                   7144
State                                  7144
ZIP Code                               7144
Taxi Affiliation                      37016
Taxi Medallion License Management     37124
dtype: int64


In [26]:
# Fill numerical columns with mean.
# A model year must be a whole number, so the mean is rounded before it is
# used as the fill value; the raw mean would write fractional years.
model_year = vehicles_df['Vehicle Model Year']
if model_year.notna().any():
    vehicles_df['Vehicle Model Year'] = model_year.fillna(round(model_year.mean()))

# Fill categorical columns with mode.
# mode() returns an empty Series when the column has no non-null value, so
# guard the lookup instead of letting the positional access raise.
make_modes = vehicles_df['Vehicle Make'].mode()
if not make_modes.empty:
    vehicles_df['Vehicle Make'] = vehicles_df['Vehicle Make'].fillna(make_modes.iloc[0])


In [27]:
# Remove repeated rows (the first occurrence is kept), then renumber the
# remaining rows 0..n-1 so the index has no gaps.
vehicles_df = (
    vehicles_df
    .drop_duplicates()
    .reset_index(drop=True)
)


In [28]:
def _is_digit_zip(value):
    """Return True when ``value`` is a ZIP code consisting only of digits.

    Missing values are rejected. Integral floats such as ``60607.0`` are
    reduced to their integer form first: pandas may parse a ZIP column that
    contains missing values as float64, and ``str(60607.0)`` is ``'60607.0'``,
    which would fail ``str.isdigit()`` and discard every row.
    """
    if pd.isna(value):
        return False
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).isdigit()


# Example: Filter invalid vehicle model years
# (a missing model year compares False, so those rows are dropped as well)
valid_year = vehicles_df['Vehicle Model Year'] >= 1900

# Example: Filter invalid ZIP Codes
# astype(bool) keeps the mask boolean even when the frame is empty.
valid_zip = vehicles_df['ZIP Code'].apply(_is_digit_zip).astype(bool)

# Apply both filters at once and take a copy so that later column assignments
# work on an independent frame rather than a view of the unfiltered one.
vehicles_df = vehicles_df[valid_year & valid_zip].copy()
