In [3]:
import pandas as pd 
import os

In [4]:
# Define the directory containing the CSV files
csv_directory = "./PassengerVehicle_Stats"

# Get all CSV file paths in the directory
csv_files = [os.path.join(csv_directory, file) for file in os.listdir(csv_directory) if file.endswith('.csv')]

# Load all CSV files into a list
dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into a single DataFrame
vehicles_df = pd.concat(dataframes, ignore_index=True)

In [5]:
print(vehicles_df.head())

   Unnamed: 0  Public Vehicle Number    Status Vehicle Make Vehicle Model  \
0        1286                  12009  RESERVED    CHEVROLET       EXPRESS   
1        2095                  12248  INACTIVE     MERCEDES      SPRINTER   
2        7950                  13527  INACTIVE     VAN HOOL         TD925   
3        8700                  12248  INACTIVE     MERCEDES      SPRINTER   
4        9359                  13528  INACTIVE     VAN HOOL         TD925   

   Vehicle Model Year Vehicle Color Vehicle Fuel Source Wheelchair Accessible  \
0              2014.0         BLACK          Bio-Diesel                     N   
1              2010.0        SILVER          Bio-Diesel                     N   
2              2008.0           RED          Bio-Diesel                     N   
3              2010.0        SILVER          Bio-Diesel                     N   
4              2008.0           RED          Bio-Diesel                     N   

                              Company Name        

In [6]:
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66408 entries, 0 to 66407
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Unnamed: 0                          66408 non-null  int64  
 1   Public Vehicle Number               66408 non-null  int64  
 2   Status                              66408 non-null  object 
 3   Vehicle Make                        58740 non-null  object 
 4   Vehicle Model                       58556 non-null  object 
 5   Vehicle Model Year                  58640 non-null  float64
 6   Vehicle Color                       58464 non-null  object 
 7   Vehicle Fuel Source                 66408 non-null  object 
 8   Wheelchair Accessible               66408 non-null  object 
 9   Company Name                        66408 non-null  object 
 10  Address                             59264 non-null  object 
 11  City                                59264

In [7]:
# Check for null values
print(vehicles_df.isnull().sum())

Unnamed: 0                                0
Public Vehicle Number                     0
Status                                    0
Vehicle Make                           7668
Vehicle Model                          7852
Vehicle Model Year                     7768
Vehicle Color                          7944
Vehicle Fuel Source                       0
Wheelchair Accessible                     0
Company Name                              0
Address                                7144
City                                   7144
State                                  7144
ZIP Code                               7144
Taxi Affiliation                      37016
Taxi Medallion License Management     37124
Record ID                                 0
dtype: int64


In [8]:
# Check for duplicates
print(vehicles_df.duplicated().sum())

49806


In [9]:
# Remove duplicates based on the "Public Vehicle Number" column
vehicles_df = vehicles_df.drop_duplicates(subset='Public Vehicle Number', keep='first')

In [10]:
# Reset the index after dropping duplicates
vehicles_df = vehicles_df.reset_index(drop=True)

In [11]:
# Check for duplicates
print(vehicles_df.duplicated().sum())

0


In [12]:
vehicles_df.shape

(14473, 17)

In [13]:
# Remove rows where 'Vehicle Model Year' is less than 1900
vehicles_df = vehicles_df[vehicles_df['Vehicle Model Year'] >= 1900]

# Reset the index
vehicles_df = vehicles_df.reset_index(drop=True)

In [14]:
vehicles_df.shape

(12652, 17)

In [15]:
vehicles_df.isnull().sum()

Unnamed: 0                               0
Public Vehicle Number                    0
Status                                   0
Vehicle Make                            12
Vehicle Model                           48
Vehicle Model Year                       0
Vehicle Color                           64
Vehicle Fuel Source                      0
Wheelchair Accessible                    0
Company Name                             0
Address                                  0
City                                     0
State                                    0
ZIP Code                                 0
Taxi Affiliation                      6270
Taxi Medallion License Management     6293
Record ID                                0
dtype: int64

In [16]:
columns_to_check = ['Address', 'City', 'State','ZIP Code']

vehicles_df.dropna(subset=columns_to_check, inplace=True)

In [17]:
vehicles_df.isnull().sum()

Unnamed: 0                               0
Public Vehicle Number                    0
Status                                   0
Vehicle Make                            12
Vehicle Model                           48
Vehicle Model Year                       0
Vehicle Color                           64
Vehicle Fuel Source                      0
Wheelchair Accessible                    0
Company Name                             0
Address                                  0
City                                     0
State                                    0
ZIP Code                                 0
Taxi Affiliation                      6270
Taxi Medallion License Management     6293
Record ID                                0
dtype: int64

In [19]:
vehicles_df['Vehicle Model Year'].fillna(0, inplace = True)
vehicles_df['Vehicle Model Year']=vehicles_df['Vehicle Model Year'].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  vehicles_df['Vehicle Model Year'].fillna(0, inplace = True)


In [20]:
vehicles_df.fillna("unknown", inplace = True)

In [21]:
Q1 = vehicles_df['Vehicle Model Year'].quantile(0.25)
Q3 = vehicles_df['Vehicle Model Year'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = vehicles_df[(vehicles_df['Vehicle Model Year'] < lower_bound) | (vehicles_df['Vehicle Model Year'] > upper_bound)]

print("Outliers:\n", outliers)

vehicles_df['Vehicle Model Year'].value_counts()

Outliers:
        Unnamed: 0  Public Vehicle Number    Status  Vehicle Make  \
101           584                  50000  INACTIVE  FREIGHTLINER   
102           881                  15065  INACTIVE      CHAMPION   
104          1422                1000009  INACTIVE      CHAMPION   
109          2457                1000013  INACTIVE      CHAMPION   
113          3026                  13754  INACTIVE          INTL   
...           ...                    ...       ...           ...   
7539         1351                     11  INACTIVE         YODER   
7542         6393                     48  INACTIVE        MARTIN   
7547        12146                     23  INACTIVE        MARTIN   
7548        12169                     12  INACTIVE    STUDEBAKER   
10117        8009                   5286   REVOKED       unknown   

        Vehicle Model  Vehicle Model Year Vehicle Color Vehicle Fuel Source  \
101    X LINE SHUTTLE                2000         GREEN              Diesel   
102           

Vehicle Model Year
2023    1395
2014    1302
2013    1151
2012    1040
2022     993
2019     740
2024     708
2011     705
2015     700
2016     669
2020     556
2021     545
2017     487
2018     453
2010     267
2008     167
2006     154
2007     109
2009     107
2005      64
2001      57
2025      50
2004      43
2002      35
2003      29
1999      27
1997      22
2000      20
1998      13
1996       7
1988       5
1991       5
1992       4
1995       4
1984       3
1993       3
1986       2
1990       2
1994       2
1989       2
1981       1
1983       1
1987       1
1985       1
1980       1
Name: count, dtype: int64

In [22]:
vehicles_df.shape

(12652, 17)