In [13]:
import pandas as pd

# Cleaning pipeline for the synthetic global-cars dataset:
#   read raw CSV -> normalize headers -> de-duplicate -> filter rows -> write CSV.

# Raw data stays untouched in `df`; all cleaning happens on a separate copy.
df = pd.read_csv("global_cars_dataset_synthetic.csv")
clean_df = df.copy()

# Normalize headers: lowercase first, then trim surrounding whitespace,
# so the column lookups below can use one consistent spelling.
lowered_headers = clean_df.columns.str.lower()
clean_df.columns = lowered_headers.str.strip()

# Exact duplicate rows carry no extra information; keep the first of each.
clean_df = clean_df.drop_duplicates()

# Keep only manufacture years inside the accepted window (both ends inclusive).
year_in_range = clean_df["manufacture_year"].between(1990, 2025)
clean_df = clean_df[year_in_range]

# Every measured quantity must be strictly positive; a row is kept only
# when all of these columns pass the check.
numeric_cols = ["engine_cc", "horsepower", "mileage_km_per_l", "price_usd"]
all_positive = clean_df[numeric_cols].gt(0).all(axis=1)
clean_df = clean_df[all_positive]

# Persist the cleaned rows without the pandas index column.
clean_df.to_csv("cleaned_global_cars_dataset.csv", index=False)

# NOTE(review): this prints the raw frame `df`, not `clean_df` — confirm that
# showing the uncleaned data here is intended.
print(df)


       Car_ID     Brand  Manufacture_Year  Body_Type Fuel_Type Transmission  \
0    CAR_0001  Mercedes              2006        SUV    Petrol       Manual   
1    CAR_0002    Nissan              2023      Coupe    Petrol    Automatic   
2    CAR_0003    Nissan              2007  Hatchback    Diesel       Manual   
3    CAR_0004    Nissan              2013      Coupe    Petrol       Manual   
4    CAR_0005   Hyundai              2009  Hatchback    Hybrid    Automatic   
..        ...       ...               ...        ...       ...          ...   
295  CAR_0296      Audi              2015     Pickup    Hybrid    Automatic   
296  CAR_0297      Ford              2023  Hatchback    Petrol       Manual   
297  CAR_0298  Mercedes              2020        SUV  Electric    Automatic   
298  CAR_0299      Ford              2023      Coupe    Diesel       Manual   
299  CAR_0300       Kia              2023        SUV    Hybrid       Manual   

     Engine_CC  Horsepower  Mileage_km_per_l  Price