In [6]:
import pandas as pd

# Columns that must hold strictly positive values for a row to be kept.
numeric_cols = ["engine_cc", "horsepower", "mileage_km_per_l", "price_usd"]


def clean_cars(raw_df: pd.DataFrame, min_year: int = 1990, max_year: int = 2025,
               positive_cols=None) -> pd.DataFrame:
    """Return a cleaned copy of a cars dataset; the input frame is not modified.

    Steps, in order:
      1. lower-case and strip whitespace from the column names,
      2. drop fully duplicated rows (the first occurrence is kept),
      3. keep rows whose ``manufacture_year`` lies in ``[min_year, max_year]``,
      4. keep rows where every column in ``positive_cols`` is > 0.

    Rows holding NaN in ``manufacture_year`` or in any of ``positive_cols``
    are dropped as well, because comparisons against NaN evaluate to False.

    Args:
        raw_df: The dataset as loaded; column names may differ in case or
            carry surrounding whitespace.
        min_year: Earliest manufacture year to keep (inclusive).
        max_year: Latest manufacture year to keep (inclusive).
        positive_cols: Names (already normalized) of the columns that must be
            strictly positive. Defaults to the module-level ``numeric_cols``.

    Returns:
        A new DataFrame with normalized column names containing only the
        rows that passed every filter. Surviving rows keep their original
        index labels.

    Raises:
        KeyError: If ``manufacture_year`` or any of ``positive_cols`` is
            absent after the column names have been normalized.
    """
    if positive_cols is None:
        positive_cols = numeric_cols
    positive_cols = list(positive_cols)

    cleaned = raw_df.copy()

    # standardize column names
    cleaned.columns = cleaned.columns.str.lower().str.strip()

    # Fail early with one message naming every missing column, rather than
    # a bare KeyError for whichever column happens to be accessed first.
    required = ["manufacture_year", *positive_cols]
    missing = [name for name in required if name not in cleaned.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    # remove duplicate rows
    cleaned = cleaned.drop_duplicates()

    # keep realistic manufacture years (both bounds inclusive)
    year_ok = cleaned["manufacture_year"].between(min_year, max_year)

    # remove invalid numeric values: a single row-wise mask instead of
    # re-filtering the frame once per column
    values_ok = (cleaned[positive_cols] > 0).all(axis=1)

    return cleaned[year_ok & values_ok]


# Guarded so that importing this code (e.g. from a test) does not touch the
# filesystem; in a notebook or a direct script run the guard is true and the
# globals `df` and `clean_df` are defined exactly as before.
if __name__ == "__main__":
    # load the csv file
    df = pd.read_csv("global_cars_dataset_synthetic.csv")

    # clean a copy of the dataset; `df` itself stays untouched
    clean_df = clean_cars(df)

    # save cleaned dataset
    clean_df.to_csv("cleaned_global_cars_dataset.csv", index=False)

    # compare shapes
    print("original shape:", df.shape)
    print("cleaned shape:", clean_df.shape)

    # Cleaning never adds or drops columns, so equal shapes means that no
    # rows were removed.
    if df.shape == clean_df.shape:
        print("\nThe dataset was already clean. No changes were required.")
    else:
        print("\nCleaning applied. Some rows were removed.")

    # show first 5 rows
    print("\nfirst 5 rows of cleaned dataset:")
    print(clean_df.head())


original shape: (300, 11)
cleaned shape: (300, 11)

The dataset was already clean. No changes were required.

first 5 rows of cleaned dataset:
     car_id     brand  manufacture_year  body_type fuel_type transmission  \
0  CAR_0001  Mercedes              2006        SUV    Petrol       Manual   
1  CAR_0002    Nissan              2023      Coupe    Petrol    Automatic   
2  CAR_0003    Nissan              2007  Hatchback    Diesel       Manual   
3  CAR_0004    Nissan              2013      Coupe    Petrol       Manual   
4  CAR_0005   Hyundai              2009  Hatchback    Hybrid    Automatic   

   engine_cc  horsepower  mileage_km_per_l  price_usd manufacturing_country  
0       4089         547                17      73407                   USA  
1       4618         167                25      79370                   USA  
2       1802         110                16      76549                 China  
3       1835         373                16      48722                   USA  
4   