In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load Dataset
# Read the raw EV battery/performance table from the project's data folder.
df = pd.read_csv('../data/EV_Battery_Performance.csv')

# Label the rows 1..N instead of the default 0..N-1.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,...,towing_capacity_kg,cargo_volume_l,seats,drivetrain,segment,length_mm,width_mm,height_mm,car_body_type,source_url
1,Abarth,500e Convertible,155,37.8,Lithium-ion,192.0,235.0,156,225,7.0,...,0.0,185.0,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1904/Abarth-500e-C...
2,Abarth,500e Hatchback,155,37.8,Lithium-ion,192.0,235.0,149,225,7.0,...,0.0,185.0,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1903/Abarth-500e-H...
3,Abarth,600e Scorpionissima,200,50.8,Lithium-ion,102.0,345.0,158,280,5.9,...,0.0,360.0,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3057/Abarth-600e-S...
4,Abarth,600e Turismo,200,50.8,Lithium-ion,102.0,345.0,158,280,6.2,...,0.0,360.0,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3056/Abarth-600e-T...
5,Aiways,U5,150,60.0,Lithium-ion,,310.0,156,315,7.5,...,,496.0,5,FWD,JC - Medium,4680,1865,1700,SUV,https://ev-database.org/car/1678/Aiways-U5


In [None]:
# Check Missing & Duplicate Values
# First the per-column count of missing entries, then the number of rows that
# repeat an earlier row exactly.
print("\nMissing Values:\n", df.isna().sum())
print("\nDuplicate Rows:", df.duplicated(keep='first').sum())


Missing Values:
 brand                          0
model                          0
top_speed_kmh                  0
battery_capacity_kWh           0
battery_type                   0
number_of_cells              200
torque_nm                      7
efficiency_wh_per_km           0
range_km                       0
acceleration_0_100_s           0
fast_charging_power_kw_dc      1
fast_charge_port               1
towing_capacity_kg            26
cargo_volume_l                 1
seats                          0
drivetrain                     0
segment                        0
length_mm                      0
width_mm                       0
height_mm                      0
car_body_type                  0
source_url                     0
dtype: int64

Duplicate Rows: 0


In [None]:
# Drop Unnecessary Columns
# errors='ignore' makes the cell safe to re-run: columns that are already gone
# are skipped instead of raising a KeyError.
df.drop(
    columns=['source_url', 'fast_charge_port', 'seats'],
    errors='ignore',
    inplace=True,
)
print("Irrelevant columns dropped")

Irrelevant columns dropped


In [None]:
# Handle Missing Values (Preserve Data Types)
# Impute each numeric column's missing entries with that column's mean.
#
# The previous `dtype == 'int64'` test was effectively unreachable: a NumPy
# int64 column cannot contain NaN, so a column that reports missing values is
# never int64. `is_integer_dtype` also recognises pandas' nullable integer
# dtypes, which are the only integer columns that can actually have gaps.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column

    mean_val = df[col].mean()  # mean of the observed (non-missing) values
    if pd.isna(mean_val):
        # Column is entirely missing: there is no mean to impute, leave it as is.
        continue

    if pd.api.types.is_integer_dtype(df[col]):
        # Integer column: fill with a whole number so the dtype is kept.
        df[col] = df[col].fillna(int(round(mean_val)))
    else:
        # Float column: fill with the mean rounded to two decimals.
        df[col] = df[col].fillna(round(mean_val, 2))


In [None]:
# For categorical columns — fill with mode
# Each object-dtype column with gaps is filled with its most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    if not df[col].isnull().any():
        continue  # column is already complete

    modes = df[col].mode()  # missing entries are ignored when counting
    if modes.empty:
        # Every entry is missing, so no mode exists. Indexing the empty result
        # would raise; leave the column untouched so the final missing-value
        # check still reports it.
        continue

    # When several values tie, mode() returns them sorted; take the first one
    # by position rather than by label.
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Remove Duplicate Rows
# Remember the row count first so the number of removed rows can be reported.
initial_len = df.shape[0]

# Keep the first occurrence of each repeated row and drop the rest in place.
df.drop_duplicates(keep='first', inplace=True)
print(f"Duplicates removed: {initial_len - len(df)} rows dropped")


Duplicates removed: 0 rows dropped


In [None]:
# Handle Outliers using IQR (Preserving Data Type)
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled back to the nearest
# fence. The fences are usually fractional, and clipping an integer column
# with a fractional bound silently upcasts it to float64. For integer columns
# the fences are therefore rounded inward to whole numbers and the original
# dtype is restored afterwards.
# NOTE(review): a column whose IQR is 0 collapses to a single value under this
# rule — confirm that is acceptable for every numeric column in the data.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    if pd.isna(lower_bound) or pd.isna(upper_bound):
        # No usable quantiles (e.g. no observed values): nothing to clip.
        continue

    original_dtype = df[col].dtype
    if pd.api.types.is_integer_dtype(original_dtype):
        # Round inward so clipped values stay inside the fences and remain
        # whole numbers.
        lower_bound = int(np.ceil(lower_bound))
        upper_bound = int(np.floor(upper_bound))

    clipped = df[col].clip(lower=lower_bound, upper=upper_bound)
    df[col] = clipped.astype(original_dtype)


In [None]:
# Replace Zero Values in Important Columns with Mean
# In the columns listed below a value of 0 is treated as "not recorded" and is
# replaced by the mean of that column's non-zero values.
# NOTE(review): a towing capacity of 0 may be a genuine value (vehicle not
# rated for towing) — confirm that treating it as missing is intended.
numeric_cols = [
    'battery_capacity_kWh', 'number_of_cells', 'torque_nm',
    'acceleration_0_100_s', 'fast_charging_power_kw_dc', 'towing_capacity_kg'
]

for col in numeric_cols:
    if col not in df.columns:
        continue  # column was dropped or never present; skip it

    # Mean over the non-zero entries only, so the zeros being replaced do not
    # drag the replacement value down.
    mean_val = df.loc[df[col] != 0, col].mean()
    if pd.isna(mean_val):
        # No non-zero values to average. Without this guard an integer column
        # raises in int(round(nan)) and a float column has its zeros turned
        # into NaN.
        continue

    if pd.api.types.is_integer_dtype(df[col]):
        # Keep integer columns integral.
        df[col] = df[col].replace(0, int(round(mean_val)))
    else:
        df[col] = df[col].replace(0, round(mean_val, 2))


In [None]:
# Confirm Data Types & Info
# Prints the index range plus each column's non-null count and dtype, so the
# result of the cleaning cells above can be checked at a glance.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 1 to 474
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   brand                      474 non-null    object 
 1   model                      474 non-null    object 
 2   top_speed_kmh              474 non-null    float64
 3   battery_capacity_kWh       474 non-null    float64
 4   battery_type               474 non-null    object 
 5   number_of_cells            474 non-null    float64
 6   torque_nm                  474 non-null    float64
 7   efficiency_wh_per_km       474 non-null    int64  
 8   range_km                   474 non-null    int64  
 9   acceleration_0_100_s       474 non-null    float64
 10  fast_charging_power_kw_dc  474 non-null    float64
 11  towing_capacity_kg         474 non-null    float64
 12  cargo_volume_l             474 non-null    float64
 13  drivetrain                 474 non-null    object 

In [None]:
# List Categorical Columns
# Collect the names of every object-dtype column.
categorical_cols = list(df.select_dtypes(include='object').columns)
print("Categorical Columns:", categorical_cols)

# Show the first rows restricted to those columns.
print(df.loc[:, categorical_cols].head())


Categorical Columns: ['brand', 'model', 'battery_type', 'drivetrain', 'segment', 'car_body_type']
    brand                model battery_type drivetrain       segment  \
1  Abarth     500e Convertible  Lithium-ion        FWD   B - Compact   
2  Abarth       500e Hatchback  Lithium-ion        FWD   B - Compact   
3  Abarth  600e Scorpionissima  Lithium-ion        FWD  JB - Compact   
4  Abarth         600e Turismo  Lithium-ion        FWD  JB - Compact   
5  Aiways                   U5  Lithium-ion        FWD   JC - Medium   

  car_body_type  
1     Hatchback  
2     Hatchback  
3           SUV  
4           SUV  
5           SUV  


In [None]:
# Final Check for Missing Values
# Per-column count of missing entries after cleaning; every count should be 0.
print("Remaining Missing Values:\n", df.isna().sum())


Remaining Missing Values:
 brand                        0
model                        0
top_speed_kmh                0
battery_capacity_kWh         0
battery_type                 0
number_of_cells              0
torque_nm                    0
efficiency_wh_per_km         0
range_km                     0
acceleration_0_100_s         0
fast_charging_power_kw_dc    0
towing_capacity_kg           0
cargo_volume_l               0
drivetrain                   0
segment                      0
length_mm                    0
width_mm                     0
height_mm                    0
car_body_type                0
dtype: int64


In [None]:
# Save the Cleaned Dataset
# index=False keeps the row labels out of the file; only the data columns are
# written.
df.to_csv(
    path_or_buf='../data/EV_Battery_Performance_Clean.csv',
    index=False,
)
print("Cleaned dataset saved as: EV_Battery_Performance_Clean.csv")


Cleaned dataset saved as: EV_Battery_Performance_Clean.csv
