In [12]:
# ===========================================
# Week 1 - Data Understanding & Cleaning
# ===========================================

# 1Ô∏è‚É£ Import Required Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ===========================================
# 2Ô∏è‚É£ Load the Dataset (from your actual path)
# ===========================================
data_path = r"E:\sills4future\electric_vehicles_spec_2025.csv(1).csv"

if not os.path.exists(data_path):
    raise FileNotFoundError(f"‚ùå File not found at: {data_path}")

df = pd.read_csv(data_path)
print("‚úÖ Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ===========================================
# 3Ô∏è‚É£ Handle Empty Spaces as Missing Values
# ===========================================
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
print("\nüö® Missing values after converting blanks:")
print(df.isnull().sum())

# ===========================================
# 4Ô∏è‚É£ Basic Info and Summary
# ===========================================
print("\nüîç Dataset Info:")
df.info()

print("\nüìä Summary Statistics:")
print(df.describe(include='all'))

# ===========================================
# 5Ô∏è‚É£ Handle Missing Values
# ===========================================
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("\n‚úÖ Missing values handled successfully!")

# ===========================================
# 6Ô∏è‚É£ Clean Numeric Columns (Remove Units)
# ===========================================
def clean_numeric(col):
    return (
        col.astype(str)
        .str.replace(" km/h", "", regex=False)
        .str.replace(" km", "", regex=False)
        .str.replace(" sec", "", regex=False)
        .str.replace(" Wh/km", "", regex=False)
        .str.replace(" kW", "", regex=False)
        .str.replace("*", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.extract("([-+]?\d*\.?\d+)")[0]
        .astype(float)
    )

cols_to_clean = ['Range', 'Range (km)', 'Battery Capacity (kWh)', 'Top Speed', 'Efficiency', '0 - 100']
for col in cols_to_clean:
    if col in df.columns:
        df[col] = clean_numeric(df[col])

print("\nüßπ Numeric columns cleaned (units removed).")

# ===========================================
# 7Ô∏è‚É£ One-Hot Encode Brand Column
# ===========================================
if 'Brand' in df.columns:
    df = pd.get_dummies(df, columns=['Brand'], prefix='Brand', drop_first=True)
    print("‚úÖ 'Brand' column one-hot encoded successfully!")
else:
    print("‚ö†Ô∏è 'Brand' column not found in dataset, skipping encoding.")

# ===========================================
# 8Ô∏è‚É£ Detect and Visualize Outliers
# ===========================================
plt.figure(figsize=(8, 5))
range_col = 'Range (km)' if 'Range (km)' in df.columns else 'Range'
if range_col in df.columns:
    sns.boxplot(x=df[range_col])
    plt.title('üìà Range Distribution with Outliers')
    os.makedirs(r"E:\sills4future", exist_ok=True)
    plt.savefig(r"E:\sills4future\range_distribution.png")
    plt.show()
else:
    print("‚ö†Ô∏è Range column not found, skipping outlier plot.")

# ===========================================
# üîü Save Cleaned Dataset
# ===========================================
cleaned_path = r"E:\sills4future\cleaned_ev_data.csv"
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
df.to_csv(cleaned_path, index=False)
print(f"\nüíæ Cleaned dataset saved as: {cleaned_path}")

print("\n‚úÖ Week 1 - Data Understanding & Cleaning completed successfully!")


‚úÖ Dataset loaded successfully!
Shape: (478, 22)
Columns: ['brand', 'model', 'top_speed_kmh', 'battery_capacity_kWh', 'battery_type', 'number_of_cells', 'torque_nm', 'efficiency_wh_per_km', 'range_km', 'acceleration_0_100_s', 'fast_charging_power_kw_dc', 'fast_charge_port', 'towing_capacity_kg', 'cargo_volume_l', 'seats', 'drivetrain', 'segment', 'length_mm', 'width_mm', 'height_mm', 'car_body_type', 'source_url']

üö® Missing values after converting blanks:
brand                          0
model                          1
top_speed_kmh                  0
battery_capacity_kWh           0
battery_type                   0
number_of_cells              202
torque_nm                      7
efficiency_wh_per_km           0
range_km                       0
acceleration_0_100_s           0
fast_charging_power_kw_dc      1
fast_charge_port               1
towing_capacity_kg            26
cargo_volume_l                 1
seats                          0
drivetrain                     0
segment

<Figure size 800x500 with 0 Axes>