In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Cleaning & feature-engineering pipeline for the car dataset:
# load -> drop redundant columns -> impute -> log transform -> interaction
# feature -> one-hot encode -> standardize -> save.

# 1️⃣ Load the dataset
df = pd.read_csv('car_data.csv')
print("✅ Loaded dataset. Shape:", df.shape)

# 2️⃣ Drop redundant columns
# Both columns are optional: errors='ignore' skips whichever is absent instead
# of raising KeyError.
df = df.drop(columns=['Unnamed: 0', 'car_name'], errors='ignore')

# 3️⃣ Fill missing values
# Assign the filled Series back to the frame. `df[col].fillna(..., inplace=True)`
# operates on a temporary object and does not reliably update `df`
# (it is a no-op under pandas Copy-on-Write).
for col in df.select_dtypes(include='number').columns:
    # Numeric columns: impute with the column median.
    df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include='object').columns:
    # Object columns: impute with the most frequent value.
    modes = df[col].mode()
    if not modes.empty:  # mode() is empty when the whole column is NaN
        df[col] = df[col].fillna(modes.iloc[0])

# 4️⃣ Log transform skewed numeric features
# Keep references to the raw price and distance first: the interaction feature
# in step 5 must be built from the untransformed values.
raw_selling_price = df['selling_price']
raw_km_driven = df['km_driven']

for col in ['km_driven', 'selling_price', 'engine']:
    df[col] = np.log1p(df[col])

# 5️⃣ Create interaction feature: price per km
# The +1 in the denominator avoids division by zero for cars with 0 km.
# NOTE(review): this feature is derived from selling_price; if selling_price is
# the prediction target downstream, it leaks the target — confirm before use.
df['price_per_km'] = raw_selling_price / (raw_km_driven + 1)

# 6️⃣ Encode categorical variables

# Categories:
# - transmission_type: Manual, Automatic → Binary
# - fuel_type: Petrol, Diesel, CNG → One-hot (2 cols)
# - seller_type: Individual, Dealer, Trustmark Dealer → One-hot (2 cols)
# - brand, model: One-hot (first level dropped, like the others)

cat_cols = ['transmission_type', 'fuel_type', 'seller_type', 'brand', 'model']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 7️⃣ Normalize selected numeric columns
# 'engine' is standardized on its log1p-transformed values from step 4.
scale_cols = ['mileage', 'engine', 'max_power', 'price_per_km']
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# 🔁 Save cleaned dataset
df.to_csv('car_data_cleaned.csv', index=False)

# ✅ Summary
print("✅ Cleaning & Feature Engineering Done")
print("📊 Final Shape:", df.shape)
print("🧠 Columns:", df.columns.tolist())
df.head()
