In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: load the raw car listings from disk and report their shape.
df = pd.read_csv('car_data.csv')
print("‚úÖ Loaded dataset. Shape:", df.shape)

# Step 2: drop redundant columns.
# 'Unnamed: 0' is presumably a leftover index column from an earlier export
# and 'car_name' is not always present, so neither is required to exist:
# errors='ignore' skips whichever one is missing instead of raising KeyError.
df.drop(columns=['Unnamed: 0', 'car_name'], errors='ignore', inplace=True)

# Step 3: fill missing values (median for numeric columns, most frequent
# value for text columns).
# Each filled column is assigned back to the frame. The chained form
# `df[col].fillna(..., inplace=True)` works on a temporary Series, which
# pandas deprecates and which no longer updates `df` under Copy-on-Write.
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include='object').columns:
    modes = df[col].mode()
    # A column with no non-missing values has no mode; leave it unchanged
    # rather than fail on an empty lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Step 4: replace the skewed numeric columns with log(1 + x) of their values,
# applied to all three columns in a single vectorised assignment.
skewed_cols = ['km_driven', 'selling_price', 'engine']
df[skewed_cols] = np.log1p(df[skewed_cols])

# Step 5: interaction feature -- price per km.
# Both inputs were log1p-transformed in the previous step, so both are mapped
# back with expm1 before dividing; otherwise the ratio would mix a log-scale
# price with raw kilometres. The +1 keeps the denominator non-zero for cars
# with 0 km driven.
# NOTE(review): this feature is derived from selling_price. If selling_price
# is the prediction target downstream, it leaks the target -- confirm.
df['price_per_km'] = (
    np.expm1(df['selling_price']) / (np.expm1(df['km_driven']) + 1)
)

# Step 6: one-hot encode the categorical columns.
#
# Category levels as noted by the original author (not verified here against
# the data):
# - transmission_type: Manual, Automatic -> one indicator column
# - fuel_type: Petrol, Diesel, CNG -> two indicator columns
# - seller_type: Individual, Dealer, Trustmark Dealer -> two indicator columns
# 'brand' and 'model' are encoded the same way. drop_first=True omits the
# first level of every encoded column, so each one contributes
# (number of levels - 1) indicator columns.
cat_cols = [
    'transmission_type',
    'fuel_type',
    'seller_type',
    'brand',
    'model',
]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

# Step 7: standardise the selected numeric columns.
# The scaler is fitted on these columns first and then used to transform
# them; `scaler` stays available for reuse after this step.
scale_cols = ['mileage', 'engine', 'max_power', 'price_per_km']
scaler = StandardScaler().fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

# Save the cleaned dataset; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='car_data_cleaned.csv', index=False)

# Summary of the finished frame: final shape and the full column list.
print("‚úÖ Cleaning & Feature Engineering Done")
print("üìä Final Shape:", df.shape)
print("üß† Columns:", df.columns.to_list())
# Trailing expression: presumably shown as the cell output when this runs in
# a notebook.
df.head()
