# 02 - Feature Engineering
**Objective:** Clean the dataset, handle missing values, and encode categorical features as numeric columns so the data is ready for machine learning.


In [None]:
from pathlib import Path

import pandas as pd

# Location of the raw used-car listings, relative to this notebook.
RAW_DATA_PATH = "../data/raw/used_cars.csv"

# Load the raw data and preview the first rows to check the columns.
df = pd.read_csv(RAW_DATA_PATH)
df.head()


Unnamed: 0,make_year,mileage_kmpl,engine_cc,fuel_type,owner_count,price_usd,brand,transmission,color,service_history,accidents_reported,insurance_valid
0,2001,8.17,4000,Petrol,4,8587.64,Chevrolet,Manual,White,,0,No
1,2014,17.59,1500,Petrol,4,5943.5,Honda,Manual,Black,,0,Yes
2,2023,18.09,2500,Diesel,5,9273.58,BMW,Automatic,Black,Full,1,Yes
3,2009,11.28,800,Petrol,1,6836.24,Hyundai,Manual,Blue,Full,0,Yes
4,2005,12.23,1000,Petrol,2,4625.79,Nissan,Automatic,Red,Full,0,Yes


### Handle Missing Values
- Fill missing `service_history` with "Unknown"
- Drop rows that are missing any critical numeric value (`mileage_kmpl`, `engine_cc`, or `price_usd`)


In [None]:
# Rows without a numeric value in any of these columns are discarded.
required_numeric_cols = ["mileage_kmpl", "engine_cc", "price_usd"]

df = (
    df
    # A missing service history becomes its own explicit "Unknown" category.
    .assign(service_history=lambda frame: frame["service_history"].fillna("Unknown"))
    .dropna(subset=required_numeric_cols)
)


### Add New Features
- `car_age` = 2026 - `make_year`
- Drop `make_year` column


In [None]:
# Fixed reference year for the age calculation (car_age = REFERENCE_YEAR - make_year).
REFERENCE_YEAR = 2026

# Guard on `make_year` so that re-running this cell is a no-op instead of a
# KeyError: the column is removed the first time the cell executes.
if "make_year" in df.columns:
    df = (
        df
        # `assign` returns a new frame with `car_age` appended as the last column.
        .assign(car_age=REFERENCE_YEAR - df["make_year"])
        # `make_year` is dropped once `car_age` has been derived from it.
        .drop(columns=["make_year"])
    )


### Convert Categorical Columns to Numeric
We one-hot encode the following columns (`color` and `insurance_valid` are left unchanged in this step):
- brand
- fuel_type
- transmission
- service_history


In [None]:
# Columns expanded into one indicator column per category level.
categorical_cols = [
    "brand",
    "fuel_type",
    "transmission",
    "service_history",
]

# drop_first=False keeps an indicator column for every level of each feature.
# NOTE: `color` and `insurance_valid` are not in the list above, so they stay
# as non-numeric columns in `df_encoded`.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=False)
df_encoded.head()


Unnamed: 0,mileage_kmpl,engine_cc,owner_count,price_usd,color,accidents_reported,insurance_valid,car_age,brand_BMW,brand_Chevrolet,...,brand_Toyota,brand_Volkswagen,fuel_type_Diesel,fuel_type_Electric,fuel_type_Petrol,transmission_Automatic,transmission_Manual,service_history_Full,service_history_Partial,service_history_Unknown
0,8.17,4000,4,8587.64,White,0,No,25,False,True,...,False,False,False,False,True,False,True,False,False,True
1,17.59,1500,4,5943.5,Black,0,Yes,12,False,False,...,False,False,False,False,True,False,True,False,False,True
2,18.09,2500,5,9273.58,Black,1,Yes,3,True,False,...,False,False,True,False,False,True,False,True,False,False
3,11.28,800,1,6836.24,Blue,0,Yes,17,False,False,...,False,False,False,False,True,False,True,True,False,False
4,12.23,1000,2,4625.79,Red,0,Yes,21,False,False,...,False,False,False,False,True,True,False,True,False,False


### Save Processed Data
We save the encoded dataset to `../data/processed/cleaned_data.csv` so it can be used for model training.

In [None]:
# Destination of the processed dataset, relative to this notebook.
processed_path = Path("../data/processed/cleaned_data.csv")

# Create the output folder first: `to_csv` raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout).
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the saved file.
df_encoded.to_csv(processed_path, index=False)