# Car Price Prediction Using Random Forest Regressor
Predicting car prices accurately using the Random Forest Regressor for robust and reliable estimations.

## Step 1: Import libraries

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, f1_score

## Step 2: Load the dataset and perform basic EDA

In [92]:
# Load the raw car listings; the CSV is expected in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="car_price_prediction.csv")
df.head()  # preview the first five rows

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [93]:
df.describe()     # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,ID,Price,Prod. year,Cylinders,Airbags
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,45576540.0,18555.93,2010.912824,4.582991,6.582627
std,936591.4,190581.3,5.668673,1.199933,4.320168
min,20746880.0,1.0,1939.0,1.0,0.0
25%,45698370.0,5331.0,2009.0,4.0,4.0
50%,45772310.0,13172.0,2012.0,4.0,6.0
75%,45802040.0,22075.0,2015.0,4.0,12.0
max,45816650.0,26307500.0,2020.0,16.0,16.0


In [94]:
df.info()       # print each column's name, non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [95]:
df.isna().sum()         # count the missing (NaN) values in each column

ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

In [96]:
df.shape              # the shape attribute gives the dataset's dimensions as (rows, columns)


(19237, 18)

## Step 3: Cleaning the Data

In [97]:
df.duplicated().sum()                 # count the rows that are exact duplicates of an earlier row

np.int64(313)

In [98]:
# Drop the duplicated rows from the dataframe, keeping the first occurrence of each
df.drop_duplicates(inplace = True)     # inplace=True modifies df itself instead of returning a de-duplicated copy


In [99]:
# Strip the non-numeric text from 'Engine volume' and 'Mileage' and store both as floats.

# Keep only the first number found in each value (decimal or integer).
df["Engine volume"] = (
    df["Engine volume"]
    .astype(str)
    .str.extract(r'(\d+\.\d+|\d+)')
    .astype(float)
)

# Delete every non-digit character (e.g. the " km" suffix), then cast to float.
df["Mileage"] = (
    df["Mileage"]
    .astype(str)
    .str.replace(r'\D', '', regex=True)
    .astype(float)
)


In [100]:
# Remove the ID and Model columns, which this notebook treats as not useful for prediction.
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
# inplace=True mutates df directly instead of returning a new DataFrame.
df.drop(columns=["ID", "Model"], inplace=True)


While training the model, I hit an error because some values in the data were '-'. To fix it, I applied the following steps:

- Identified which columns contained '-'.
- Replaced '-' with NaN, then filled the numeric `Levy` column with its mean and the categorical columns with their most frequent value (mode).

In [101]:
# Flag every column in which at least one value contains a '-' character.
# This is a substring test on the stringified values, so hyphenated entries such as
# "04-May" or "Right-hand drive" are flagged too, not only cells equal to '-'.
def _contains_hyphen(col):
    """Return True if any value of `col`, cast to str, contains '-'."""
    return col.astype(str).str.contains('-').any()

columns_with_hyphen = df.columns[df.apply(_contains_hyphen)]
columns_with_hyphen


Index(['Levy', 'Manufacturer', 'Fuel type', 'Doors', 'Wheel'], dtype='object')

In [102]:
# Replace cells whose entire value is '-' with a missing-value marker (pd.NA);
# no regex is used, so hyphens inside longer strings are left untouched
df.replace('-', pd.NA, inplace=True)

In [103]:
# Convert the Levy column to a numeric dtype because it holds numerical values;
# errors='coerce' turns anything that cannot be parsed as a number into NaN
df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')      
    

In [104]:

# Impute the missing Levy values with the Levy column mean.
# Only the Levy column is filled: a frame-wide df.fillna(<Levy mean>) would write that
# number into every other column that has gaps, including the categorical columns
# below, which would then never receive their mode.
df['Levy'] = df['Levy'].fillna(df['Levy'].mean())

# Fill categorical NaNs with the most frequent value (mode) of each column
categorical_cols = ['Manufacturer', 'Doors', 'Fuel type', 'Wheel']
df[categorical_cols] = df[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]))


## Step 4: Convert Categorical Columns to Numerical
Here we convert the categorical columns to numerical ones. In the previous step we converted the columns that already contained numerical data.
- apply `LabelEncoder` to the ordinal columns
- apply one-hot encoding (`pd.get_dummies`) to the non-ordinal columns

In [105]:
# One-hot encode the nominal (unordered) categorical columns.
# drop_first=True omits the first dummy of each feature.
column = ["Manufacturer", "Fuel type", "Color", "Category", "Gear box type", "Drive wheels"]
df = pd.get_dummies(data=df, columns=column, drop_first=True)

# Integer-encode the remaining columns with one shared LabelEncoder.
# fit_transform re-fits the encoder on every call, so reusing the instance is safe;
# after the loop `le` holds the fit for the last column ("Airbags").
le = LabelEncoder()
for label_col in ("Leather interior", "Wheel", "Doors", "Airbags"):
    df[label_col] = le.fit_transform(df[label_col])


In [106]:
# Preview the frame after the string columns have been converted and encoded;
# the data is now ready for training the model
df.head()

Unnamed: 0,Price,Levy,Prod. year,Leather interior,Engine volume,Mileage,Cylinders,Doors,Wheel,Airbags,...,Category_Microbus,Category_Minivan,Category_Pickup,Category_Sedan,Category_Universal,Gear box type_Manual,Gear box type_Tiptronic,Gear box type_Variator,Drive wheels_Front,Drive wheels_Rear
0,13328,1399.0,2010,1,3.5,186005.0,6.0,1,0,12,...,False,False,False,False,False,False,False,False,False,False
1,16621,1018.0,2011,0,3.0,192000.0,6.0,1,0,8,...,False,False,False,False,False,False,True,False,False,False
2,8467,906.299205,2006,0,1.3,200000.0,4.0,1,1,2,...,False,False,False,False,False,False,False,True,True,False
3,3607,862.0,2011,1,2.5,168966.0,4.0,1,0,0,...,False,False,False,False,False,False,False,False,False,False
4,11726,446.0,2014,1,1.3,91901.0,4.0,1,0,4,...,False,False,False,False,False,False,False,False,True,False


## Step 5: split data into 80% training, 20% testing 

In [107]:
# Separate the target (Price) from the feature matrix.
Y = df["Price"]                  # target
X = df.drop(columns="Price")     # features: every remaining column

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

## Step 6: Train the model

In [108]:
# Build a random forest of 100 decision trees with a fixed seed for reproducibility.
Rf = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

# Fit on the training split; as the cell's last expression, the fitted estimator is displayed.
Rf.fit(x_train, y_train)

## Step 7: predict and Evaluate the model 

In [109]:
# Predict prices for the held-out test rows.
y_pred = Rf.predict(x_test)

# Score the predictions: R^2, mean absolute error and mean squared error.
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)


In [110]:
# Report the evaluation metrics computed on the test split.
for metric_label, metric_value in (
    ("R sqaure  : ", r2),
    ("mean_absolute_error  :", mae),
    ("mean_squared_error  :", mse),
):
    print(metric_label, metric_value)

R sqaure  :  -36.40848583859663
mean_absolute_error  : 7648.175243897323
mean_squared_error  : 12951873005.037529


## Step 8: Retrain the model on the important features to obtain a positive R² score
An R² score below zero, as obtained above, means the model predicts worse than always guessing the mean price of the test set.
A trained Random Forest model consists of multiple decision trees.
Each decision tree splits the data based on feature values to minimize the prediction error.
`feature_importances_` is an attribute of a trained `RandomForestRegressor` or `RandomForestClassifier` model that tells how important each feature was in making predictions.

1) Extract the important features
2) Split the data again using the important features
3) Train the model again
4) Make predictions and evaluate