**Train Decision Tree Regressor Model on Car Price Prediction Dataset**

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,accuracy_score,r2_score

In [49]:
# Load the car price dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("Car_Price_Prediction.csv")

In [50]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006,4.1,98385,Electric,Manual,25760.290347
3,Honda,Model B,2015,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004,3.4,138482,Petrol,Automatic,21021.386657


In [51]:
# Column labels available for modelling.
df.columns

Index(['Make', 'Model', 'Year', 'Engine Size', 'Mileage', 'Fuel Type',
       'Transmission', 'Price'],
      dtype='object')

In [52]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1000, 8)

In [53]:
# Per-column non-null counts and dtypes, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Make          1000 non-null   object 
 1   Model         1000 non-null   object 
 2   Year          1000 non-null   int64  
 3   Engine Size   1000 non-null   float64
 4   Mileage       1000 non-null   int64  
 5   Fuel Type     1000 non-null   object 
 6   Transmission  1000 non-null   object 
 7   Price         1000 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB


In [54]:
# Summary statistics of the numeric columns, flipped so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,1000.0,2010.688,6.288577,2000.0,2005.0,2011.0,2016.0,2021.0
Engine Size,1000.0,2.7983,1.024137,1.0,1.9,2.8,3.7,4.5
Mileage,1000.0,97192.487,59447.31576,56.0,44768.75,94411.5,148977.75,199867.0
Price,1000.0,25136.61553,5181.401368,6704.953524,21587.87837,25189.325247,28806.368974,41780.504635


In [55]:
# dtype of every column (object columns are the categorical candidates).
df.dtypes

Make             object
Model            object
Year              int64
Engine Size     float64
Mileage           int64
Fuel Type        object
Transmission     object
Price           float64
dtype: object

In [56]:
# Number of distinct values in the Make column.
df["Make"].nunique()

5

In [57]:
# Number of distinct values in the Model column.
df["Model"].nunique()

5

In [58]:
# Number of distinct values in the Fuel Type column.
df["Fuel Type"].nunique()

3

In [59]:
# Number of distinct values in the Transmission column.
df["Transmission"].nunique()

2

In [60]:
# The distinct Transmission labels themselves.
df["Transmission"].unique()

array(['Manual', 'Automatic'], dtype=object)

In [61]:
# Hand-written feature lists for a first-draft preprocessor.
# NOTE(review): `numeric_features`, `categorical_features` and `preprocessor`
# are all reassigned in later cells before any model is fitted, so nothing
# defined in this cell reaches the final pipeline.
numeric_features = ['Year', 'Engine Size', 'Mileage']
categorical_features = ['Make', 'Model', 'Fuel Type', 'Transmission']

# One-hot encode the categorical columns; remainder='passthrough' forwards
# every column not listed here (the numeric ones) unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)


In [62]:
# Missing-value count per column (all zeros means no imputation is needed).
df.isna().sum()

Make            0
Model           0
Year            0
Engine Size     0
Mileage         0
Fuel Type       0
Transmission    0
Price           0
dtype: int64

In [63]:
# Separate the regression target (Price) from the predictor columns.
y = df["Price"]
X = df.drop(columns="Price")

In [64]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [65]:
# Numeric branch: a one-step pipeline that standardises the columns.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [66]:
# Categorical branch: one-hot encoding. handle_unknown='ignore' encodes a
# category not seen during fit as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

In [67]:

# Derive the feature lists from the column dtypes of X rather than
# hard-coding the column names.
categorical_features = list(
    X.select_dtypes(include=['object', 'category', 'bool']).columns
)
numeric_features = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)



In [68]:
# Route each group of columns through its own transformer and concatenate the
# results. With the default remainder, columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [69]:
# Chain preprocessing and the regressor so both are fitted together on the
# training data and applied consistently at prediction time.
pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('model', DecisionTreeRegressor(random_state=42)),
    ]
)

In [70]:
# X and y were already split into train/test sets right after they were
# defined, using the same test_size and random_state, and neither has been
# modified since. Splitting again would only rebuild identical objects, so
# validate the existing split instead of repeating it.
assert len(X_train) + len(X_test) == len(X), "train/test split does not cover every row of X"
assert len(X_train) == len(y_train), "training features and target are misaligned"
assert len(X_test) == len(y_test), "test features and target are misaligned"


In [71]:
# Fit the preprocessing steps and the regressor on the training rows only, so
# no statistics from the held-out rows influence the fitted transformers.
pipeline.fit(X_train, y_train)

In [72]:
# Predict prices for the held-out rows; the fitted preprocessing is applied
# to X_test automatically by the pipeline.
y_pred = pipeline.predict(X_test)

In [73]:
# Score the held-out predictions against the true prices.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

# Report each metric rounded to two decimals.
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")
print(f"Root mean square error: {rmse:.2f}")


Mean Squared Error (MSE): 10189165.49
Mean Absolute Error (MAE): 2590.12
R-squared (R²): 0.63
Root meaan square error: 3192.05
