In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv("car.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


Preprocess Dataset and IQR technique

In [4]:
df.drop("Car_Name",axis=1,inplace=True)
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [6]:
#Car current age
df["Current_Age"]=2025-df["Year"]
df.drop("Year",axis=1,inplace=True)
df.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Age
0,3.35,5.59,27000,Petrol,Dealer,Manual,0,11
1,4.75,9.54,43000,Diesel,Dealer,Manual,0,12
2,7.25,9.85,6900,Petrol,Dealer,Manual,0,8
3,2.85,4.15,5200,Petrol,Dealer,Manual,0,14
4,4.6,6.87,42450,Diesel,Dealer,Manual,0,11


In [7]:
# Label encoding for fuel type,seller type, transmission
le=LabelEncoder()
df["Fuel_Type"]=le.fit_transform(df["Fuel_Type"])
df["Seller_Type"]=le.fit_transform(df["Seller_Type"])
df["Transmission"]=le.fit_transform(df["Transmission"])
df.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Age
0,3.35,5.59,27000,2,0,1,0,11
1,4.75,9.54,43000,1,0,1,0,12
2,7.25,9.85,6900,2,0,1,0,8
3,2.85,4.15,5200,2,0,1,0,14
4,4.6,6.87,42450,1,0,1,0,11


In [None]:
#IQR technique on selling price
df["Selling_Price"].describe()

count    301.000000
mean       4.661296
std        5.082812
min        0.100000
25%        0.900000
50%        3.600000
75%        6.000000
max       35.000000
Name: Selling_Price, dtype: float64

In [None]:
#This is how IQR works
q1=df["Selling_Price"].quantile(0.25)
q3=df["Selling_Price"].quantile(0.75)
iqr=q3-q1
lower_limit=q1-1.5*iqr
upper_limit=q3+1.5*iqr
print(lower_limit,upper_limit)
df1=df[(df["Selling_Price"]>lower_limit)&(df["Selling_Price"]<upper_limit)]
df1.head()

-6.749999999999999 13.649999999999999


Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Age
0,3.35,5.59,27000,2,0,1,0,11
1,4.75,9.54,43000,1,0,1,0,12
2,7.25,9.85,6900,2,0,1,0,8
3,2.85,4.15,5200,2,0,1,0,14
4,4.6,6.87,42450,1,0,1,0,11


In [None]:
#Creating function to perform IQR technique on every attribute
label=["Selling_Price","Present_Price","Kms_Driven","Current_Age"]
for i in range(len(label)):
    q1=df[label[i]].quantile(0.25)
    q3=df[label[i]].quantile(0.75)
    iqr=q3-q1
    lower_limit=q1-1.5*iqr
    upper_limit=q3+1.5*iqr
    dataframe=df[(df[label[i]]>lower_limit)&(df[label[i]]<upper_limit)]

dataframe.head()    

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Age
0,3.35,5.59,27000,2,0,1,0,11
1,4.75,9.54,43000,1,0,1,0,12
2,7.25,9.85,6900,2,0,1,0,8
3,2.85,4.15,5200,2,0,1,0,14
4,4.6,6.87,42450,1,0,1,0,11


In [15]:
dataframe.shape,df.shape

((290, 8), (301, 8))

Data Splitting

In [16]:
#Feature and Target
X=dataframe.drop("Selling_Price",axis=1)
y=dataframe["Selling_Price"]

In [17]:
X.head()

Unnamed: 0,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_Age
0,5.59,27000,2,0,1,0,11
1,9.54,43000,1,0,1,0,12
2,9.85,6900,2,0,1,0,8
3,4.15,5200,2,0,1,0,14
4,6.87,42450,1,0,1,0,11


In [18]:
y.head()

0    3.35
1    4.75
2    7.25
3    2.85
4    4.60
Name: Selling_Price, dtype: float64

In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
#Random Forest model
rf=RandomForestRegressor(random_state=42)
rf

In [27]:
#Hyperparameter tuning
param_grid={
    'n_estimators':[100,200,300,400,500],
    'max_depth':[5,10,15,20,None],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4],
    'max_features':[None,'sqrt','log2']
}


In [28]:
#Randomized Search CV
random_search=RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [29]:
#Fit model
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [30]:
#Best model
model=random_search.best_estimator_
model

In [31]:
#r2 and rmse
import math
y_pred=model.predict(X_test)
r2=r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

In [33]:
# Output results
print("Best Parameters:", random_search.best_params_)
print("R-Squared Score:",r2)
print("RMSE:",rmse)

Best Parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 15}
R-Squared Score: 0.7683961228999204
RMSE: 3.287309853467964


In [34]:
import joblib
joblib.dump(model,'car_price_model.pkl')


['car_price_model.pkl']