###  Importing All the basic necessary Libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV



### 1️⃣ Stage 1 : Data Understanding & Preprocessing 

In [3]:
df = pd.read_csv(r"C:\Users\MOHAMMED YAZIN N\Downloads\CarPrice_Assignment.csv")
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


### Exploratory Data Analysis

In [4]:
df.shape

(205, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [6]:
## Statistical Summary 
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [7]:
# Check missing values
print(df.isnull().sum())

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [8]:
## Unique Values in Each columns 
df.nunique()

car_ID              205
symboling             6
CarName             147
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64

In [9]:
## visulisiing the data types 
df.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [12]:
df.tail()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470.0
204,205,-1,volvo 264gl,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625.0


In [13]:
df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [14]:
# Extract brand from 'CarName'
df['CarBrand'] = df['CarName'].apply(lambda x: x.split()[0].lower())
df.drop(['CarName'], axis=1, inplace=True)

# Fix inconsistent brand names
df['CarBrand'] = df['CarBrand'].replace({'vw': 'volkswagen', 'vokswagen': 'volkswagen', 'maxda': 'mazda', 'porcshce': 'porsche', 'toyouta': 'toyota'})
df['CarBrand'].unique()

# Drop car_ID column (not useful for modeling)
df.drop(['car_ID'], axis=1, inplace=True)

In [15]:
df

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarBrand
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0,audi
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0,audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo
201,-1,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,...,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo
202,-1,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo
203,-1,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,...,idi,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo


### Seperating Categorical and numerical features of the dataset 

In [16]:
# Identify categorical features
cat_cols = df.select_dtypes(include='object').columns
cat_cols



Index(['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
       'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
       'CarBrand'],
      dtype='object')

### Encoding Categorical Variables


In [17]:
# Performing One-Hot Encoding 
df_encoded = pd.get_dummies(df, columns=cat_cols,drop_first=True,dtype=int)
df_encoded

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,CarBrand_nissan,CarBrand_peugeot,CarBrand_plymouth,CarBrand_porsche,CarBrand_renault,CarBrand_saab,CarBrand_subaru,CarBrand_toyota,CarBrand_volkswagen,CarBrand_volvo
0,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,0,0,0
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,0,0,0
2,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,0,0,0,0,0,0,0,0,0,0
3,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,...,0,0,0,0,0,0,0,0,0,0
4,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,...,0,0,0,0,0,0,0,0,0,1
201,-1,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,...,0,0,0,0,0,0,0,0,0,1
202,-1,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,...,0,0,0,0,0,0,0,0,0,1
203,-1,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,...,0,0,0,0,0,0,0,0,0,1


### Defining features and target variable

In [18]:
# Features and target
X = df_encoded.drop('price', axis=1)
y = df_encoded['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
X_train

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,CarBrand_nissan,CarBrand_peugeot,CarBrand_plymouth,CarBrand_porsche,CarBrand_renault,CarBrand_saab,CarBrand_subaru,CarBrand_toyota,CarBrand_volkswagen,CarBrand_volvo
66,0,104.9,175.0,66.1,54.4,2700,134,3.43,3.64,22.0,...,0,0,0,0,0,0,0,0,0,0
111,0,107.9,186.7,68.4,56.7,3075,120,3.46,2.19,8.4,...,0,1,0,0,0,0,0,0,0,0
153,0,95.7,169.7,63.6,59.1,2280,92,3.05,3.03,9.0,...,0,0,0,0,0,0,0,1,0,0
96,1,94.5,165.3,63.8,54.5,1971,97,3.15,3.29,9.4,...,1,0,0,0,0,0,0,0,0,0
38,0,96.5,167.5,65.2,53.3,2289,110,3.15,3.58,9.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1,99.2,178.5,67.9,49.7,3139,181,3.43,3.27,9.0,...,1,0,0,0,0,0,0,0,0,0
14,1,103.5,189.0,66.9,55.7,3055,164,3.31,3.19,9.0,...,0,0,0,0,0,0,0,0,0,0
92,1,94.5,165.3,63.8,54.5,1938,97,3.15,3.29,9.4,...,1,0,0,0,0,0,0,0,0,0
179,3,102.9,183.5,67.7,52.0,3016,171,3.27,3.35,9.3,...,0,0,0,0,0,0,0,1,0,0


In [20]:
X_test

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,CarBrand_nissan,CarBrand_peugeot,CarBrand_plymouth,CarBrand_porsche,CarBrand_renault,CarBrand_saab,CarBrand_subaru,CarBrand_toyota,CarBrand_volkswagen,CarBrand_volvo
15,0,103.5,189.0,66.9,55.7,3230,209,3.62,3.39,8.0,...,0,0,0,0,0,0,0,0,0,0
9,0,99.5,178.2,67.9,52.0,3053,131,3.13,3.4,7.0,...,0,0,0,0,0,0,0,0,0,0
100,0,97.2,173.4,65.2,54.7,2302,120,3.33,3.47,8.5,...,1,0,0,0,0,0,0,0,0,0
132,3,99.1,186.6,66.5,56.1,2658,121,3.54,3.07,9.31,...,0,0,0,0,0,1,0,0,0,0
68,-1,110.0,190.9,70.3,58.7,3750,183,3.58,3.64,21.5,...,0,0,0,0,0,0,0,0,0,0
95,1,94.5,165.6,63.8,53.3,2028,97,3.15,3.29,9.4,...,1,0,0,0,0,0,0,0,0,0
159,0,95.7,166.3,64.4,52.8,2275,110,3.27,3.35,22.5,...,0,0,0,0,0,0,0,1,0,0
162,0,95.7,166.3,64.4,52.8,2140,98,3.19,3.03,9.0,...,0,0,0,0,0,0,0,1,0,0
147,0,97.0,173.5,65.4,53.0,2455,108,3.62,2.64,9.0,...,0,0,0,0,0,0,1,0,0,0
182,2,97.3,171.7,65.5,55.7,2261,97,3.01,3.4,23.0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
y_train

66     18344.0
111    15580.0
153     6918.0
96      7499.0
38      9095.0
        ...   
106    18399.0
14     24565.0
92      6849.0
179    15998.0
102    14399.0
Name: price, Length: 164, dtype: float64

In [22]:
y_test

15     30760.000
9      17859.167
100     9549.000
132    11850.000
68     28248.000
95      7799.000
159     7788.000
162     9258.000
147    10198.000
182     7775.000
191    13295.000
164     8238.000
65     18280.000
175     9988.000
73     40960.000
152     6488.000
18      5151.000
82     12629.000
86      8189.000
143     9960.000
60      8495.000
101    13499.000
98      8249.000
30      6479.000
25      6692.000
16     41315.000
168     9639.000
195    13415.000
97      7999.000
194    12940.000
67     25552.000
120     6229.000
154     7898.000
202    21485.000
79      7689.000
69     28176.000
145    11259.000
55     10945.000
45      8916.500
84     14489.000
146     7463.000
Name: price, dtype: float64

### Scaling Procedure

In [23]:
##initialising Statndard scaler 

scaler = StandardScaler()



X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled



array([[-0.72738032,  1.03708546,  0.07244984, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [-0.72738032,  1.54123134,  1.04391556, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [-0.72738032, -0.50896191, -0.36761583, ...,  2.35796522,
        -0.2548236 , -0.22645541],
       ...,
       [ 0.07863571, -0.71062026, -0.73295337, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [ 1.69066776,  0.70098821,  0.77821554, ...,  2.35796522,
        -0.2548236 , -0.22645541],
       [-0.72738032,  0.28086664,  0.86954992, ..., -0.42409446,
        -0.2548236 , -0.22645541]])

In [24]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.72738032,  0.80181738,  1.23488746, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [-0.72738032,  0.12962287,  0.33814986, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [-0.72738032, -0.25688897, -0.06040018, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       ...,
       [-0.72738032, -0.71062026, -1.51344721, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [ 1.69066776, -0.47535218, -0.07700643, ..., -0.42409446,
        -0.2548236 , -0.22645541],
       [-0.72738032, -0.29049869, -0.05209705, ..., -0.42409446,
        -0.2548236 , -0.22645541]])

### Model Training & Evaluation


In [25]:
## Building a model to  Implement  regression algorithms:
def build_model(model,model_name,X_train, y_train,X_test, y_test):
    print("Running Model Name :",model)
    model.fit(X_train,y_train)
    results=model.predict(X_test)
    print("-------------------------------------------------------------------------------")



    return{
        "Model Name" : model_name+ "✅",
    "MSE" : mean_squared_error(y_test, results),
    "MAE" : mean_absolute_error(y_test, results),
    "R2 Score" : r2_score(y_test, results) }

In [26]:
## For Storing the Data 

results_list=[]

In [27]:
#1. LinearRegression

## Initialising the  Model 
lr_model=LinearRegression()

In [28]:
## Building and Evaluation of the model
lr_result = build_model(lr_model,"LinearRegression", X_train_scaled, y_train,X_test_scaled, y_test)
results_list.append(lr_result)
print(lr_result)

Running Model Name : LinearRegression()
-------------------------------------------------------------------------------
{'Model Name': 'LinearRegression✅', 'MSE': 7128546.783600219, 'MAE': 1763.5659861998556, 'R2 Score': 0.9097012452093493}


In [29]:
#2.DecisionTreeRegressor

## Initialising the  Model 
dtree_model= DecisionTreeRegressor()

In [30]:
## Building and Evaluation of the model
dtree_result = build_model(dtree_model,"DecisionTreeRegressor", X_train_scaled, y_train,X_test_scaled, y_test)
results_list.append(dtree_result)
print(dtree_result)

Running Model Name : DecisionTreeRegressor()
-------------------------------------------------------------------------------
{'Model Name': 'DecisionTreeRegressor✅', 'MSE': 8165288.502485098, 'MAE': 1857.4430975609757, 'R2 Score': 0.8965686265850042}


In [31]:
#3. RandomForestRegressor

## Initialising the  Model 
rfr_model= RandomForestRegressor()

In [32]:
## Building and Evaluation of the model
rfr_result = build_model(rfr_model,"RandomForestRegressor", X_train_scaled, y_train,X_test_scaled, y_test)
results_list.append(rfr_result)
print(rfr_result)

Running Model Name : RandomForestRegressor()
-------------------------------------------------------------------------------
{'Model Name': 'RandomForestRegressor✅', 'MSE': 3336327.21707568, 'MAE': 1256.122146341463, 'R2 Score': 0.9577380632516606}


In [33]:
#4. GradientBoostingRegressor

## Initialising the  Model 
gbr_model= GradientBoostingRegressor()

In [34]:
## Building and Evaluation of the model
gbr_result = build_model(gbr_model,"GradientBoostingRegressor", X_train_scaled, y_train,X_test_scaled, y_test)
results_list.append(gbr_result)
print(gbr_result)

Running Model Name : GradientBoostingRegressor()
-------------------------------------------------------------------------------
{'Model Name': 'GradientBoostingRegressor✅', 'MSE': 6080108.25561758, 'MAE': 1736.948940719558, 'R2 Score': 0.9229820296981566}


In [35]:
#5.Support Vector Regressor 

## Initialising the  Model 
svr_model= SVR()

In [36]:
## Building and Evaluation of the model
svr_result = build_model(svr_model,"Support Vector Regressor",X_train_scaled, y_train,X_test_scaled, y_test)
results_list.append(svr_result)
print(svr_result)

Running Model Name : SVR()
-------------------------------------------------------------------------------
{'Model Name': 'Support Vector Regressor✅', 'MSE': 86889433.59033778, 'MAE': 5701.535891309875, 'R2 Score': -0.10064616195306408}


### Comparing All results into an Dataframe 

In [37]:
results_df=pd.DataFrame(results_list)
results_df = results_df.sort_values(by='R2 Score', ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,Model Name,MSE,MAE,R2 Score
0,RandomForestRegressor✅,3336327.0,1256.122146,0.957738
1,GradientBoostingRegressor✅,6080108.0,1736.948941,0.922982
2,LinearRegression✅,7128547.0,1763.565986,0.909701
3,DecisionTreeRegressor✅,8165289.0,1857.443098,0.896569
4,Support Vector Regressor✅,86889430.0,5701.535891,-0.100646


### ✅ Best Working Model → RandomForestRegressor

#### Reasoning & Justification:

It has the highest R² score (0.9576) → explains ~95.8% of variance in the target variable.

Lowest errors: both MSE (3.34e+06) and MAE (1243.23) are the smallest among all models.

Indicates excellent generalization, capturing both linear and non-linear relationships effectively.

🔎 Why so good?
Random Forest is an ensemble method that averages predictions from many decision trees. This reduces overfitting (common in single Decision Trees) while maintaining flexibility for complex patterns.

### ❌ Worst Working Model → Support Vector Regressor (SVR)

#### Reasoning & Justification:

Negative R² score (-0.1006) → means the model is performing worse than a simple mean predictor.

Highest error values: MSE (8.68e+07) and MAE (5701.54), much larger than other models.

Indicates that SVR is not suitable for this dataset (at least with current parameters).

### 🔎 Why so bad?

SVR requires careful hyperparameter tuning (C, epsilon, kernel choice, scaling).

On high-dimensional or non-linearly separable data, the default SVR often underperforms.

It’s also sensitive to outliers, which may be affecting it here.

### Defining Feature Importance 

In [38]:
# Extract feature importances
f_imp= rfr_model.feature_importances_
feature_names = X_test.columns
feature_df = pd.DataFrame({'Feature': feature_names,'Importance': f_imp}).sort_values(by='Importance', ascending=False).reset_index(drop=True)



feature_df



Unnamed: 0,Feature,Importance
0,enginesize,5.968549e-01
1,curbweight,2.399696e-01
2,highwaympg,4.800083e-02
3,carwidth,2.814988e-02
4,horsepower,2.665305e-02
...,...,...
59,cylindernumber_twelve,3.969935e-06
60,enginetype_rotor,3.435595e-06
61,cylindernumber_two,2.333864e-06
62,enginetype_dohcv,6.346374e-07


plt.figure(figsize=(10,8))

sns.barplot(
    x="Feature",
    y="Importance",
    data=feature_df,
    )


plt.title('Feature Importance - Random Forest')
plt.xlabel("Features")
plt.ylabel("Importance",)
plt.tight_layout()
plt.xticks(rotation=90) 

plt.show()

### Grid Search CV for Hyperparameter Tuning 

In [43]:
# Initialize model
model = RandomForestRegressor(random_state=42)

# Create a grid of hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],           # Number of trees
    'max_depth': [None, 10, 20],               # Max depth of each tree
    'min_samples_split': [2, 5, 10]            # Minimum samples required to split a node
}

#  Setup Grid Search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,                          # 5-fold cross-validation
    scoring='r2',                  # Evaluation metric
    n_jobs=-1                      # Use all available cores
)

# Fit the Grid Search
grid_search.fit(X_train_scaled, y_train)

# View best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

# Predict using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Evaluate performance
print("Test R2 Score:", r2_score(y_test, y_pred))



Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Best R2 Score: 0.8882275885740816
Test R2 Score: 0.9592352507777084


### Evaluating Model Performances

#### Base Model

In [61]:
##initialising 

base_model=RandomForestRegressor(random_state=42)


##fitting
base_model.fit(X_train,y_train)

#predicting & Evaluating 
y_pred=base_model.predict(X_test)
r2_base=r2_score(y_test,y_pred)

print("R2_Score:",r2_base)

R2_Score: 0.9584165239875437


#### Tuned Model

In [62]:
##initialising 

tuned_model=grid_search.best_estimator_

#predicting & Evaluating 
y_pred_tuned=tuned_model.predict(X_test)
r2_tuned=r2_score(y_test,y_pred_tuned)

print("R2_Score:",r2_tuned)

R2_Score: 0.9592582108394468


#### Base Model    VS     Tuned Model

In [72]:
## Before Tuning
print(f"Performance of Base Model(r2_score) : {r2_base:.4f}")

## After Tuning 
print(f"Performance of Tuned Model(r2_score) : {r2_tuned:.4f}")



## Differences 
print(f"Difference in base_score and Tuned_score : {diff:.4f}")

Performance of Base Model(r2_score) : 0.9584
Performance of Tuned Model(r2_score) : 0.9593
Difference in base_score and Tuned_score : 0.0008


### 🚗 Car Price Prediction – Management Insights

 The car price prediction model was built using the available independent variables, and it provides **valuable insights** into how prices vary with different features. This will help management in **design decisions, pricing strategy, and new market planning**.

### 🔑 Key Findings

* 📈 **Engine size, curb weight, horsepower, and car width** are the strongest drivers of higher car prices.
* ⛽ **Fuel efficiency (city mpg)** has an inverse relationship with price — higher efficiency usually means lower price.
* 🏆 **Random Forest Regressor** performed the best, achieving an **R² ≈ 0.96**, explaining \~96% of price variation.

### 💼 Business Implications

1. 🛠 **Car Design:** Adjusting technical specs like engine size & horsepower can position cars in premium or budget segments.
2. 🎯 **Market Strategy:** Balance between performance features and fuel efficiency to attract target customers.
3. 🌍 **New Market Entry:** Use the model to simulate pricing dynamics in unfamiliar markets and set competitive prices.

---

## ✅ **Conclusion**

This ML regression model is not only a **highly accurate predictor of car prices** but also a **strategic decision-making tool**. It enables management to:
✔️ Understand the impact of design features on price
✔️ Optimize business strategies for different market segments
✔️ Confidently plan pricing in new markets

🚀 **In short:** The model bridges **data-driven insights** with **business strategy**, giving the company a strong competitive edge.