### Car Price Prediction With Machine Learning
The price of a car depends on many factors, such as the goodwill of the brand, the features of the car, its horsepower, the mileage it gives, and many more. Car price prediction is one of the major research areas in machine learning. So if you want to learn how to train a car price prediction model, then this project is for you.

In [1]:
# import libraries
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split



### Loading the dataset

In [2]:
# load the dataset
# The CSV location can be overridden with the CAR_DATA_CSV environment variable so the
# notebook can run on another machine; the original local path is kept as the fallback.
DATA_PATH = os.environ.get(
    "CAR_DATA_CSV",
    r"C:\Users\Harbiodun\Documents\Data Science and AI\OIBSIP\documents\Car price\car data.csv",
)
dataset = pd.read_csv(DATA_PATH)
dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


### Exploratory Data Analysis
Getting information about the dataset

In [3]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.642584,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [4]:
# display the data types of each column in the dataset
dataset.dtypes

Car_Name          object
Year               int64
Selling_Price    float64
Present_Price    float64
Driven_kms         int64
Fuel_Type         object
Selling_type      object
Transmission      object
Owner              int64
dtype: object

In [5]:
# check for empty/null rows
dataset.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [6]:
# displays the shape of the dataset
dataset.shape

(301, 9)

In [7]:
# displays all info about the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [8]:
# prints out all the different car name in the dataset
dataset['Car_Name'].unique()

array(['ritz', 'sx4', 'ciaz', 'wagon r', 'swift', 'vitara brezza',
       's cross', 'alto 800', 'ertiga', 'dzire', 'alto k10', 'ignis',
       '800', 'baleno', 'omni', 'fortuner', 'innova', 'corolla altis',
       'etios cross', 'etios g', 'etios liva', 'corolla', 'etios gd',
       'camry', 'land cruiser', 'Royal Enfield Thunder 500',
       'UM Renegade Mojave', 'KTM RC200', 'Bajaj Dominar 400',
       'Royal Enfield Classic 350', 'KTM RC390', 'Hyosung GT250R',
       'Royal Enfield Thunder 350', 'KTM 390 Duke ',
       'Mahindra Mojo XT300', 'Bajaj Pulsar RS200',
       'Royal Enfield Bullet 350', 'Royal Enfield Classic 500',
       'Bajaj Avenger 220', 'Bajaj Avenger 150', 'Honda CB Hornet 160R',
       'Yamaha FZ S V 2.0', 'Yamaha FZ 16', 'TVS Apache RTR 160',
       'Bajaj Pulsar 150', 'Honda CBR 150', 'Hero Extreme',
       'Bajaj Avenger 220 dtsi', 'Bajaj Avenger 150 street',
       'Yamaha FZ  v 2.0', 'Bajaj Pulsar  NS 200', 'Bajaj Pulsar 220 F',
       'TVS Apache RTR 180', 

In [9]:
# count how many times a car name appears in the dataset
dataset['Car_Name'].value_counts()

city                        26
corolla altis               16
verna                       14
fortuner                    11
brio                        10
                            ..
Honda CB Trigger             1
Yamaha FZ S                  1
Bajaj Pulsar 135 LS          1
Activa 4g                    1
Bajaj Avenger Street 220     1
Name: Car_Name, Length: 98, dtype: int64

In [10]:
# distinct fuel types present in the dataset
dataset['Fuel_Type'].unique()

array(['Petrol', 'Diesel', 'CNG'], dtype=object)

In [11]:
# distinct selling types present in the dataset
dataset['Selling_type'].unique()

array(['Dealer', 'Individual'], dtype=object)

In [12]:
# distinct transmission types present in the dataset
dataset['Transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

### Data Preprocessing

In [13]:
# encode the categorical columns using LabelEncoder
encoder = LabelEncoder()

In [14]:
# the object-typed (string) columns that must be converted to numbers before modelling
categorical_columns = ['Car_Name','Fuel_Type', 'Selling_type', 'Transmission']

In [15]:
# encode each categorical column in the dataset
def to_numerical(data, categorical_columns, encoder):
    """Return a copy of ``data`` with the given columns replaced by encoded values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    categorical_columns : iterable of str
        Names of the columns to encode.
    encoder : object with a ``fit_transform`` method
        Called once per column, so it is refit for every column. With
        scikit-learn's ``LabelEncoder`` this means that, after the call, the
        encoder only retains the mapping learned for the last column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where every listed column holds
        the values returned by ``encoder.fit_transform``.
    """
    # work on a copy so the caller's frame (and anything already displayed
    # from it) is left untouched and the cell can be re-run safely
    encoded = data.copy()
    for column in categorical_columns:
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded

In [16]:
# encode the categorical columns; `dataset` is rebound to the encoded frame, which is then displayed
dataset = to_numerical(dataset, categorical_columns, encoder)
dataset

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,90,2014,3.35,5.59,27000,2,0,1,0
1,93,2013,4.75,9.54,43000,1,0,1,0
2,68,2017,7.25,9.85,6900,2,0,1,0
3,96,2011,2.85,4.15,5200,2,0,1,0
4,92,2014,4.60,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...,...,...
296,69,2016,9.50,11.60,33988,1,0,1,0
297,66,2015,4.00,5.90,60000,2,0,1,0
298,69,2009,3.35,11.00,87934,2,0,1,0
299,69,2017,11.50,12.50,9000,1,0,1,0


### Splitting the Dataset

In [17]:
# split the data into features (X) and target (y)
# `columns=` already selects the axis to drop from, so no `axis` argument is needed
X = dataset.drop(columns='Selling_Price')
y = dataset['Selling_Price']

In [18]:
X.dtypes

Car_Name           int32
Year               int64
Present_Price    float64
Driven_kms         int64
Fuel_Type          int32
Selling_type       int32
Transmission       int32
Owner              int64
dtype: object

In [19]:
y.dtype

dtype('float64')

Change the data type of the encoded columns from int32 to int64 so that all integer columns share the same data type

In [20]:
# change the data type from int32 to int64
def change_dtype(data, categorical_columns):
    """Return a copy of ``data`` with the given columns cast to ``int64``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to cast. It is not modified.
    categorical_columns : iterable of str
        Names of the columns to cast to ``int64``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where every
        listed column has dtype ``int64``.
    """
    # a single astype call with a column -> dtype mapping returns a new frame
    # instead of assigning into the caller's frame one column at a time
    return data.astype({column: 'int64' for column in categorical_columns})

In [21]:
# cast the encoded categorical columns of X to int64 and display the result
X = change_dtype(X, categorical_columns)
X

Unnamed: 0,Car_Name,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,90,2014,5.59,27000,2,0,1,0
1,93,2013,9.54,43000,1,0,1,0
2,68,2017,9.85,6900,2,0,1,0
3,96,2011,4.15,5200,2,0,1,0
4,92,2014,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...,...
296,69,2016,11.60,33988,1,0,1,0
297,66,2015,5.90,60000,2,0,1,0
298,69,2009,11.00,87934,2,0,1,0
299,69,2017,12.50,9000,1,0,1,0


In [22]:
X.dtypes

Car_Name           int64
Year               int64
Present_Price    float64
Driven_kms         int64
Fuel_Type          int64
Selling_type       int64
Transmission       int64
Owner              int64
dtype: object

Splitting the dataset into test and train

In [23]:
# hold out 25% of the rows as the test set; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [24]:
X_train

Unnamed: 0,Car_Name,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
146,20,2014,0.787,15000,2,1,1,0
260,69,2016,13.600,29223,2,0,1,0
37,0,2003,2.280,127000,2,1,1,0
273,87,2010,7.500,61203,2,0,1,0
164,28,2016,0.540,14000,2,1,1,0
...,...,...,...,...,...,...,...,...
251,69,2013,9.900,56701,2,0,1,0
192,24,2007,0.750,49000,2,1,1,1
117,49,2015,1.900,14000,2,1,1,0
47,96,2006,4.150,65000,2,0,1,0


In [25]:
X_test

Unnamed: 0,Car_Name,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
223,94,2015,9.400,61381,1,0,1,0
150,52,2011,0.826,6000,2,1,1,0
226,82,2015,5.700,24678,2,0,1,0
296,69,2016,11.600,33988,1,0,1,0
52,86,2017,19.770,15000,1,0,0,0
...,...,...,...,...,...,...,...,...
46,90,2013,4.890,64532,2,0,1,0
158,38,2017,0.540,8600,2,1,1,0
230,94,2013,9.400,45000,1,0,1,0
179,39,2010,1.050,213000,2,1,1,0


In [26]:
y_train

146    0.55
260    9.15
37     0.35
273    2.25
164    0.45
       ... 
251    5.00
192    0.20
117    1.10
47     1.05
172    0.40
Name: Selling_Price, Length: 225, dtype: float64

In [27]:
y_test

223     8.25
150     0.50
226     5.25
296     9.50
52     18.00
       ...  
46      2.65
158     0.48
230     6.15
179     0.31
27      6.00
Name: Selling_Price, Length: 76, dtype: float64

### Model Development
For this problem, the RandomForestRegressor model will be used

In [28]:
# initialising the model
model = RandomForestRegressor(n_estimators=40, random_state=0)

In [29]:
# fitting the model
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=40, random_state=0)

In [30]:
# make predictions
prediction = model.predict(X_test)

In [31]:
prediction

array([ 7.035  ,  0.45   ,  4.67125,  8.80975, 14.751  ,  5.2875 ,
        3.1525 ,  0.455  ,  3.83   ,  4.745  ,  3.03   ,  0.75625,
        4.855  ,  6.935  ,  7.60875, 14.7485 ,  6.5335 ,  4.0575 ,
        0.49725,  1.59   ,  2.84125,  5.1225 ,  5.0075 , 10.961  ,
        0.21425,  0.71025,  0.3315 ,  0.7345 ,  0.4675 ,  4.535  ,
        2.23625,  6.07875,  0.483  ,  7.29375,  3.2475 ,  1.164  ,
        5.51875,  5.3    ,  0.26225,  7.49125,  8.5375 , 20.44125,
        5.01875,  4.44125,  5.76375, 11.23975,  0.259  ,  0.761  ,
        5.1875 ,  6.6925 ,  6.7985 ,  3.08875,  5.125  , 22.8375 ,
        1.1705 ,  1.118  ,  0.48375,  2.62   ,  3.71375,  2.11   ,
        5.72   ,  6.0425 ,  3.0375 , 22.94375,  4.335  ,  5.77   ,
        9.5685 ,  5.7425 ,  0.4545 ,  2.96625,  2.8425 ,  2.93125,
        0.462  ,  5.3925 ,  0.496  ,  5.265  ])

In [32]:
# R^2 of the predictions on the held-out test set.
# The result is stored under its own name: assigning it to `r2_score` would replace
# the imported function with a float and make this cell fail when it is run again.
test_r2 = r2_score(y_true=y_test, y_pred=prediction)
test_r2

0.9294654285142236