# Building the Model

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Importing our clean dataset
df = pd.read_csv('cleaned_car_data.csv')
df

Unnamed: 0,name,selling_price,km_driven,fuel,transmission,owner,mileage,engine,max_power,torque,age
0,Maruti,5400.0,145500,Diesel,Manual,1,23.40,1248.0,74.00,190.0,9
1,Skoda,4440.0,120000,Diesel,Manual,2,21.14,1498.0,103.52,250.0,9
2,Honda,1896.0,140000,Petrol,Manual,3,17.70,1497.0,78.00,12.7,17
3,Hyundai,2700.0,127000,Diesel,Manual,1,23.00,1396.0,90.00,22.4,13
4,Maruti,1560.0,120000,Petrol,Manual,1,16.10,1298.0,88.20,11.5,16
...,...,...,...,...,...,...,...,...,...,...,...
6711,Maruti,3120.0,50000,Petrol,Manual,2,18.90,998.0,67.10,90.0,10
6712,Hyundai,5700.0,80000,Diesel,Manual,2,22.54,1396.0,88.73,219.7,9
6713,Hyundai,3840.0,110000,Petrol,Manual,1,18.50,1197.0,82.85,113.7,10
6714,Hyundai,1620.0,119000,Diesel,Manual,4,16.80,1493.0,110.00,24.0,16


In [3]:
#Encoding categorical data
e_name = LabelEncoder()
e_fuel = LabelEncoder()
e_transmission = LabelEncoder()

In [4]:
df['name'] = e_name.fit_transform(df['name'])
df['fuel'] = e_fuel.fit_transform(df['fuel'])
df['transmission'] = e_transmission.fit_transform(df['transmission'])

In [5]:
df.head(10)

Unnamed: 0,name,selling_price,km_driven,fuel,transmission,owner,mileage,engine,max_power,torque,age
0,20,5400.0,145500,0,1,1,23.4,1248.0,74.0,190.0,9
1,26,4440.0,120000,0,1,2,21.14,1498.0,103.52,250.0,9
2,10,1896.0,140000,2,1,3,17.7,1497.0,78.0,12.7,17
3,11,2700.0,127000,0,1,1,23.0,1396.0,90.0,22.4,13
4,20,1560.0,120000,2,1,1,16.1,1298.0,88.2,11.5,16
5,11,5280.0,45000,2,1,1,20.14,1197.0,81.86,113.75,6
6,20,1152.0,175000,1,1,1,17.3,1061.0,57.5,7.8,16
7,20,540.0,5000,2,1,2,16.1,796.0,37.0,59.0,22
8,28,4200.0,90000,0,1,1,23.59,1364.0,67.1,170.0,12
9,9,2400.0,169000,0,1,1,20.0,1399.0,68.1,160.0,10


In [6]:
df[df['name'] == 0]

Unnamed: 0,name,selling_price,km_driven,fuel,transmission,owner,mileage,engine,max_power,torque,age
2974,0,900.0,90000,0,1,2,12.8,1489.0,35.5,72.9,23
3978,0,1188.0,100000,0,1,2,12.8,1995.0,52.0,106.0,29
4167,0,1464.0,60000,0,1,2,12.8,1995.0,52.0,106.0,15
6514,0,2400.0,80000,0,1,3,13.5,1995.0,52.0,106.0,15


In [7]:
#Looking for correlation
corr = df.corr()
corr

Unnamed: 0,name,selling_price,km_driven,fuel,transmission,owner,mileage,engine,max_power,torque,age
name,1.0,-0.041741,0.094086,-0.135344,0.056751,0.01066,-0.04419,0.104687,-0.074094,0.026149,0.030037
selling_price,-0.041741,1.0,-0.16138,-0.249654,-0.465436,-0.241565,-0.108429,0.44264,0.692283,0.616859,-0.427395
km_driven,0.094086,-0.16138,1.0,-0.281941,0.119367,0.258275,-0.196312,0.253361,0.041585,0.048714,0.387916
fuel,-0.135344,-0.249654,-0.281941,1.0,-0.02288,-0.01733,-0.082319,-0.531945,-0.320579,-0.613286,0.052761
transmission,0.056751,-0.465436,0.119367,-0.02288,1.0,0.088983,0.173038,-0.218354,-0.441137,-0.29382,0.144189
owner,0.01066,-0.241565,0.258275,-0.01733,0.088983,1.0,-0.187215,0.030576,-0.062707,-0.119669,0.489824
mileage,-0.04419,-0.108429,-0.196312,-0.082319,0.173038,-0.187215,1.0,-0.578923,-0.378289,-0.132158,-0.366085
engine,0.104687,0.44264,0.253361,-0.531945,-0.218354,0.030576,-0.578923,1.0,0.683237,0.611222,0.019705
max_power,-0.074094,0.692283,0.041585,-0.320579,-0.441137,-0.062707,-0.378289,0.683237,1.0,0.743225,-0.159995
torque,0.026149,0.616859,0.048714,-0.613286,-0.29382,-0.119669,-0.132158,0.611222,0.743225,1.0,-0.263141


In [8]:
#Definig the target variable
X = df[[ 'name', 'km_driven', 'fuel', 'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque', 'age']]
y = np.log(df['selling_price'])

In [9]:
#Scaling the data
scale = StandardScaler()
X = scale.fit_transform(X)
X

array([[ 0.3298181 ,  1.22836757, -0.90874683, ..., -0.43375489,
         0.31856513, -0.09975205],
       [ 1.1996054 ,  0.79396697, -0.90874683, ...,  0.49707529,
         0.97364957, -0.09975205],
       [-1.11982741,  1.13467332,  1.11484194, ..., -0.30762614,
        -1.61720939,  1.95290137],
       ...,
       [-0.97486286,  0.62361379,  1.11484194, ..., -0.15469503,
        -0.51448392,  0.15682963],
       [-0.97486286,  0.77693165, -0.90874683, ...,  0.70140387,
        -1.49383515,  1.69631969],
       [ 0.3298181 ,  0.79396697, -0.90874683, ..., -0.43690811,
         0.31856513,  1.18315634]])

## Choosing the best model

### Linear Regression

In [10]:
#Calling the Linear Regression Model
mult_reg = linear_model.LinearRegression()

mult_reg.fit(X, y)

In [11]:
#Defininf the size of the sample for training and testing
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [12]:
y_train.shape

(4701,)

In [13]:
y_test.shape

(2015,)

In [14]:
#Applying the Linear Regression to our train set
mult_reg = LinearRegression().fit(X_train, y_train)

In [15]:
#Predicting values
y_pred= mult_reg.predict(X_test)  
x_pred= mult_reg.predict(X_train) 

In [16]:
#Comparison between the real and the predicted values
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
reg_model_diff

Unnamed: 0,Actual value,Predicted value
2386,9.487972,9.110737
6154,9.137770,8.962735
3279,8.571681,8.719051
2694,8.188689,8.623340
1745,9.309279,9.153637
...,...,...
489,8.243808,8.379927
5203,8.881836,9.022975
3990,8.794825,8.745876
2758,8.794825,8.608012


In [17]:
#Calculating the error
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', rmse)

Mean Absolute Error: 0.2231168909886933
Mean Square Error: 0.08558142848248987
Root Mean Square Error: 0.2925430369748866


### Decision Tree Regressor

In [18]:
#Calling the Decision Tree Regresor Model
model_tree = DecisionTreeRegressor(random_state=100)
model_tree.fit(X_train, y_train)
predictions = model_tree.predict(X_test)

In [19]:
#Predictions
y_pred_tree = model_tree.predict(X_test)  
x_pred_tree = model_tree.predict(X_train) 

In [20]:
#Comparing
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_tree})
reg_model_diff

Unnamed: 0,Actual value,Predicted value
2386,9.487972,9.392662
6154,9.137770,9.169518
3279,8.571681,8.371011
2694,8.188689,8.205084
1745,9.309279,9.206332
...,...,...
489,8.243808,8.006368
5203,8.881836,8.830543
3990,8.794825,8.669053
2758,8.794825,8.767173


In [21]:
#Size of the error
mae_tree = metrics.mean_absolute_error(y_test, y_pred_tree)
mse_tree = metrics.mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)

print('Mean Absolute Error:', mae_tree)
print('Mean Square Error:', mse_tree)
print('Root Mean Square Error:', rmse_tree)

Mean Absolute Error: 0.20401814087911857
Mean Square Error: 0.08147001268005694
Root Mean Square Error: 0.28542952314022624


### Forest Tree Regressor

In [22]:
#Forest Tree Regressor
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train, y_train)
forest_pred = forest_model.predict(X_test)

In [23]:
#Predicting
y_pred_forest = forest_model.predict(X_test)  
x_pred_forest = forest_model.predict(X_train) 

In [24]:
#Difference of real and predicted values
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_forest})
reg_model_diff

Unnamed: 0,Actual value,Predicted value
2386,9.487972,9.345773
6154,9.137770,9.078579
3279,8.571681,8.489618
2694,8.188689,8.228849
1745,9.309279,9.136780
...,...,...
489,8.243808,8.264324
5203,8.881836,8.863575
3990,8.794825,8.853561
2758,8.794825,8.821520


In [25]:
#Size of error
mae_forest = metrics.mean_absolute_error(y_test, y_pred_forest)
mse_forest = metrics.mean_squared_error(y_test, y_pred_forest)
rmse_forest = np.sqrt(mse_forest)

print('Mean Absolute Error:', mae_forest)
print('Mean Square Error:', mse_forest)
print('Root Mean Square Error:', rmse_forest)

Mean Absolute Error: 0.15554058696002984
Mean Square Error: 0.046300807770354696
Root Mean Square Error: 0.21517622491891314


In [26]:
#Comparing the models
best_model = pd.DataFrame({
    'model': ['Multiple_Linear_Regression', 'Tree_Regressor', 'Forest_Regressor'],
    
    'mae': [mae, mae_tree, mae_forest],
    
    'mse': [mse, mse_tree, mse_forest],
    
    'rmse': [rmse, rmse_tree, rmse_forest] })
best_model

Unnamed: 0,model,mae,mse,rmse
0,Multiple_Linear_Regression,0.223117,0.085581,0.292543
1,Tree_Regressor,0.204018,0.08147,0.28543
2,Forest_Regressor,0.155541,0.046301,0.215176


## Making predictions with new data

In [27]:
#Creating a new set of variables for a car
new_car = [['Volkswagen', 141343.855, 'Petrol', 'Manual', 1, 31, 1395, 150, 280, 7]]

In [28]:
#Saving the new data in array
new_car = np.array(new_car)
new_car

array([['Volkswagen', '141343.855', 'Petrol', 'Manual', '1', '31',
        '1395', '150', '280', '7']], dtype='<U32')

In [29]:
#Encoding new data
new_car[:,0] = e_name.transform(new_car[:,0])
new_car[:,2] = e_fuel.transform(new_car[:,2])
new_car[:,3] = e_transmission.transform(new_car[:,3])
new_car

array([['29', '141343.855', '2', '1', '1', '31', '1395', '150', '280',
        '7']], dtype='<U32')

In [30]:
#Scaling the new data
new_car = scale.fit_transform(new_car)
new_car

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [36]:
#Calculating the price
price = forest_model.predict(new_car)
#Converting the price to the actual price, droping the logarithm
act_price = np.exp(price) 
act_price = round(act_price[0])

In [37]:
print(f"The estimated price of the car is {act_price:,} ")

The estimated price of the car is 6,275 
