# Multiple Linear Regression- Car dataset

## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [3]:
ds=pd.read_csv('CAR.csv')

In [4]:
ds.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


### Split the dataset into independent and Dependent variables

In [5]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


In [9]:
X=ds[['year', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner']]
y=ds[['selling_price']]

In [6]:
ds.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

### Work with the catagorical data

In [7]:
ds['fuel'].value_counts()

Diesel      2151
Petrol      2122
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64

In [11]:
ds['seller_type'].value_counts()

Individual          3242
Dealer               993
Trustmark Dealer     102
Name: seller_type, dtype: int64

In [12]:
ds['transmission'].value_counts()

Manual       3889
Automatic     448
Name: transmission, dtype: int64

In [13]:
ds['owner'].value_counts()

First Owner             2831
Second Owner            1104
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64

In [15]:
X=pd.get_dummies(ds[['year', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner']],
                drop_first=True)

In [16]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_rest, y_train, y_rest= train_test_split(X,y,
                                                  test_size=.2,
                                                  random_state=20)

In [20]:
X_test, X_val, y_test, y_val=train_test_split(X_rest,y_rest,
                                                  test_size=.5,
                                                  random_state=20)

## Training the Multiple Linear Regression model on the Training set

In [21]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train.values, y_train)

LinearRegression()

## Intercept and Coefficient

In [23]:
print('The coefficients are:', regressor.coef_)
print('The intercept is:', regressor.intercept_)

The coefficients are: [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  1.89174898e-10
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
The intercept is: [-71683645.58006911]


In [24]:
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

## Predicting the Test set results

## selling price= -71683645.58006911+ year(3.62245575e+04)+km_driven(-8.31584195e-01)+fuel_Diesel(2.88702246e+05)+fuel_Electric(1.89174898e-10)+fuel_LPG(4.65464254e+04)+fuel_Petro(2.64057958e+03)+seller_type_Individual(-6.04980455e+04)+'seller_type_Trustmark Dealer(1.71882689e+05)+transmission_Manual(-8.64323880e+05)+owner_Fourth & Above Owner(-3.50851884e+03)+owner_Second Owner(-4.04890692e+04)+owner_Test Drive Car(1.83178786e+05)+owner_Third Owner( -2.83903020e+04)

### Calculate RMSE, R-Square

In [28]:
y_pred=regressor.predict(X_test.values)

In [30]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print(f'The r-square is {r2_score(y_test, y_pred): .2f}')
print(f'The RMSE is {math.sqrt(mean_squared_error(y_test, y_pred)): .2f}')

The r-square is  0.54
The RMSE is  392646.28


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [34]:
y_predval=regressor.predict(X_val.values)

In [35]:
y_validate=y_val.to_numpy()

In [36]:
np.hstack((y_predval, y_validate))

array([[ 280943.85990559,  350000.        ],
       [ 821100.12192747,  780000.        ],
       [ 465915.677426  ,  434999.        ],
       [ 644135.25563619,  425000.        ],
       [ 425421.76671462,  280000.        ],
       [1317666.22883591,  850000.        ],
       [ 163472.75913742,  120000.        ],
       [ 357444.30497424,  250000.        ],
       [ 465344.37372941,  198000.        ],
       [ 182292.02908249,  120000.        ],
       [ 432081.00592296,  350000.        ],
       [ 464254.23321456,  125000.        ],
       [ 146019.33954768,  225000.        ],
       [ 236403.46050055,  300000.        ],
       [ 582333.88804127,  490000.        ],
       [1310281.53682841,  750000.        ],
       [ 163472.75913742,  175000.        ],
       [ 289259.70185719,  164000.        ],
       [ 601308.66958538,  525000.        ],
       [ 511497.22024338,  550000.        ],
       [ 302980.84107736,  229999.        ],
       [ 477192.70902458,  350000.        ],
       [ 4

In [37]:
X_test.head(1)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
2549,2018,26000,1,0,0,0,0,0,1,0,0,0,0


In [42]:
#My predicted result
regressor.predict([[2014,70000,1,0,0,0,0,0,1,0,0,0,0]])

array([[638780.60333155]])