# Multiple Linear Regression- Car dataset

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install scikit-learn



## Importing the dataset

In [2]:
car = pd.read_csv('CAR.csv')

In [3]:
car.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
#missing value
car.isnull().sum()

year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [5]:
# explore the categorical data 
#fuel
car['fuel'].value_counts()

fuel
Diesel      2151
Petrol      2122
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [6]:
#seller_type
car['fuel'].value_counts()

fuel
Diesel      2151
Petrol      2122
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [7]:
#owner
car['owner'].value_counts()

owner
First Owner             2831
Second Owner            1104
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

In [8]:
#transmission
car['transmission'].value_counts()

transmission
Manual       3889
Automatic     448
Name: count, dtype: int64

### Split the dataset into independent and Dependent variables

In [9]:
car. columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [10]:
# 2D array
x=car[['year', 'km_driven','fuel','seller_type','transmission','owner']]
y=car[['selling_price']]

In [11]:
type(x)

pandas.core.frame.DataFrame

In [12]:
type(y)

pandas.core.frame.DataFrame

### Work with the categorical data

In [13]:
#pd.get_dummies
x_new=pd.get_dummies(car[['year','km_driven','fuel','seller_type','transmission','owner']])

In [14]:
x_new=x_new.astype(int)

In [15]:
x_new.head(30)

Unnamed: 0,year,km_driven,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
1,2007,50000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
2,2012,100000,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0
3,2017,46000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
4,2014,141000,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0
5,2007,125000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
6,2016,25000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
7,2014,60000,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0
8,2015,25000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
9,2017,78000,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0


#### 
- fuel_CNG - 0000
- fuel_Petrol - 0001
- fuel_LPG - 0010
- fuel_Electric - 0100
- fuel_Diesel - 1000

## Splitting the dataset into the Training set and Test set
- Random State 20

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_new,y,test_size=.20,random_state=0)

## Training the Multiple Linear Regression model on the Training set

In [17]:
from sklearn.linear_model import LinearRegression

regressor=LinearRegression()
#.fit to train your model
regressor.fit(x_train.values,y_train)

## Intercept and Coefficient

In [18]:
print(f"Coefficient: {regressor.coef_}")

Coefficient: [[ 3.51072652e+04 -9.05011984e-01  6.82464420e+04  3.43066321e+05
  -5.57208380e+05  9.51434892e+04  5.07521282e+04 -3.29620064e+04
  -1.16441463e+05  1.49403470e+05  4.32450146e+05 -4.32450146e+05
  -1.79314326e+04 -1.77718265e+04 -6.04580971e+04  1.50612009e+05
  -5.44506533e+04]]


In [19]:
print(f"Intercept: {regressor.intercept_}")

Intercept: [-69840884.88011284]


## Predicting the Test set results

In [20]:
y_pred=regressor.predict(x_test.values)

### Calculate RMSE, R-Square

In [21]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [22]:
print(F"MSE: {mean_squared_error(y_test,y_pred):.2f}")

MSE: 159138339901.29


In [23]:
print(F"RMSE: {math.sqrt(mean_squared_error(y_test,y_pred)):.2f}")

RMSE: 398921.47


In [24]:
print(f"R-square: {r2_score(y_test,y_pred):.2f}")

R-square: 0.48


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [25]:
x_test.head()

Unnamed: 0,year,km_driven,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
2088,2018,25000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
1191,2017,10000,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0
3402,2015,23800,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0
555,2017,19890,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0
1127,2015,46000,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0


In [27]:
regressor.predict([[2014,70000,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0]])

array([[661519.09973817]])