# Import required libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as ttsp

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load the car-price dataset; the relative path is resolved against the current working directory

car = pd.read_csv("Car prices.csv")

# Preview the first five rows
car.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,Audi,100,1992,regular unleaded,172.0,6.0,MANUAL,front wheel drive,4.0,Luxury,Midsize,Sedan,24,17,3105,2000
1,Audi,100,1992,regular unleaded,172.0,6.0,MANUAL,front wheel drive,4.0,Luxury,Midsize,Sedan,24,17,3105,2000
2,Audi,100,1992,regular unleaded,172.0,6.0,AUTOMATIC,all wheel drive,4.0,Luxury,Midsize,Wagon,20,16,3105,2000
3,Audi,100,1992,regular unleaded,172.0,6.0,MANUAL,front wheel drive,4.0,Luxury,Midsize,Sedan,24,17,3105,2000
4,Audi,100,1992,regular unleaded,172.0,6.0,MANUAL,all wheel drive,4.0,Luxury,Midsize,Sedan,21,16,3105,2000


## Dropping unneeded columns

In [3]:
# List every column available in the loaded dataset

car.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [4]:
# Columns required for the model: six predictors followed by the target (MSRP)
cols = [
    'Year',
    'Engine HP',
    'Engine Cylinders',
    'Driven_Wheels',
    'Vehicle Size',
    'Vehicle Style',
    'MSRP',
]

In [5]:
# Keep only the required columns. The explicit .copy() gives `car` its own data,
# so later cells can modify it without pandas raising SettingWithCopyWarning.
car = car[cols].copy()

# Preview the reduced frame
car.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Driven_Wheels,Vehicle Size,Vehicle Style,MSRP
0,1992,172.0,6.0,front wheel drive,Midsize,Sedan,2000
1,1992,172.0,6.0,front wheel drive,Midsize,Sedan,2000
2,1992,172.0,6.0,all wheel drive,Midsize,Wagon,2000
3,1992,172.0,6.0,front wheel drive,Midsize,Sedan,2000
4,1992,172.0,6.0,all wheel drive,Midsize,Sedan,2000


In [6]:
# Number of rows and columns, as a (rows, columns) tuple

car.shape

(11907, 7)

## Null Values

In [7]:
# Summary of each column: non-null count and dtype (reveals which columns contain nulls)
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11907 entries, 0 to 11906
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              11907 non-null  int64  
 1   Engine HP         11838 non-null  float64
 2   Engine Cylinders  11877 non-null  float64
 3   Driven_Wheels     11907 non-null  object 
 4   Vehicle Size      11907 non-null  object 
 5   Vehicle Style     11907 non-null  object 
 6   MSRP              11907 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 651.3+ KB


In [8]:
# Count the missing values in each column

car.isna().sum()

Year                 0
Engine HP           69
Engine Cylinders    30
Driven_Wheels        0
Vehicle Size         0
Vehicle Style        0
MSRP                 0
dtype: int64

In [9]:
# Drop every row that contains a null value. The result is rebound to `car`
# instead of using inplace=True: mutating a column-subset frame in place can
# raise SettingWithCopyWarning, and reassignment keeps the step explicit.

car = car.dropna()

In [10]:
# car.reset_index(inplace=True)

In [11]:
# Number of rows and columns remaining after the null rows were dropped
car.shape

(11809, 7)

In [12]:
# Double-check for null values: every column's non-null count should now equal the row count

car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11809 entries, 0 to 11906
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              11809 non-null  int64  
 1   Engine HP         11809 non-null  float64
 2   Engine Cylinders  11809 non-null  float64
 3   Driven_Wheels     11809 non-null  object 
 4   Vehicle Size      11809 non-null  object 
 5   Vehicle Style     11809 non-null  object 
 6   MSRP              11809 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 738.1+ KB


## Categorical data

In [13]:
# Determine the data type of each column (object columns hold the text categories)

car.dtypes

Year                  int64
Engine HP           float64
Engine Cylinders    float64
Driven_Wheels        object
Vehicle Size         object
Vehicle Style        object
MSRP                  int64
dtype: object

In [14]:
# Replace every object-dtype (text) column with its integer category codes.
# NOTE(review): the codes follow pandas' category ordering and carry no real
# magnitude, yet a linear model treats them as ordinal. One-hot encoding may
# fit better; confirm before changing, since later cells rely on these codes.
for name, dtype in car.dtypes.items():
    if dtype == 'object':
        car[name] = car[name].astype('category').cat.codes

car.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Driven_Wheels,Vehicle Size,Vehicle Style,MSRP
0,1992,172.0,6.0,2,2,14,2000
1,1992,172.0,6.0,2,2,14,2000
2,1992,172.0,6.0,0,2,15,2000
3,1992,172.0,6.0,2,2,14,2000
4,1992,172.0,6.0,0,2,14,2000


In [15]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max) for every column

car.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Driven_Wheels,Vehicle Size,Vehicle Style,MSRP
count,11809.0,11809.0,11809.0,11809.0,11809.0,11809.0,11809.0
mean,2010.361335,249.192142,5.645271,1.768736,0.97087,8.356169,39738.089762
std,7.595904,108.399374,1.741147,1.066926,0.875555,4.91372,48597.980379
min,1990.0,55.0,0.0,0.0,0.0,0.0,2000.0
25%,2007.0,170.0,4.0,1.0,0.0,3.0,20984.0
50%,2015.0,227.0,6.0,2.0,1.0,8.0,29950.0
75%,2016.0,300.0,6.0,3.0,2.0,14.0,42175.0
max,2017.0,750.0,12.0,3.0,2.0,15.0,548800.0


In [16]:
# Pairwise correlation between the columns; the MSRP row/column shows how each feature relates to price

car.corr()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Driven_Wheels,Vehicle Size,Vehicle Style,MSRP
Year,1.0,0.356608,-0.033895,-0.183961,0.104014,-0.068255,0.285782
Engine HP,0.356608,1.0,0.77681,-0.032372,0.211849,0.013216,0.73743
Engine Cylinders,-0.033895,0.77681,1.0,0.090093,0.197599,0.034338,0.597108
Driven_Wheels,-0.183961,-0.032372,0.090093,1.0,-0.131051,0.102605,-0.032922
Vehicle Size,0.104014,0.211849,0.197599,-0.131051,1.0,0.163964,0.059022
Vehicle Style,-0.068255,0.013216,0.034338,0.102605,0.163964,1.0,-0.011815
MSRP,0.285782,0.73743,0.597108,-0.032922,0.059022,-0.011815,1.0


## declaring x and y

In [17]:
# Independent variables (features) for the model: every selected column except the MSRP target

x = car.loc[:, [
    'Year',
    'Engine HP',
    'Engine Cylinders',
    'Driven_Wheels',
    'Vehicle Size',
    'Vehicle Style',
]]

x.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Driven_Wheels,Vehicle Size,Vehicle Style
0,1992,172.0,6.0,2,2,14
1,1992,172.0,6.0,2,2,14
2,1992,172.0,6.0,0,2,15
3,1992,172.0,6.0,2,2,14
4,1992,172.0,6.0,0,2,14


In [18]:
# Dependent variable (target): the MSRP column

y = car['MSRP']

# Preview the first five target values
y.head()

0    2000
1    2000
2    2000
3    2000
4    2000
Name: MSRP, dtype: int64

## Train-test split

In [19]:
# Split the data into a training set and a test set, holding out 20% of the rows
# for testing. The fixed random_state makes the split, and therefore every metric
# computed from it, reproducible across re-runs.

x_train, x_test, y_train, y_test = ttsp(x, y, test_size=0.2, random_state=42)

In [20]:
# Shape of the training features: (rows, columns)
x_train.shape

(9447, 6)

In [21]:
# Shape of the training target: one value per training row
y_train.shape

(9447,)

In [22]:
# Shape of the test features: (rows, columns)
x_test.shape

(2362, 6)

In [23]:
# Shape of the test target: one value per test row
y_test.shape

(2362,)

## Standardization

In [24]:
# Standardize the features: subtract the mean and divide by the standard deviation.
# The scaler learns its statistics from the training set only; the test set is then
# transformed with those same statistics.

std = StandardScaler().fit(x_train)

x_train_std = std.transform(x_train)
x_test_std = std.transform(x_test)

In [25]:
# Inspect the standardized training features (a NumPy array, one row per sample)
x_train_std

array([[-1.8918757 , -0.96020794,  0.20771613,  1.15541588,  1.16945694,
         0.73689342],
       [ 0.87526241, -1.19047381, -0.94725401,  0.22113412, -1.11198028,
        -1.29659146],
       [ 0.87526241,  1.21350186,  1.36268627, -0.71314764,  0.02873833,
         0.12684796],
       ...,
       [-0.96949633, -1.00626111, -0.94725401,  0.22113412,  1.16945694,
         1.1435904 ],
       [ 0.74349393,  1.66482296,  1.36268627,  1.15541588, -1.11198028,
        -0.48319751],
       [ 0.61172544,  0.50428298,  0.20771613,  0.22113412,  0.02873833,
         1.1435904 ]])

In [26]:
# Inspect the standardized test features (a NumPy array, one row per sample)
x_test_std

array([[ 0.08465152, -0.82204842, -0.94725401,  0.22113412,  1.16945694,
         1.1435904 ],
       [ 0.61172544,  0.20954267, -0.94725401, -1.6474294 , -1.11198028,
         1.1435904 ],
       [-0.31065392, -0.38914859,  0.20771613, -0.71314764, -1.11198028,
         0.33019644],
       ...,
       [-0.17888544, -0.2878316 , -0.94725401,  1.15541588, -1.11198028,
        -0.48319751],
       [-0.4424224 ,  0.42138727,  1.36268627,  1.15541588,  0.02873833,
         0.33019644],
       [-0.04711696, -0.91415477, -0.36976894,  0.22113412, -1.11198028,
        -1.70328844]])

## model training

In [27]:
# Create the Linear Regression estimator and fit it on the standardized training
# data. fit() returns the estimator itself, so `reg` is the trained regressor.

reg = LinearRegression().fit(x_train_std, y_train)

reg

LinearRegression()

In [28]:
# Predict prices for the standardized test features
y_pred = reg.predict(x_test_std)

y_pred

array([ 2073.9105731 , 49852.9117414 , 34975.27899128, ...,
       28751.21914809, 58035.59251687, 15482.84249346])

In [29]:
# Actual prices of the test rows, for comparison with the predictions
y_test

3138      21599
9308      44660
4824      26590
3726      23490
10102     52250
          ...  
11051     82500
11421    126995
10279     54990
5540      28705
2815      20390
Name: MSRP, Length: 2362, dtype: int64

In [30]:
# Mean absolute error between the actual and predicted prices, in the same units as MSRP
mean_absolute_error(y_test, y_pred)

18225.21121534552

In [31]:
# R² (coefficient of determination) of the predictions on the test set
r2_score(y_test, y_pred)

0.5692169578092574

## Predict price

In [32]:
# Details of the used car whose fair price is to be predicted.
# A one-row DataFrame with named columns (in the same order as the training
# features) documents what each value means and keeps the feature names the
# scaler was fitted with, instead of relying on positional order alone.
# The three categorical fields hold the integer codes produced by the
# encoding step, not the original text labels.

new_car = pd.DataFrame([{
    'Year': 2010,
    'Engine HP': 180,
    'Engine Cylinders': 4.0,
    'Driven_Wheels': 2,
    'Vehicle Size': 1,
    'Vehicle Style': 12,
}])

In [33]:
# Scale the new car's features with the scaler already fitted on the training data
new_car_std = std.transform( new_car )

new_car_std

array([[-0.04711696, -0.63783572, -0.94725401,  0.22113412,  0.02873833,
         0.73689342]])

In [34]:
# Predicted fair price for the new car (a one-element array)

reg.predict(new_car_std)

array([13417.26868538])

In [35]:
# Hence, based on the prediction, the fair price of the car is approximately 13417.27.