In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
car = pd.read_csv("CarPrice_Assignment.csv")

In [3]:
car.shape

(205, 26)

In [4]:
car.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [5]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [6]:
car.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [7]:
car[['fueltype','price']]

Unnamed: 0,fueltype,price
0,gas,13495.0
1,gas,16500.0
2,gas,16500.0
3,gas,13950.0
4,gas,17450.0
...,...,...
200,gas,16845.0
201,gas,19045.0
202,gas,21485.0
203,diesel,22470.0


In [8]:
car['fueltype'].unique()

array(['gas', 'diesel'], dtype=object)

In [9]:
car[['aspiration','price']]

Unnamed: 0,aspiration,price
0,std,13495.0
1,std,16500.0
2,std,16500.0
3,std,13950.0
4,std,17450.0
...,...,...
200,std,16845.0
201,turbo,19045.0
202,std,21485.0
203,turbo,22470.0


In [10]:
car['aspiration'].unique()

array(['std', 'turbo'], dtype=object)

In [11]:
car[['carbody','price']].sample(20)

Unnamed: 0,carbody,price
96,sedan,7499.0
131,hatchback,9895.0
24,hatchback,6229.0
109,wagon,12440.0
202,sedan,21485.0
94,sedan,7299.0
140,hatchback,7603.0
6,sedan,17710.0
190,hatchback,9980.0
177,hatchback,11248.0


In [12]:
car['carbody'].unique()

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

In [13]:
car[['doornumber','price']]

Unnamed: 0,doornumber,price
0,two,13495.0
1,two,16500.0
2,two,16500.0
3,four,13950.0
4,four,17450.0
...,...,...
200,four,16845.0
201,four,19045.0
202,four,21485.0
203,four,22470.0


In [14]:
car[['drivewheel','price']].head(20)

Unnamed: 0,drivewheel,price
0,rwd,13495.0
1,rwd,16500.0
2,rwd,16500.0
3,fwd,13950.0
4,4wd,17450.0
5,fwd,15250.0
6,fwd,17710.0
7,fwd,18920.0
8,fwd,23875.0
9,4wd,17859.167


In [15]:
car[['enginelocation','price']].value_counts()

enginelocation  price  
front           5572.0     2
                6229.0     2
                6692.0     2
                7775.0     2
                7295.0     2
                          ..
                41315.0    1
                45400.0    1
rear            32528.0    1
                34028.0    1
                37028.0    1
Name: count, Length: 189, dtype: int64

In [16]:
car[['enginetype','price']].value_counts().head(60)

enginetype  price  
ohc         5572.0     2
            7609.0     2
            6692.0     2
            6229.0     2
            7295.0     2
            8495.0     2
            7957.0     2
            7898.0     2
ohcv        13499.0    2
ohc         8916.5     2
            8921.0     2
            9279.0     2
            8845.0     2
dohc        9298.0     1
            13495.0    1
            9538.0     1
l           12440.0    1
            13200.0    1
            15580.0    1
            13860.0    1
            16695.0    1
            16900.0    1
            17075.0    1
            16630.0    1
dohc        18150.0    1
            18620.0    1
            32250.0    1
            35550.0    1
            15750.0    1
            15998.0    1
            16500.0    1
            15690.0    1
ohc         6095.0     1
            5499.0     1
            5399.0     1
            5389.0     1
            6377.0     1
            6479.0     1
            6295.0     1
     

In [17]:
car[['cylindernumber','price']].value_counts()

cylindernumber  price  
four            5572.0     2
                6229.0     2
                7295.0     2
                7609.0     2
                7775.0     2
                          ..
twelve          36000.0    1
two             10945.0    1
                11845.0    1
                13645.0    1
                15645.0    1
Name: count, Length: 190, dtype: int64

In [18]:
car[['fuelsystem','price']]

Unnamed: 0,fuelsystem,price
0,mpfi,13495.0
1,mpfi,16500.0
2,mpfi,16500.0
3,mpfi,13950.0
4,mpfi,17450.0
...,...,...
200,mpfi,16845.0
201,mpfi,19045.0
202,mpfi,21485.0
203,idi,22470.0


In [19]:
car['symboling'].value_counts()

symboling
 0    67
 1    54
 2    32
 3    27
-1    22
-2     3
Name: count, dtype: int64

In [20]:
car[['symboling','price']]

Unnamed: 0,symboling,price
0,3,13495.0
1,3,16500.0
2,1,16500.0
3,2,13950.0
4,2,17450.0
...,...,...
200,-1,16845.0
201,-1,19045.0
202,-1,21485.0
203,-1,22470.0


In [21]:
car[['compressionratio','price']]

Unnamed: 0,compressionratio,price
0,9.0,13495.0
1,9.0,16500.0
2,9.0,16500.0
3,10.0,13950.0
4,8.0,17450.0
...,...,...
200,9.5,16845.0
201,8.7,19045.0
202,8.8,21485.0
203,23.0,22470.0


In [22]:
car[['boreratio','price']]

Unnamed: 0,boreratio,price
0,3.47,13495.0
1,3.47,16500.0
2,2.68,16500.0
3,3.19,13950.0
4,3.19,17450.0
...,...,...
200,3.78,16845.0
201,3.78,19045.0
202,3.58,21485.0
203,3.01,22470.0


In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse_output=False,drop='first', handle_unknown='ignore'),['fueltype','carbody','drivewheel','fuelsystem']),
    ('tnf2',OrdinalEncoder(categories=[['std','turbo'],['two','four'],['front','rear'],[ 'dohcv','ohc','rotor','ohcv','ohcf','l','dohc'],['twelve','two','three','four','six','five','eight']]),['aspiration','doornumber','enginelocation','enginetype','cylindernumber'])
],remainder='passthrough')


In [78]:
df_new = car[['horsepower','wheelbase','carlength','carwidth','curbweight','enginesize','stroke','boreratio','citympg','highwaympg','fueltype','aspiration','carbody','doornumber','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem','price']]
# wheelbase, carlength, carwidth, curbweight, enginesize, boreratio, stroke, horsepower, fueltype, aspiration, doornumber, drivewheel, enginelocation, enginetype, cylindernumber

In [79]:
df_new.shape

(205, 20)

In [80]:
print(df_new.shape)

(205, 20)


In [81]:
X = df_new.iloc[0:,0:-1]
X

Unnamed: 0,horsepower,wheelbase,carlength,carwidth,curbweight,enginesize,stroke,boreratio,citympg,highwaympg,fueltype,aspiration,carbody,doornumber,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,111,88.6,168.8,64.1,2548,130,2.68,3.47,21,27,gas,std,convertible,two,rwd,front,dohc,four,mpfi
1,111,88.6,168.8,64.1,2548,130,2.68,3.47,21,27,gas,std,convertible,two,rwd,front,dohc,four,mpfi
2,154,94.5,171.2,65.5,2823,152,3.47,2.68,19,26,gas,std,hatchback,two,rwd,front,ohcv,six,mpfi
3,102,99.8,176.6,66.2,2337,109,3.40,3.19,24,30,gas,std,sedan,four,fwd,front,ohc,four,mpfi
4,115,99.4,176.6,66.4,2824,136,3.40,3.19,18,22,gas,std,sedan,four,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,114,109.1,188.8,68.9,2952,141,3.15,3.78,23,28,gas,std,sedan,four,rwd,front,ohc,four,mpfi
201,160,109.1,188.8,68.8,3049,141,3.15,3.78,19,25,gas,turbo,sedan,four,rwd,front,ohc,four,mpfi
202,134,109.1,188.8,68.9,3012,173,2.87,3.58,18,23,gas,std,sedan,four,rwd,front,ohcv,six,mpfi
203,106,109.1,188.8,68.9,3217,145,3.40,3.01,26,27,diesel,turbo,sedan,four,rwd,front,ohc,six,idi


In [82]:
X.shape

(205, 19)

In [83]:
y = df_new['price']
y.shape

(205,)

In [84]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [85]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(164, 19)
(41, 19)
(164,)
(41,)


In [86]:
X_train.shape

(164, 19)

In [87]:
X_train_trf = transformer.fit_transform(X_train)

In [88]:
transformer.fit_transform(X_train).shape

(164, 28)

In [89]:
X_test_trf = transformer.transform(X_test)



In [90]:
transformer.transform(X_test).shape



(41, 28)

In [91]:
X_train_trf

array([[ 1.  ,  0.  ,  0.  , ...,  3.46, 16.  , 18.  ],
       [ 1.  ,  0.  ,  1.  , ...,  3.13, 16.  , 22.  ],
       [ 1.  ,  0.  ,  1.  , ...,  2.97, 37.  , 41.  ],
       ...,
       [ 1.  ,  0.  ,  0.  , ...,  3.46, 16.  , 18.  ],
       [ 1.  ,  0.  ,  0.  , ...,  3.62, 16.  , 22.  ],
       [ 1.  ,  1.  ,  0.  , ...,  3.62, 24.  , 30.  ]])

In [92]:
X_test_trf

array([[ 1.  ,  0.  ,  0.  , ...,  2.97, 31.  , 38.  ],
       [ 1.  ,  0.  ,  1.  , ...,  3.59, 19.  , 24.  ],
       [ 1.  ,  0.  ,  0.  , ...,  3.33, 27.  , 34.  ],
       ...,
       [ 0.  ,  0.  ,  0.  , ...,  3.01, 33.  , 38.  ],
       [ 0.  ,  0.  ,  0.  , ...,  3.01, 37.  , 46.  ],
       [ 1.  ,  0.  ,  0.  , ...,  3.46, 19.  , 24.  ]])

In [93]:
lr = LinearRegression()

In [94]:
lr.fit(X_train_trf,y_train)

In [95]:
y_pred = lr.predict(X_test_trf)

In [96]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [97]:
r2 = r2_score(y_pred,y_test)

In [98]:
r2

0.8330123996594726