In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

In [2]:
# Load the car listings from the CSV next to the notebook. Missing entries
# are still the literal '?' strings at this point.
df = pd.read_csv(filepath_or_buffer='cars.csv')
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohc,141,114,23,28,16845
201,-1,95,volvo,gas,sedan,rwd,front,68.8,55.5,ohc,141,160,19,25,19045
202,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohcv,173,134,18,23,21485
203,-1,95,volvo,diesel,sedan,rwd,front,68.9,55.5,ohc,145,106,26,27,22470


In [3]:
# Turn the '?' placeholders into real missing values (NaN).
# The result is rebound to `df` instead of using inplace=True, which pandas
# discourages; the resulting frame is the same and the cell stays safe to
# re-run.
df = df.replace('?', np.nan)

In [4]:
# Display the frame again to check that the '?' entries now show as missing.
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohc,141,114,23,28,16845
201,-1,95,volvo,gas,sedan,rwd,front,68.8,55.5,ohc,141,160,19,25,19045
202,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohcv,173,134,18,23,21485
203,-1,95,volvo,diesel,sedan,rwd,front,68.9,55.5,ohc,145,106,26,27,22470


In [5]:
# Fill the missing values in the numeric columns that carried '?' placeholders.
# The columns are addressed by name (they were positions 1 and 11) so the cell
# keeps working if the column order of the CSV changes.
num_cols_with_missing = ['normalized-losses', 'horsepower']

# SimpleImputer's default strategy replaces NaN with the column mean.
si = SimpleImputer()

# Assign with df[cols] = ... rather than df.iloc[:, cols] = ...: from pandas 2.0
# an iloc assignment writes into the existing object-dtype columns in place, so
# they would stay `object` and later be mistaken for categorical columns.
# df[cols] = ... replaces the columns, giving them the imputer's float dtype.
df[num_cols_with_missing] = si.fit_transform(df[num_cols_with_missing])
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,sedan,rwd,front,68.9,55.5,ohc,141,114.0,23,28,16845
201,-1,95.0,volvo,gas,sedan,rwd,front,68.8,55.5,ohc,141,160.0,19,25,19045
202,-1,95.0,volvo,gas,sedan,rwd,front,68.9,55.5,ohcv,173,134.0,18,23,21485
203,-1,95.0,volvo,diesel,sedan,rwd,front,68.9,55.5,ohc,145,106.0,26,27,22470


In [6]:
# Whatever is still stored with object dtype is treated as a categorical
# feature that needs encoding before modelling.
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [7]:
# Replace every category label with a numeric code, column by column, and
# write the codes back over the original string columns.
# NOTE(review): ordinal codes impose an arbitrary order on nominal features
# such as 'make' -- confirm this is acceptable for the linear models below.
oe = OrdinalEncoder()
encoded_cats = oe.fit_transform(df[cat_cols])
df[cat_cols] = encoded_cats
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,13495
1,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,16500
2,1,122.0,0.0,1.0,2.0,2.0,0.0,65.5,52.4,5.0,152,154.0,19,26,16500
3,2,164.0,1.0,1.0,3.0,1.0,0.0,66.2,54.3,3.0,109,102.0,24,30,13950
4,2,164.0,1.0,1.0,3.0,0.0,0.0,66.4,54.3,3.0,136,115.0,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,21.0,1.0,3.0,2.0,0.0,68.9,55.5,3.0,141,114.0,23,28,16845
201,-1,95.0,21.0,1.0,3.0,2.0,0.0,68.8,55.5,3.0,141,160.0,19,25,19045
202,-1,95.0,21.0,1.0,3.0,2.0,0.0,68.9,55.5,5.0,173,134.0,18,23,21485
203,-1,95.0,21.0,0.0,3.0,2.0,0.0,68.9,55.5,3.0,145,106.0,26,27,22470


In [8]:
# Separate the feature matrix from the regression target.
# Both sides are defined through the target's name: previously the features
# were "every column but the last" while the target was picked by name, so a
# reordered frame would have leaked 'price' into x.
x = df.drop(columns='price')
y = df['price']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [12]:
# Baseline: ordinary least squares fitted on the training split.
lr = LinearRegression().fit(xtrain, ytrain)  # fit() returns the estimator itself
ypred = lr.predict(xtest)

# score() on a regressor is the R^2 coefficient of determination, not a
# classification accuracy, despite the wording of the printed message.
lr_train_score = lr.score(xtrain, ytrain)
lr_test_score = lr.score(xtest, ytest)
print(f"Train Accuracy is {lr_train_score} \nTest Accuracy is {lr_test_score}")

Train Accuracy is 0.8539412613914049 
Test Accuracy is 0.7440263258545711


In [13]:
# Fitted weights of the baseline model, one per feature column of x.
lr.coef_

array([ 8.10627829e+01, -4.98250716e+00, -1.93628369e+02, -3.96939528e+02,
       -1.59308387e+02,  2.03453658e+03,  1.61641371e+04,  8.12358117e+02,
        2.86594382e+02,  3.04469374e+02,  9.72915986e+01, -1.24843308e+01,
        2.37997595e+02, -3.56739524e+02])

# Lasso Regression

In [22]:
# L1-regularised regression with a penalty strength of alpha=1.
l1 = Lasso(alpha=1).fit(xtrain, ytrain)  # fit() returns the estimator itself

# The printed "Accuracy" values are R^2 scores from score().
lasso_train_score = l1.score(xtrain, ytrain)
lasso_test_score = l1.score(xtest, ytest)
print(f"Train Accuracy is {lasso_train_score} \nTest Accuracy is {lasso_test_score}")

Train Accuracy is 0.8539398244720607 
Test Accuracy is 0.7442680796367558


In [23]:
# Weights of the Lasso model currently bound to l1, for comparison with the
# unregularised coefficients.
l1.coef_

array([ 8.10099235e+01, -4.98244205e+00, -1.93459787e+02, -3.89872260e+02,
       -1.60587747e+02,  2.03168757e+03,  1.60855755e+04,  8.10235442e+02,
        2.87249208e+02,  3.04384733e+02,  9.73692186e+01, -1.23519330e+01,
        2.36917278e+02, -3.55445582e+02])

# Hyperparameter Tuning

In [24]:
# Sweep integer penalty strengths 100..199 and print, per value:
#   alpha  train-score  test-score
# NOTE(review): comparing alphas on the test split means the reported test
# score is no longer an unbiased estimate -- consider cross-validating on the
# training data instead.
for alpha in range(100, 200):
    l1 = Lasso(alpha=alpha).fit(xtrain, ytrain)
    train_score = l1.score(xtrain, ytrain)
    test_score = l1.score(xtest, ytest)

    print(f"{alpha} {train_score} {test_score}")

100 0.8399550254990269 0.7644150726044205
101 0.8396765068564949 0.7645910215883885
102 0.8393951129870751 0.7647659882285331
103 0.8391110603761689 0.7649401532453524
104 0.8388242397338377 0.7651134250740712
105 0.8385345349342124 0.7652857085507111
106 0.8382421762634075 0.7654571927954633
107 0.8379470482259248 0.7656277826674934
108 0.837649029554768 0.7657973803678717
109 0.8373483618984794 0.7659661812330151
110 0.8370449238236125 0.7661340868339707
111 0.836738589066682 0.7663009968898453
112 0.8364296146136767 0.7664671159710184
113 0.8361177485059503 0.7666322443886752
114 0.8358032437723879 0.7667965815573106
115 0.8354858300186775 0.7669599153505952
116 0.8351657814298425 0.7671224598759211
117 0.8348428153561449 0.7672839955065129
118 0.8345172182613106 0.7674447438786604
119 0.8341887040017569 0.7676044846703385
120 0.8338575681406851 0.7677634444292694
121 0.833523511083555 0.7679213945163487
122 0.8331868353897822 0.7680785647942531
123 0.832847230094472 0.7682347200504

In [25]:
# Refit Lasso with the selected penalty strength.
# NOTE(review): alpha=170 was presumably picked from the sweep over
# 100..199 -- confirm.
l1 = Lasso(alpha=170).fit(xtrain, ytrain)

tuned_lasso_train = l1.score(xtrain, ytrain)
tuned_lasso_test = l1.score(xtest, ytest)
print(f"Train Accuracy is {tuned_lasso_train} \nTest Accuracy is {tuned_lasso_test}")

Train Accuracy is 0.813762160321102 
Test Accuracy is 0.7745657699036098


# Ridge Regression

In [26]:
# L2-regularised regression with scikit-learn's default penalty strength.
l2 = Ridge().fit(xtrain, ytrain)  # fit() returns the estimator itself

# The printed "Accuracy" values are R^2 scores from score().
ridge_train_score = l2.score(xtrain, ytrain)
ridge_test_score = l2.score(xtest, ytest)
print(f"Train Accuracy is {ridge_train_score} \nTest Accuracy is {ridge_test_score}")

Train Accuracy is 0.8482674089811111 
Test Accuracy is 0.7597037544007509


In [27]:
# Sweep integer penalty strengths 1..49 and print, per value:
#   alpha  train-score  test-score
# NOTE(review): as with any sweep judged on the test split, the best test
# score found here is optimistically biased.
for alpha in range(1, 50):
    l2 = Ridge(alpha=alpha).fit(xtrain, ytrain)
    train_score = l2.score(xtrain, ytrain)
    test_score = l2.score(xtest, ytest)

    print(f"{alpha} {train_score} {test_score}")

1 0.8482674089811111 0.7597037544007509
2 0.8415619363937635 0.7655470158349814
3 0.8363951508249314 0.7685522366994493
4 0.8324610677838388 0.7703895039721603
5 0.8293863299288738 0.7716361765232881
6 0.8269124234824408 0.7725405637365281
7 0.8248693933233863 0.7732264194061074
8 0.8231441159269753 0.7737624277305333
9 0.82165935802416 0.77418998914559
10 0.8203609531530248 0.7745357446711172
11 0.8192099407731408 0.7748177686083774
12 0.8181776487192544 0.7750488663114246
13 0.8172425389038229 0.7752384410630868
14 0.8163881323933204 0.7753936067841302
15 0.8156016116243241 0.7755198811711728
16 0.8148728578870452 0.7756216343440712
17 0.814193775120706 0.775702389170018
18 0.8135578061085905 0.7757650283597382
19 0.812959580531318 0.7758119410917473
20 0.8123946550311475 0.7758451292906464
21 0.8118593185518815 0.7758662862953722
22 0.8113504446953665 0.7758768561943044
23 0.8108653784189215 0.7758780793379442
24 0.8104018481402175 0.7758710277800905
25 0.8099578968650145 0.77585663

In [29]:
# Refit Ridge with the selected penalty strength.
# NOTE(review): alpha=10 was presumably chosen from the sweep -- confirm.
l2 = Ridge(alpha=10).fit(xtrain, ytrain)

tuned_ridge_train = l2.score(xtrain, ytrain)
tuned_ridge_test = l2.score(xtest, ytest)
print(f"Train Accuracy is {tuned_ridge_train} \nTest Accuracy is {tuned_ridge_test}")

Train Accuracy is 0.8203609531530248 
Test Accuracy is 0.7745357446711172


# Elastic Net

In [30]:
# Elastic Net (combined L1 and L2 penalty) with scikit-learn's default settings.
en = ElasticNet().fit(xtrain, ytrain)  # fit() returns the estimator itself

# The printed "Accuracy" values are R^2 scores from score().
en_train_score = en.score(xtrain, ytrain)
en_test_score = en.score(xtest, ytest)
print(f"Train Accuracy is {en_train_score} \nTest Accuracy is {en_test_score}")

Train Accuracy is 0.7969654339953139 
Test Accuracy is 0.7722146104160097
