In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [2]:
mpg_df=pd.read_csv('car-mpg.csv')
mpg_df=mpg_df.drop('car_name',axis=1)
mpg_df['origin']=mpg_df['origin'].replace({1:'america',2:'europe',3:'asia'})
mpg_df=pd.get_dummies(mpg_df,columns=['origin'])
mpg_df=mpg_df.replace('?',np.nan)
mpg_df=mpg_df.apply(lambda x:x.fillna(x.median()),axis=0)

In [3]:
mpg_df.tail()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
393,27.0,4,140.0,86,2790,15.6,82,1,1,0,0
394,44.0,4,97.0,52,2130,24.6,82,1,0,0,1
395,32.0,4,135.0,84,2295,11.6,82,1,1,0,0
396,28.0,4,120.0,79,2625,18.6,82,1,1,0,0
397,31.0,4,119.0,82,2720,19.4,82,1,1,0,0


## Separate Independent and Dependent Variable

In [4]:
X=mpg_df.drop('mpg',axis=1)
y=mpg_df[['mpg']]


In [5]:
from sklearn import preprocessing
X_scaled=preprocessing.scale(X)
X_scaled=pd.DataFrame(X_scaled,columns=X.columns)
y_scaled=preprocessing.scale(y)
y_scaled=pd.DataFrame(y_scaled,columns=y.columns)

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y_scaled,test_size=0.3,random_state=1)

## Simple Linear Model

In [7]:
regression_model=LinearRegression()
regression_model.fit(X_train,y_train)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 0.3210223856916108
The coefficient for disp is 0.3248343091848391
The coefficient for hp is -0.2291695005943759
The coefficient for wt is -0.7112101905072294
The coefficient for acc is 0.014713682764191108
The coefficient for yr is 0.3755811949510748
The coefficient for car_type is 0.3814769484233101
The coefficient for origin_america is -0.07472247547584153
The coefficient for origin_asia is 0.04451525203567814
The coefficient for origin_europe is 0.0483485495394537


In [8]:
intercept=regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.01928411610363976


## Regularized RIDGE Model

In [9]:
ridge=Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print("Ridge Model:",(ridge.coef_))

Ridge Model: [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


## Regularized LASSO model

In [10]:
lasso=Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print("Lasso model:",(lasso.coef_))

Lasso model: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


## Compare Scores

In [11]:
print(regression_model.score(X_train,y_train))
print(regression_model.score(X_test,y_test))

0.8343770256960538
0.8513421387780066


In [12]:
print(ridge.score(X_train,y_train))
print(ridge.score(X_test,y_test))

0.8343617931312616
0.8518882171608506


In [13]:
print(lasso.score(X_train,y_train))
print(lasso.score(X_test,y_test))

0.7938010766228453
0.8375229615977083


## Polynomial Models

In [30]:
from sklearn.preprocessing import PolynomialFeatures

In [31]:
poly=PolynomialFeatures(degree=2,interaction_only=True)

In [33]:
X_poly=poly.fit_transform(X_scaled)
X_train,X_test,y_train,y_test=train_test_split(X_poly,y,test_size=0.3,random_state=1)
X_train.shape

(278, 56)

## Simple non-regularized linear model on poly features

In [41]:
regression_model.fit(X_train,y_train)
print(regression_model.coef_[0])

[ 3.24082770e-13 -1.14204220e+12 -4.43738735e+00 -2.24947964e+00
 -2.98166341e+00 -1.56730367e+00  3.00442772e+00 -1.52060575e+12
 -7.80788356e+11  3.71375223e+12 -3.23609457e+12 -1.15918732e+00
 -1.43925476e+00 -3.57818604e-03  2.58444214e+00 -1.91918182e+00
 -3.65891647e+12 -6.45319147e+12 -2.39436996e+12 -2.28543203e+12
  3.90441895e-01  2.09503174e-01 -4.23446655e-01  3.58471680e+00
 -2.02703094e+00 -9.03672940e+11 -7.44778888e+11 -7.10893285e+11
  2.47772217e-01 -6.70440674e-01 -1.92620850e+00 -7.47558594e-01
 -2.15947171e+11 -1.77976884e+11 -1.69879374e+11 -1.72500610e-01
  5.30212402e-01 -3.32050323e+00  1.69388998e+12  1.39605098e+12
  1.33253411e+12  5.85876465e-01  1.53894043e+00  4.76389633e+11
  3.92625390e+11  3.74761903e+11  4.00207520e-01 -1.27131857e+10
 -1.04778089e+10 -1.00010944e+10 -1.09798815e+12  8.13175594e+11
  7.76178109e+11  2.20248210e+11 -5.15971535e+12  2.83957085e+12]


In [42]:
ridge=Ridge(alpha=0.3)
ridge.fit(X_train,y_train)
print("Ridge model:",(ridge.coef_))

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [43]:
print(ridge.score(X_train,y_train))
print(ridge.score(X_test,y_test))

0.9143225702003365
0.8613398053698541


In [44]:
lasso=Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
print("Lasso model:",(lasso.coef_))

Lasso model: [ 0.          0.52263805 -0.5402102  -1.99423315 -4.55360385 -0.85285179
  2.99044036  0.00711821 -0.          0.76073274 -0.         -0.
 -0.19736449  0.          2.04221833 -1.00014513  0.         -0.
  4.28412669 -0.          0.          0.31442062 -0.          2.13894094
 -1.06760107  0.         -0.          0.          0.         -0.44991392
 -1.55885506 -0.         -0.68837902  0.          0.17455864 -0.34653644
  0.3313704  -2.84931966  0.         -0.34340563  0.00815105  0.47019445
  1.25759712 -0.69634581  0.          0.55528147  0.2948979  -0.67289549
  0.06490671  0.         -1.19639935  1.06711702  0.         -0.88034391
  0.         -0.        ]


In [45]:
print(lasso.score(X_train,y_train))
print(lasso.score(X_test,y_test))

0.9098286193898272
0.8695296858772456
