In [1]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [2]:
mpg_df = pd.read_csv("auto-mpg.csv")  
mpg_df = mpg_df.drop('car name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)
mpg_df['horsepower']=mpg_df['horsepower'].astype(np.float16)

In [3]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cylinders       398 non-null    int64  
 2   displacement    398 non-null    float64
 3   horsepower      398 non-null    float16
 4   weight          398 non-null    int64  
 5   acceleration    398 non-null    float64
 6   model year      398 non-null    int64  
 7   origin_america  398 non-null    uint8  
 8   origin_asia     398 non-null    uint8  
 9   origin_europe   398 non-null    uint8  
dtypes: float16(1), float64(3), int64(3), uint8(3)
memory usage: 20.7 KB


# separate independent and dependent variables

In [4]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [5]:
from sklearn import preprocessing

# scale all the columns of the mpg_df. This will produce a numpy array
X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)  # ideally the training and test should be 

y_scaled = preprocessing.scale(y)
y_scaled = pd.DataFrame(y_scaled, columns=y.columns)  # ideally the training and test should be 

In [6]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.30, random_state=1)

# fit a simple linear model

In [7]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cylinders is -0.08592264254448734
The coefficient for displacement is 0.3861501766895437
The coefficient for horsepower is -0.10637514644618916
The coefficient for weight is -0.7965737428612097
The coefficient for acceleration is 0.02184681331891979
The coefficient for model year is 0.3959410531014954
The coefficient for origin_america is -0.09399896644893509
The coefficient for origin_asia is 0.044917890138051704
The coefficient for origin_europe is 0.07243059852959383


In [8]:
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.015510225561902383


# Create a regularized RIDGE model and note the coefficients

In [13]:
ridge = Ridge(alpha=.3)
   

In [15]:
ridge.fit(X_train,y_train)
 

TypeError: solve() got an unexpected keyword argument 'sym_pos'

# Create a regularized LASSO model and note the coefficients

In [16]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))

# Observe, many of the coefficients have become 0 indicating drop of those dimensions from the model

Lasso model: [-0.         -0.         -0.01464723 -0.60711757  0.          0.29460087
 -0.04017427  0.          0.        ]


## Let us compare their scores

In [17]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8141025501610559
0.8433135132808832


In [18]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

AttributeError: 'Ridge' object has no attribute 'coef_'

In [19]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.7878910251573478
0.8315130533007058


In [None]:
# More or less similar results but with less complex models.  Complexity is a function of variables and coefficients
## Note - with Lasso, we get equally good result in test though not so in training.  Further, the number of dimensions is much less
# in LASSO model than ridge or un-regularized model

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly = PolynomialFeatures(degree = 2, interaction_only=True)

#poly = PolynomialFeatures(2)

In [None]:
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
X_train.shape

# Fit a simple non regularized linear model on poly features-

In [None]:
regression_model.fit(X_train, y_train)
print(regression_model.coef_[0])


In [None]:
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))

In [None]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))


In [None]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))


In [None]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
