In [1]:
from IPython.display import set_matplotlib_formats, display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_boston
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso

## Linear Regression

In [2]:
# Synthetic 1-D regression problem: a single input feature and a numeric target.
# The target follows sin(4x) + x, with Gaussian noise of standard deviation 0.5.
rnd = np.random.RandomState(2022)
x = rnd.uniform(-3, 3, size=100)
y = (np.sin(4 * x) + x) + rnd.normal(size=x.size) / 2
# Estimators take a 2-D (n_samples, n_features) array, so add a feature axis.
X = x[:, np.newaxis]
# To eyeball the data: plt.plot(X, y, 'o'); plt.xlim(-4, 4)

# Hold out part of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2022)
print(f"train: {X_train.shape} {y_train.shape}")
print(f"test: {X_test.shape} {y_test.shape}")

# Ordinary least squares: one slope (w) and one intercept (b).
lr = LinearRegression()
lr.fit(X_train, y_train)
print(f"w: {lr.coef_}, b: {lr.intercept_}")

# score() reports the coefficient of determination (R^2) on the given split.
print(f"R^2 on training set: {lr.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {lr.score(X_test, y_test):.2f}")

train: (75, 1) (75,)
test: (25, 1) (25,)
w: [0.89595497], b: 0.05924948767021938
R^2 on training set: 0.82
R^2 on test set: 0.78


## Polynomial Regression

In [3]:

# Boston housing: scale the 13 raw features to [0, 1], expand them to all
# degree-2 polynomial / interaction terms, then fit ordinary least squares.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases -- confirm the pinned version.
boston = load_boston()
print(boston['feature_names'])
print(boston['data'].shape, type(boston['data']))
print(boston['target'].shape, type(boston['target']))

# NOTE(review): the scaler is fitted on every row *before* the train/test
# split, so min/max statistics of the test rows leak into the training
# features. Fitting it on X_train only would be cleaner, but would shift the
# Ridge/Lasso scores recorded in the cells below -- left as-is on purpose.
X = MinMaxScaler().fit_transform(boston.data)
# include_bias=False: the constant column is dropped because the linear models
# below fit their own intercept.
X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
y = boston.target

# These X_train / X_test / y_train / y_test replace the synthetic-data split
# from the previous cell and are reused by the Ridge and Lasso cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2022)
print("train:", X_train.shape, y_train.shape)
print("test:", X_test.shape, y_test.shape)

# Unregularised baseline on the expanded feature set.
lr = LinearRegression().fit(X_train, y_train)
print(f"R^2 on training set: {lr.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {lr.score(X_test, y_test):.2f}")

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
(506, 13) <class 'numpy.ndarray'>
(506,) <class 'numpy.ndarray'>
train: (379, 104) (379,)
test: (127, 104) (127,)
R^2 on training set: 0.94
R^2 on test set: 0.81


## Ridge Regression

In [4]:
# Ridge (L2-penalised least squares) with the library-default alpha
# (1.0 per the scikit-learn docs), fitted on the polynomial Boston features.
ridge = Ridge()
ridge.fit(X_train, y_train)
print(f"R^2 on training set: {ridge.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {ridge.score(X_test, y_test):.2f}")

R^2 on training set: 0.87
R^2 on test set: 0.81


In [5]:
# Ridge with a larger penalty (alpha=10), for comparison with the default fit.
ridge10 = Ridge(alpha=10)
ridge10.fit(X_train, y_train)
print(f"R^2 on training set: {ridge10.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {ridge10.score(X_test, y_test):.2f}")

R^2 on training set: 0.77
R^2 on test set: 0.75


In [6]:
# Ridge with a much smaller penalty (alpha=0.01), for comparison with the
# default and alpha=10 fits.
ridge001 = Ridge(alpha=0.01)
ridge001.fit(X_train, y_train)
print(f"R^2 on training set: {ridge001.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {ridge001.score(X_test, y_test):.2f}")

R^2 on training set: 0.94
R^2 on test set: 0.81


## Lasso Regression

In [7]:
# Lasso (L1-penalised least squares) with the library-default alpha
# (1.0 per the scikit-learn docs), fitted on the polynomial Boston features.
lasso = Lasso()
lasso.fit(X_train, y_train)
print(f"R^2 on training set: {lasso.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {lasso.score(X_test, y_test):.2f}")
# Count the coefficients that were not driven exactly to zero.
print("Number of features used:", (lasso.coef_ != 0).sum())

R^2 on training set: 0.21
R^2 on test set: 0.22
Number of features used: 4


In [8]:
# Lasso with a smaller penalty (alpha=0.01).
# max_iter is raised to 100000 -- presumably so the solver can converge at
# this weaker penalty; confirm against any ConvergenceWarning at the default.
lasso001 = Lasso(alpha=0.01, max_iter=100000)
lasso001.fit(X_train, y_train)
print(f"R^2 on training set: {lasso001.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {lasso001.score(X_test, y_test):.2f}")
# Count the coefficients that were not driven exactly to zero.
print("Number of features used:", (lasso001.coef_ != 0).sum())

R^2 on training set: 0.89
R^2 on test set: 0.78
Number of features used: 30


In [9]:
# Lasso with a very small penalty (alpha=0.0001).
# max_iter is raised to 100000 -- presumably so the solver can converge at
# this weak penalty; confirm against any ConvergenceWarning at the default.
lasso00001 = Lasso(alpha=0.0001, max_iter=100000)
lasso00001.fit(X_train, y_train)
print(f"R^2 on training set: {lasso00001.score(X_train, y_train):.2f}")
print(f"R^2 on test set: {lasso00001.score(X_test, y_test):.2f}")
# Count the coefficients that were not driven exactly to zero.
print("Number of features used:", (lasso00001.coef_ != 0).sum())

R^2 on training set: 0.94
R^2 on test set: 0.82
Number of features used: 96
