In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Exercises

There are three exercises in this notebook:

1. Use the cross-validation method to test the linear regression with different $\alpha$ values, at least three.
2. Implement a SGD method that will train the Lasso regression for 10 epochs.
3. Extend the Fisher's classifier to work with two features. Use the class as the $y$.

## 1. Cross-validation linear regression

You need to change the variable ``alpha`` to be a list of alphas. Next do a loop and finally compare the results.

In [16]:
x = np.array([188, 181, 197, 168, 167, 187, 178, 194, 140, 176, 168, 192, 173, 142, 176]).reshape(-1, 1).reshape(15,1)
y = np.array([141, 106, 149, 59, 79, 136, 65, 136, 52, 87, 115, 140, 82, 69, 121]).reshape(-1, 1).reshape(15,1)

x = np.asmatrix(np.c_[np.ones((15,1)),x])

I = np.identity(2)
alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]

results = []
for alpha in alpha_values:
        
    w = np.linalg.inv(x.T*x + alpha * I)*x.T*y
    w = w.ravel()
        
    results.append([alpha, w.item(0), w.item(1)])

res = pd.DataFrame(results, columns=["alpha", "w0", "w1"])
res

Unnamed: 0,alpha,w0,w1
0,0.01,-167.85534,1.54416
1,0.05,-130.22804,1.33115
2,0.1,-101.723971,1.169788
3,0.5,-36.97522,0.803242
4,1.0,-20.590447,0.710486
5,5.0,-4.528023,0.619551
6,10.0,-2.291063,0.606881


In [17]:
results = []
kf = KFold(n_splits=3, shuffle=True, random_state=42)
for alpha in alpha_values:
    mse_scores = []
    
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        w = np.linalg.inv(x_train.T*x_train + alpha * I)*x_train.T*y_train
        w = w.ravel()
        
        y_pred = x_test @ w.T
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_scores.append(mse)
        
    results.append([alpha, np.mean(mse_scores)])

res = pd.DataFrame(results, columns=["alpha", "mean_mse_score"])
res

Unnamed: 0,alpha,mean_mse_score
0,0.01,484.703538
1,0.05,527.177853
2,0.1,575.702601
3,0.5,701.479815
4,1.0,733.796792
5,5.0,765.175398
6,10.0,769.512525


## 2. Implement based on the Ridge regression example, the Lasso regression.

Please implement the SGD method and compare the results with the sklearn Lasso regression results. 

In [418]:
from sklearn.linear_model import Lasso
from sklearn import preprocessing

In [477]:
def sgd(x, y, max_iter, lr=0.1):
    m, n = x.shape
    w = np.ones((n,1))
    b = 1.0
    clip_value = 1.0
    
    for _ in range(max_iter):
        err = y - x @ w - b
        gradient_w = -(2 / m) * x.T @ err / m
        gradient_b = -(2 / m) * sum(err)
        
        grad_norm = np.sqrt(np.sum(np.square(gradient_w)) + np.square(gradient_b))
        if grad_norm > clip_value:
            scale = clip_value / grad_norm
            gradient_w *= scale
            gradient_b *= scale
        
        w -= lr * gradient_w
        b -= lr * gradient_b
    return b, w

In [478]:
x = np.array([188, 181, 197, 168, 167, 187, 178, 194, 140, 176, 168, 192, 173, 142, 176]).reshape(-1, 1).reshape(15,1)
y = np.array([141, 106, 149, 59, 79, 136, 65, 136, 52, 87, 115, 140, 82, 69, 121]).reshape(-1, 1).reshape(15,1)

x = np.asmatrix(np.c_[np.ones((15,1)),x])

alpha = 0.1

In [479]:
lasso = Lasso(alpha=alpha)
lasso.fit(np.asarray(x), y)

In [484]:
b0, w0 = sgd(x, y, max_iter=100)
b1, w1 = sgd(x, y, max_iter=1000)
b2, w2 = sgd(x, y, max_iter=10000)
b3, w3 = sgd(x, y, max_iter=100000)

In [485]:
results = pd.DataFrame({
    'method': ['Sklearn Lasso', 'SGD (100 iter)', 'SGD (1000 iter)', 'SGD (10000 iter)', 'SGD (100000 iter)'],
    'w0 (intercept)': [lasso.intercept_, b0.item(), b1.item(), b2.item(), b3.item()],
    'w1 (slope)': [lasso.coef_[1], w0[1].item(), w1[1].item(), w2[1].item(), w3[1].item()],
})
results

Unnamed: 0,method,w0 (intercept),w1 (slope)
0,Sklearn Lasso,[-180.85790859980537],1.617765
1,SGD (100 iter),0.779604,0.619078
2,SGD (1000 iter),-4.106609,0.708611
3,SGD (10000 iter),-80.095093,1.169474
4,SGD (100000 iter),-166.583096,1.673168


## 3. Extend the Fisher's classifier

Please extend the targets of the ``iris_data`` variable and use it as the $y$.

In [500]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

iris_data = load_iris()
iris_df = pd.DataFrame(iris_data.data,columns=iris_data.feature_names)
iris_df['species'] = iris_data.target

x1 = iris_df['sepal width (cm)'].values
x2 = iris_df['sepal length (cm)'].values
x = np.column_stack((x1, x2))
y = iris_df['type'].values.reshape(-1, 1)

dataset_size = np.size(x)

mean_x, mean_y = np.mean(x), np.mean(y)

SS_xy = np.sum(y * x) - dataset_size * mean_y * mean_x
SS_xx = np.sum(x * x) - dataset_size * mean_x * mean_x

a = SS_xy / SS_xx
b = mean_y - a * mean_x


y_pred = a * x + b

In [501]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [504]:
res = pd.DataFrame(y_pred, columns=["0", "1"])
res

Unnamed: 0,0,1
0,0.924785,1.051418
1,0.885212,1.035589
2,0.901042,1.019760
3,0.893127,1.011845
4,0.932700,1.043504
...,...,...
145,0.885212,1.178051
146,0.845640,1.146393
147,0.885212,1.162222
148,0.916871,1.138479
