# Section 2

In [17]:
import cvxpy as cp
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

In [18]:
print('To use color, just append color.BOLD to the beginning of the printed string and color.END to the end:')
print(color.BOLD + 'Like This!' + color.END)

To use color, just append color.BOLD to the beginning of the printed string and color.END to the end:
[1mLike This![0m


Implement the mean absolute error:
$$
MAE = \frac{1}{N}\sum_{i=1}^N |y_i-x_i^\top\theta|
$$

In [19]:
def get_MAE(theta, X, y):
    # --------------
    # Your Code Here
    # This function should return the mean absolute error
    # --------------
    y_real = X@theta
    N_mae = len(y)
    p = cp.norm(y - y_real)
    mae =((1/N_mae) * cp.sum(p)).value
    return mae

In [20]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, X_test, Y, Y_test = train_test_split(diabetes['data'], 
                                        np.expand_dims(diabetes['target'], 1), 
                                        test_size=0.5, random_state=0)

In [5]:
print(diabetes['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

## Question 2.2
Implement below the mean-absolute error regression with LASSO. Use $\lambda=0.5$. Hints: in the X matrix, rows represent data samples. Also, don't forget to add the `1` column to capture the intercept. (Use the `GLPK` solver)

In [21]:
Ones_X = np.ones((X.shape[0], 1))
X = np.hstack ((Ones_X,X))
Ones_X_test = np.ones((X_test.shape[0], 1))
X_test = np.hstack ((Ones_X_test,X_test))

lamda = 0.5
# Define the decision optimization variable
N = X.shape[0]
d = X.shape[1]
z = cp.Variable((N, 1))
k = cp.Variable((d, 1))
theta = cp.Variable(shape=(11,1),name='theta')
# Define an array containing all the constraints
constraints = [
    Y-X@theta <= z,
    -Y+X@theta <= z,
    theta <= k,
    -theta <= k,
    z >= 0
]
# Define the objective: to minimize (cp.Minimize)
objective  = cp.Minimize((1/N)*cp.sum(z) + lamda*cp.sum(k))           
# First combine the objective and constraints to formulate the problem using cp.Problem
# Use .solve() on the problem to solve the problem
problem = cp.Problem(objective, constraints)

print('\n')
print(color.BOLD+'With GLPK solution of LP ='+color.END,problem.solve(solver=cp.GLPK))
print('\n')



[1mWith GLPK solution of LP =[0m 120.48642533936652




In [22]:
print(color.BOLD + 'Training Results' + color.END)
print('MAE: {}'.format(get_MAE(theta, X, Y)))
print('\n')
print(color.BOLD + 'Test Results' + color.END)
print('MAE: {}'.format(get_MAE(theta, X_test, Y_test)))

[1mTraining Results[0m
MAE: 7.126487104577589


[1mTest Results[0m
MAE: 6.9763102871841065


## Question 2.3
Implement Cross-Validation for your MAE LASSO regression. You may recycle any functions used above. Hint: Use the `sklearn` function `train_test_split`, which can be used to randomly split the data.

In [41]:
# --------------
# Your Code Here
# --------------
def MAE_Lasso_solver(X,Y,X_test,Y_test,l):
    
    # Define the decision optimization variable
    N = X.shape[0]
    d = X.shape[1]
    z = cp.Variable((N, 1))
    k = cp.Variable((d, 1))
    theta = cp.Variable(shape=(d,1),name='theta')
    # Define an array containing all the constraints
    constraints = [
        Y-X@theta <= z,
        -Y+X@theta <= z,
        theta <= k,
        -theta <= k,
        z >= 0
    ]
    # Define the objective: to minimize (cp.Minimize)
    objective  = cp.Minimize((1/N)*cp.sum(z) + l*cp.sum(k))           
    # First combine the objective and constraints to formulate the problem using cp.Problem
    # Use .solve() on the problem to solve the problem
    problem = cp.Problem(objective, constraints)
    problem.solve(solver=cp.GLPK)

    return theta

diabetes = load_diabetes()
X, X_test, Y, Y_test = train_test_split(diabetes['data'], 
                                        np.expand_dims(diabetes['target'], 1), 
                                        test_size=0.25, random_state=42)

Ones_X = np.ones((X.shape[0], 1))
X = np.hstack ((Ones_X,X))
Ones_X_test = np.ones((X_test.shape[0], 1))
X_test = np.hstack ((Ones_X_test,X_test))

train, test, thetas = [], [], []

# Hyperparameter:
lamda = np.logspace(-5, -1, 50, base = 10)

# Cross-validation:
for l in lamda:
    theta = MAE_Lasso_solver(X, Y, X_test, Y_test, l)
    thetas.append(theta)
    train.append(get_MAE(theta, X, Y))
    test.append(get_MAE(theta, X_test, Y_test))
    
best_lamda = lamda[np.argmin(test)]
best_theta = thetas[np.argmin(test)]

print(color.BOLD + 'Best lamda' + color.END)
print('Lamda: {}'.format(best_lamda))

print(color.BOLD + 'Best theta' + color.END)
print('Theta: {}'.format(best_theta.value))

[1mBest lamda[0m
Lamda: 0.0033932217718953264
[1mBest theta[0m
Theta: [[ 148.27219259]
 [   0.        ]
 [-164.12936595]
 [ 457.06579556]
 [ 433.09645823]
 [   0.        ]
 [   0.        ]
 [-237.99709241]
 [   0.        ]
 [ 376.0943771 ]
 [   5.17444167]]


In [42]:
print(color.BOLD + 'Training Results' + color.END)
print('MAE: {}'.format(get_MAE(best_theta, X, Y)))
print('\n')
print(color.BOLD + 'Test Results' + color.END)
print('MAE: {}'.format(get_MAE(best_theta, X_test, Y_test)))

[1mTraining Results[0m
MAE: 3.0392929514777043


[1mTest Results[0m
MAE: 5.011492732640858
