# TME 3 - Régularisation L1

In [68]:
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import functools
import operator
import itertools

%matplotlib inline

cancer = datasets.load_breast_cancer()
X = cancer.data
print("shape of X:", X.shape)
y = cancer.target
print("shape of Y:", y.shape)
print("classes in Y: ", np.unique(y))
X, y = shuffle(X, y)

# Restricting data
X = X[:100,:10]
y = y[:100]


shape of X: (569, 30)
shape of Y: (569,)
classes in Y:  [0 1]


In [69]:
from scipy.optimize import approx_fprime, check_grad, minimize

def decision(theta, X):
    return np.sign(X.dot(theta)) / 2 + 0.5

def loss(theta, X, y, pen):
    mse = ((y - X.dot(theta))**2).mean()
    return mse + pen * sum([np.abs(t) for t in theta])

def gradient_loss(theta, X, y, pen):
    gradient = np.array(2 * X.T .dot(X.dot(theta) - y)) / X.shape[0] + pen * np.sign(theta)
    return gradient

print("Checking gradient:")
for i in range(3):
    print("\n### Iteration %d ###" % i)
    theta = np.random.random(X[0].shape) - 0.5
    pen = np.random.random() / 10
    func = lambda t:loss(t, X, y, pen)
    grad = lambda t:gradient_loss(t, X, y, pen)
    print("approx_fprime:", approx_fprime(theta, func, epsilon=1))
    print("gradient_loss:", grad(theta))
    print("check:", check_grad(func, grad, theta))


def gradient_l1(theta, X, y, max_iter=100, step=1e-1, pen=0.1, verbose=0, plot=False):
    """ Performs gradient clipping to find the best theta
    :param:
    
    :return:"""
    
    (l, n) = X.shape
    losses = []
    for it in range(max_iter):
        if verbose >= 1:
            print("###########  Step", it, ": #################")
        for i in range(l):
            idx = np.random.randint(0, l)
            if verbose >= 2:
                print("========= i = %d, idx = %d ===========" % (i, idx))
            local_loss = lambda t: loss(t, np.array([X[idx]]), np.array([y[idx]]), pen)
#             grad = approx_fprime(theta, local_loss, epsilon=step)
            grad = gradient_loss(theta, X, y, pen)
            theta_prime = theta - step * grad
            
            if verbose >= 2:
#                 print("Random sample:", X[idx])
                print("Gradient:     ",grad)
                print("Theta before: ", theta)
            theta_prime[theta * theta_prime < 0] = 0
            theta = theta_prime
            if verbose >= 2:
                print("Theta after:  ", theta)
                
        general_loss = loss(theta, X, y, pen)
        losses.append(general_loss)
        if verbose >= 1:
            print("L = %f" % (general_loss))
    
    if plot:
        plt.plot(losses)
        plt.title("Evolution of loss")
        plt.xlabel("Iterations")
        plt.ylabel("Loss")
    return theta

Checking gradient:

### Iteration 0 ###
approx_fprime: [  2.88e+03   3.72e+03   2.63e+04   6.51e+05   1.64e+01   2.03e+01
   1.99e+01   1.12e+01   3.07e+01   1.05e+01]
gradient_loss: [  2.68e+03   3.34e+03   1.76e+04   1.40e+05   1.63e+01   2.03e+01
   1.98e+01   1.12e+01   3.07e+01   1.05e+01]
check: 0.00765599646048

### Iteration 1 ###
approx_fprime: [  2.10e+03   2.80e+03   2.12e+04   6.08e+05   1.19e+01   1.44e+01
   1.39e+01   7.76e+00   2.22e+01   7.60e+00]
gradient_loss: [  1.90e+03   2.42e+03   1.24e+04   9.74e+04   1.19e+01   1.44e+01
   1.39e+01   7.75e+00   2.21e+01   7.60e+00]
check: 0.00764845078973

### Iteration 2 ###
approx_fprime: [ -2.81e+03  -3.40e+03  -1.10e+04   3.54e+05  -1.86e+01  -2.26e+01
  -2.19e+01  -1.23e+01  -3.48e+01  -1.19e+01]
gradient_loss: [ -3.02e+03  -3.79e+03  -1.98e+04  -1.56e+05  -1.87e+01  -2.27e+01
  -2.19e+01  -1.24e+01  -3.49e+01  -1.19e+01]
check: 0.00755295909611


In [71]:
# Optimization parameters
theta_init = np.random.random(X[0].shape)
max_iter = 200
step     = 1e-3
pen      = 1e-3

# Find theta 
np.set_printoptions(precision=2, threshold=10)
theta_minimize = minimize(lambda t:loss(t, X, y, pen), theta_init).x
print("theta with minimize():            ", theta_minimize)

params = {"step": np.logspace(-6, 1, 10),
          "pen" : np.logspace(-6, 1, 10)}
combinations = functools.reduce(operator.mul, [len(elt) for elt in params.values()])
best_params = {"step": params["step"][0],  "pen": params["pen"][0]}
best_theta = gradient_l1(theta_init, X, y, max_iter=max_iter, step=best_params["step"], pen=best_params["pen"])
print("Searching best parameters")
for i, (it_step, it_pen) in enumerate(itertools.product(params["step"], params["pen"])):
    theta = gradient_l1(theta_init, X, y, max_iter=max_iter, step=it_step, pen=it_pen)
    print("%d/%d" % (i, combinations) )
    if np.linalg.norm(theta - theta_minimize) < np.linalg.norm(best_theta - theta_minimize):
        best_theta = theta
        best_params = {"step": it_step, "pen": it_pen}
        



theta with minimize():             [  1.51e-01  -1.02e-02  -7.44e-03  -8.91e-04  -2.68e-07  -7.21e-08
  -2.11e+00  -6.08e+00  -5.29e-10   6.93e+00]
Searching best parameters
0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100




10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100


In [72]:
print("Done")
print(best_params)
print(best_theta)

Done
{'pen': 9.9999999999999995e-07, 'step': 0.00021544346900318845}
[  3.42e-03   4.96e-03   2.20e-02   1.33e-01   2.52e-05   2.15e-05
   1.24e-05   7.11e-06   4.77e-05   1.71e-05]


In [46]:
from sklearn.base import BaseEstimator

class LinearClassifier(BaseEstimator):
    """ Custom linear classifier """
    
    def __init__(self, max_iter=1000, step=1e-2, pen=1e-2, verbose=0):
        self.theta= 0
        self.max_iter = max_iter
        self.step = step
        self.pen = pen
        self.verbose = verbose
    
    def fit(self, X, y):
        self.theta = gradient_l1(np.ones_like(X[0]), X, y, 
                                 max_iter=self.max_iter, 
                                 step=self.step, 
                                 pen=self.pen,
                                 verbose=self.verbose)
        
    def predict(self, X):
        return decision(theta, X)

    def get_theta(self):
        return self.theta

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold



kf = KFold(n_splits=4)


for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]
    clf = LinearClassifier(max_iter=max_iter, step=step, pen=pen, verbose=0)
    clf.fit(X_train, Y_train)
    print("Theta found:", clf.get_theta())
    pred = clf.predict(X_test)
    
    print("KFold:")
    print("Y_Pred: ", pred)
    print("Y_reel: ", Y_test)
    print("Score: ", accuracy_score(pred, Y_test))
    print("Errors: ", Y_test[pred != Y_test])




Theta found: [-0.01 -0.01 -0.01 ..., -0.   -0.01 -0.01]
KFold:
Y_Pred:  [ 1.  1.  1. ...,  1.  1.  1.]
Y_reel:  [0 0 1 ..., 1 1 0]
Score:  0.56
Errors:  [0 0 0 ..., 0 0 0]
Theta found: [-0.01 -0.01 -0.01 ..., -0.01 -0.01 -0.01]
KFold:
Y_Pred:  [ 1.  1.  1. ...,  1.  1.  1.]
Y_reel:  [0 0 1 ..., 0 0 1]
Score:  0.44
Errors:  [0 0 0 ..., 0 0 0]
Theta found: [-0.01 -0.01 -0.01 ..., -0.01 -0.01 -0.01]
KFold:
Y_Pred:  [ 1.  1.  1. ...,  1.  1.  1.]
Y_reel:  [1 1 0 ..., 1 0 1]
Score:  0.64
Errors:  [0 0 0 ..., 0 0 0]
Theta found: [-0.01 -0.01 -0.01 ..., -0.01 -0.01 -0.01]
KFold:
Y_Pred:  [ 1.  1.  1. ...,  1.  1.  1.]
Y_reel:  [1 0 1 ..., 0 1 0]
Score:  0.6
Errors:  [0 0 0 ..., 0 0 0]
