In [3]:
%matplotlib inline

import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression 
from sklearn.svm import LinearSVC
from time import sleep
import attr
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod

# Default size for every figure drawn in this notebook, given as
# [width, height] (matplotlib interprets figure sizes in inches).
plt.rcParams['figure.figsize'] = [10, 8]

In [80]:
def plot_boundary(decision_function):
    """
    Scatter the labelled points and overlay the zero level set of a
    decision function.

    NOTE: this reads the notebook-level variable ``data`` (it is not a
    parameter). ``data`` must provide columns ``0`` and ``1`` (used as the
    x and y coordinates) and ``'label'`` (used for the colour) --
    presumably a pandas DataFrame; confirm against the cell that builds it.

    Parameters
    ----------
    decision_function : callable
        Called once with an array of shape (n_points, 2) holding the grid
        coordinates; its result is reshaped to the grid, so it must return
        one value per grid point.
    """
    # Fixed plotting window: x in [-2, 10], y in [-6, 6], using
    # np.linspace's default number of samples per axis.
    x_axis = np.linspace(-2, 10)
    y_axis = np.linspace(-6, 6)
    xx, yy = np.meshgrid(x_axis, y_axis)

    # Evaluate the decision function on every grid node at once.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = decision_function(grid_points).reshape(xx.shape)

    ax = sns.scatterplot(x=0, y=1, hue='label', data=data)
    # The boundary is where the decision function crosses zero.
    ax.contour(xx, yy, Z, levels=[0])

def add_intercept(X):
    """
    Prepend a column of ones to a 2-D design matrix.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data, without an intercept column.

    Returns
    -------
    array, shape (n_samples, n_features + 1)
        ``X`` with a leading column of ones, so the first weight of a
        linear model acts as the intercept.

    Raises
    ------
    ValueError
        If ``X`` is not 2-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(
            'X must be 2-dimensional, got shape {}'.format(X.shape))
    n_samples = X.shape[0]
    # Build the column with an explicit shape: reshape(N, -1) cannot infer
    # the second dimension when N == 0, which made empty inputs fail.
    ones_column = np.ones((n_samples, 1))
    return np.hstack((ones_column, X))

class LossFunction(ABC):
    """
    Abstract base class for per-sample loss functions.

    Concrete losses subclass this and implement both methods; the class
    itself cannot be instantiated.
    """

    @abstractmethod
    def loss(self, p, y):
        """
        Return the loss for a single sample.

        Parameters
        ----------
        p : float
            The model's raw score for the sample (SGD in this file passes
            the dot product of the sample and the weights).
        y : float
            The sample's label.
        """

    @abstractmethod
    def dloss(self, p, y):
        """
        Return the gradient term for a single sample.

        SGD in this file multiplies the returned value by the sample's
        feature vector and by the learning rate to form the weight update,
        so this is presumably the derivative of ``loss`` with respect to
        ``p``.

        Parameters
        ----------
        p : float
            The model's raw score for the sample.
        y : float
            The sample's label.
        """

@attr.s
class SGD():
    """
    Simple stochastic gradient descent for a linear model.

    An intercept column is added to the data internally, so the model has
    ``n_features + 1`` weights and the first one is the intercept.

    Parameters
    ----------
    loss : LossFunction
        An instance of a LossFunction subclass; ``loss.loss(p, y)`` and
        ``loss.dloss(p, y)`` are called once per sample.

    eta : float
        Learning rate for descent.

    w_init : array-like, shape (n_features + 1,), optional
        Initial weights, intercept first. Drawn from a standard normal
        distribution when omitted. Never modified in place.

    max_iter : int
        Maximum number of epochs to train.

    tol : float
        Training stops once the weights move by less than this (Euclidean
        norm) over a whole epoch.

    Attributes
    ----------
    weights : array, shape (n_features + 1,)
        The fitted weights of the model (set by ``fit``).

    losses : list of float, length n_epochs
        The summed per-sample loss of every epoch that was run (set by
        ``fit``).
    """

    loss = attr.ib()
    eta = attr.ib()
    w_init = attr.ib(default=None)
    max_iter = attr.ib(default=1000)
    tol = attr.ib(default=1e-9)

    def decision_function(self, X):
        """
        Return the raw linear score of every row of ``X``.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Your data, without intercept. ``fit`` must have been called
            first, since this reads ``self.weights``.

        Returns
        -------
        array, shape (n_samples,)
            ``[1, x] . weights`` for each sample ``x``.
        """
        X = add_intercept(X)
        return X.dot(self.weights)

    def fit(self, X, y):
        """
        Fit the weights by stochastic gradient descent.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Your data, without intercept.

        y : array-like, shape (n_samples,)
            Your labels, in {+1,-1}.

        Returns
        -------
        weights : array, shape (n_features + 1,)
            The fitted weights of the model, intercept first.

        Raises
        ------
        ValueError
            If ``y`` does not have one entry per row of ``X`` or ``w_init``
            does not have one entry per column of the intercept-augmented
            data.
        """
        X = add_intercept(X)
        N, P = X.shape

        # Positional indexing below must not depend on a pandas index.
        y = np.asarray(y)
        if y.shape[0] != N:
            raise ValueError(
                'X has {} samples but y has {} labels'.format(N, y.shape[0]))

        # init weights
        if self.w_init is None:
            w = np.random.normal(0, 1, P)
        else:
            # Float copy: the in-place updates need a float array and must
            # not leak into the caller's w_init.
            w = np.array(self.w_init, dtype=float)
            if w.shape != (P,):
                raise ValueError(
                    'w_init must have shape ({},) (intercept included), '
                    'got {}'.format(P, w.shape))

        self.losses = []
        # Defined up front so the summary below works when max_iter == 0.
        iters = 0
        tot_loss = 0

        for iters in range(1, self.max_iter + 1):
            tot_loss = 0
            w_new = w.copy()

            # Stochastic part! Visit the samples in a fresh random order.
            idx = np.arange(N)
            np.random.shuffle(idx)
            for i in idx:
                # Score with the *running* weights so every sample sees the
                # updates made by the samples before it.
                p = np.dot(X[i], w_new)
                tot_loss += self.loss.loss(p, y[i])
                # LossFunction.dloss takes (p, y) only.
                dloss = self.loss.dloss(p, y[i])
                w_new += X[i] * (-self.eta * dloss)

            # Record the epoch before deciding whether to stop, so the
            # final epoch's loss and weights are kept too.
            self.losses.append(tot_loss)
            step = np.linalg.norm(w_new - w)
            w = w_new

            # Check convergence
            if step < self.tol:
                break

        print('Finished after {} iterations. Finall loss = {}'.format(iters, tot_loss))
        self.weights = w
        return w

In [None]:
# Generate some data to play with! This is important to be able to do. 
# Generate a couple of classes, try with linearly separable and not linearly
# separable!

In [None]:
# Implement the different loss functions as classes that inherit
# from LossFunction. For example: Logistic, Hinge.

In [None]:
# Use the SGD classifier from above with the data and different loss
# functions. See how it works! 
# Play with max_iter, and look at the losses and convergence. 
# How do the loss functions affect things? 

In [None]:
# Try the other loss functions from the slides. 
# Does the square loss converge well? 

In [None]:
# Try LogisticRegression from sklearn with a Newton-type solver
# (e.g. solver='newton-cg'). Does it converge faster? That's second-order optimization!