### Assignment 3

1) Implement linear regression model for multiclass classification using pytorch.

2) Implement multinomial and one-vs-rest variants on multiclass classification.

3) Implement L2 relularization for your model.

4) Test your model on 20newsgroups dataset. Your baseline is accuracy=0.75.

5) How can we justify using accuracy score for this problem?

6) What is acuraccy score for random answer for this problem?

Follow #TODO in the code below.
Feel free to add additional regularizers to your model.
Remember, that SGD convergence is slower that lbfgs from scikit-learn. Manage your time.

Usefull links:

https://pytorch.org/

https://gluon.mxnet.io/chapter06_optimization/gd-sgd-scratch.html

(bonus) http://ruder.io/optimizing-gradient-descent/

In [2]:
import torch as  tt
from torch.optim import SGD
from torch import nn
import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import normalize
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
%matplotlib inline

SEED = 42
np.random.seed(SEED)

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# retrieve dataset
data = fetch_20newsgroups()


X = data['data']
y = data['target']
#TODO some feature engineering
# If you want to use some sparse feature vectors, pay attention to feature size.
# While your feature matrix can be sparse, weight tensor in the model is always dense.

In [4]:
tfidf = TfidfVectorizer(min_df=5, stop_words='english', ngram_range=(1,3), max_features=45000)
X = tfidf.fit_transform(X)
y = np.array(y)
X.shape, y.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


((11314, 45000), (11314,))

In [5]:
class LogisticRegressionNN(nn.Module):
    """
    All neural networks in pytorch are descendants of nn.Module class
    As you remember, Logistic regression is just a 1-layer neural network
    #TODO implement multinomial logistic regression
    """
    
    def __init__(self, d, k):
        """
        In the constructor we define model weights and layers
        d: feature size
        k: number of classes
        """
        super(LogisticRegressionNN, self).__init__()
        
        # TODO create tensor of weights and tensor of biases
        # initialize tensors from N(0,1) using np.random.rand
        # W has shape (d,k)
        # b has shape (k,)
        # set requires_grad=True for tensors, so they can be learned during training
        #self.W = tt.from_numpy(np.random.normal(0, 1, (d,k)))
        #self.b = tt.from_numpy(np.random.normal(0, 1, (k,)))
        #self.W.require_grad = True
        #self.b.require_grad = True
        self.W = tt.tensor(np.random.rand(d, k), dtype=tt.float32, requires_grad=True)
        self.b = tt.tensor(np.random.rand(k,), dtype=tt.float32, requires_grad=True)
        
        
    def forward(self, x):
        """
        In this method we implement connections between neural network weights
        x: batch feature matrix
        returns: probability logits
        """
        # TODO implement linear model without softmax
        result = tt.mm(x.double(), self.W.double()) + self.b.double()
        return result
    
    def parameters(self):
        """
        learnable model parameters
        """
        return [self.W, self.b]

In [6]:
class LogisticRegressionEstimator(BaseEstimator, ClassifierMixin):
    """
    Logistic Regression estimator coping interface from scikit-learn
    """
    def __init__(self, learning_rate, n_epochs, batch_size, alpha=1, multi_class='multinomial', verbose=False):
        """
        learning_rate: SGD learning rate
        n_epochs: number of epochs
        batch_size: size of mini-batch
        alpha:  regularizer coef
        multi_class: ['multinomial', 'ovr']
        verbose:
        """
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.alpha = alpha
        self.multi_class = multi_class
        self.verbose = verbose
        self.model_nn = None
        self.batch_size = batch_size
        
    def _train_nn(self, model, X, y):
        """
        Train neural network
        model: neural network module
        X: - feature matrix
        y: - target values
        """
        
        # criterion to minimize
        criterion = nn.CrossEntropyLoss()
        # optimization algorithm
        optimizer = tt.optim.SGD(model.parameters(), lr=self.learning_rate)

        #TODO calculate number of batches, round to the ceil
        n_batches = int(np.ceil(X.shape[0] / self.batch_size))

        if self.verbose:
            # nice progress bar
            t_epochs = tqdm_notebook(range(self.n_epochs), desc='epochs', leave=True)
        else:
            t_epochs = range(self.n_epochs)
        
        # iterate over epochs
        for epoch in t_epochs:

            # TODO make random permutation over indices, use np.random.choice
            indices = np.random.choice(X.shape[0], X.shape[0], replace=False) 

            epoch_average_loss = 0

            # iterate over mini-batches
            for j in range(n_batches):
                
                batch_idx = indices[j: j + self.batch_size]

                #batch_idx = indices[j*self.batch_size: j*self.batch_size + self.batch_size]

                # we have to wrap data into tensors before feed them to neural network
                #TODO: batch feature float tensor. use tt.from_numpy
                batch_x = tt.from_numpy(X[batch_idx].toarray())
                #batch_x.require_grad = True
                #TODO batch target long tensor. use tt.from_numpy
                #batch_y.require_grad = True|.astype(np.int64)
                batch_y = tt.from_numpy(y[batch_idx])
                #batch_y.require_grad = True

                # reset gradients for the new iteration
                optimizer.zero_grad()
                # get predictions
                pred = model.forward(batch_x)

                # cross-entropy loss
                loss = criterion(pred, batch_y.long())
                #TODO: add regularizer on weights
                #param = self.model_nn.parameters()
                #param = tt.cat((param[0], param[1].reshape(1,-1)), dim=0)
                #loss += self.alpha * np.linalg.norm(pred)
                loss += self.alpha/2 * tt.norm(model.W.double())**2
                

                # calculate gradients
                halp = tt.ones(1, dtype=tt.float64, requires_grad=True)*0.9
                
                loss = loss + halp
                loss.backward()
                # make optimization step
                optimizer.step()

                epoch_average_loss += loss.data.detach().item()

            # average loss for epoch
            epoch_average_loss /= n_batches
            if self.verbose:
                t_epochs.set_postfix(loss='%.3f' % epoch_average_loss)


    def fit(self, X, y):
        """
        X: feature matrix
        y: target values
        """
        
        n_features = X.shape[1]
        self.n_classes_ = len(np.unique(y))
        
        # binary classification
        if self.n_classes_ == 2:
            self.model_nn = LogisticRegressionNN(n_features, 2)
            self._train_nn(self.model_nn, X, y)
            
        else:
            
            if self.multi_class == 'multinomial':
                # TODO: multinomial classification
                self.model_nn = LogisticRegressionNN(n_features, self.n_classes_)
                self._train_nn(self.model_nn, X, y)
                
            # ovr classification
            elif self.multi_class == 'ovr':
                
                if self.verbose:
                    t_ovr = tqdm_notebook(range(self.n_classes_), desc='ovr')
                else:
                    t_ovr = range(self.n_classes_)
                
                # TODO: one-vs-rest classification
                for key, item in enumerate(t_ovr):
                    self.model_nn.append(LogisticRegressionNN(n_features, 2))
                    self._train_nn(self.model_nn, X, item)
                    
        return self
                    
    def predict_proba(self, X):
        
        if sparse.issparse(X):
            # create sparse tensor
            X = X.tocoo()
            ii = tt.LongTensor([X.row, X.col])
            X = tt.sparse.FloatTensor(ii, tt.from_numpy(X.data).float(), X.shape)
        else:
            # create dense tensor
            X = tt.from_numpy(X).float()
            
        
        if self.n_classes_ == 2:
            pred = self.model_nn.forward(X)
            pred = tt.softmax(pred, dim=-1)
            pred = pred.detach().numpy()
            return pred
            
        else:
            if self.multi_class == 'multinomial':
                # TODO return class probabilities
                pred = self.model_nn.forward(X)
                pred = tt.softmax(pred, dim=-1)
                pred = pred.detach().numpy()
                return pred
                return pred
                
            elif self.multi_class == 'ovr':
                # TODO return class probabilities
                # remember to normalize probabities from different binary classification models, so they sum up to 1.0
                result = []
                for m in self.models:
                    pred = m.forward(X)
                    pred = tt.sigmoid(pred)
                    result.append(pred.detach().numpy()[:, 1])
                pred = np.array(result).T
                pred = normalize(pred)
                    
                return pred
            
    def predict(self, X):
        proba = self.predict_proba(X)
        return proba.argmax(axis=1)

In [7]:
test_data = fetch_20newsgroups(subset='test')

X_test = tfidf.transform(test_data['data']).todense()
y_test = test_data['target']

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
%%time

est = LogisticRegressionEstimator(learning_rate = 1, n_epochs = 1500, batch_size = 1000, alpha = 0.001, multi_class = 'multinomial', verbose = False)
est.fit(X,y)


Wall time: 2h 8min 55s


In [9]:
print('acc', metrics.accuracy_score(y_test, est.predict(X_test)))

acc 0.7616834838024429


In [None]:
# your baseline
print('acc', metrics.accuracy_score(y_test, est.predict(X_test)))

acc 0.7541157727031333