In [1]:
%matplotlib inline

In [2]:
# modified version of notebook by Robert Guthrie
# https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html#sphx-glr-beginner-nlp-deep-learning-tutorial-py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle

torch.manual_seed(1)

<torch._C.Generator at 0x20f797f2110>

Basic way of creating a neural network in Pytorch : inheriting from the nn.Module
===

The steps are:
- define a class inheriting from the nn.Module
- define a forward method
- you don't have to define the backpropagation method !

Creating a feed-forward network in PyTorch
======================================

Let's build a MLP in PyTorch, with a single hidden layer with a non-linear activation function, and softmax to get probabilities of classes.
We'll then use the negative log likelihood loss, and stochastic gradient descent to learn the parameters.

Because log-likelihood supposes to take the log of probabilities, in our network we will directly output log-softmax instead of softmax.






## Defining the network

In pytorch, all network components should inherit from nn.Module and override the
forward() method. 

Inheriting from nn.Module provides functionality to your
component, like backpropagation and switching from CPU to GPU computing.


In [3]:
class MyMLP(nn.Module):  # inheriting from nn.Module

    def __init__(self, nb_classes, d, hidden_layer_size, batch_size=1):
        """ - d is the size of the vectors representing the objects to classify 
            - hidden_layer_size is ... the size of the hidden layer
        """
        # calls the init function of the superclass, i.e. nn.Module
        super(MyMLP, self).__init__()
        self.batch_size = batch_size
        # The parameters of the network that must be learnt
        # are inferred thanks to pytorch machinary, 
        # from what is declared in the constructor 

        # linear layer Wx + b
        # => implicitely contains parameters W and b
        # cf. https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        # which are randomly initialized by default
        self.linear_1 = nn.Linear(d, hidden_layer_size)
        
        # another pair of W and b parameters
        self.linear_2 = nn.Linear(hidden_layer_size, nb_classes)

        # NB: the non-linear activation functions do not have any parameter
        # so we don't need to declare it in the __init__ method
        # they will be used directly in the forward method
        
    def forward(self, X):
        """
        Input : X batch of inputs : ( shape (b, d) )
                with b is the size of the batch (it does not have to be constant)
        Output : log probabilities (shape (b, nb_classes) )
        """
        # linear combination from input to hidden layer
        out = self.linear_1(X) # out shape (b, hidden_layer_size)
        
        # activation function at hidden layer
        out = torch.relu(out) # same shape
        
        # linear combination from hidden to output layer
        out = self.linear_2(out) # out shape (b, nb_classes)
        
        # Transformation into log-probabilities
        #  Note : Since we're going to use the NLL loss, 
        #         we will need to apply log to softmax anyway
        #         we do it here because directly computing log of softmax is numerically more stable
        # CAUTION: the dim=1 is crucial here to tell pytorch on which axis apply the softmax
        return F.log_softmax(out, dim=1) # out shape (n, nb_classes)
    
# examples
my_classifier = MyMLP(nb_classes=3, d=20, hidden_layer_size=10, batch_size=2)

# nn.Module and any subclass (e.g. nn.Linear, BoWClassifier)
# records what are the parameters
# depending on what has been declared in the __init__ method.

# For nn.Linear, the first param is matrix W, the second is bias b.
# Through some Python magic from the PyTorch devs, your module
# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters

# and the parameters have requires_grad set to True of course
for name, param in my_classifier.named_parameters():
    print("PARAM named %s, of shape %s" % (name, str(param.shape)))
    print(param)
    




PARAM named linear_1.weight, of shape torch.Size([10, 20])
Parameter containing:
tensor([[ 0.1152, -0.0987, -0.0433,  0.1050, -0.2105,  0.1341, -0.0460,  0.1138,
          0.0311, -0.0274,  0.0620,  0.0110,  0.0817, -0.0871, -0.0163, -0.0201,
          0.0324, -0.0009,  0.1955,  0.0696],
        [-0.0833, -0.1350, -0.0375, -0.0965, -0.0717,  0.0107,  0.1333,  0.1215,
         -0.2186,  0.1386,  0.0625,  0.2121,  0.1476, -0.2037, -0.2126, -0.1079,
          0.1964, -0.0372,  0.0957, -0.1039],
        [ 0.2194, -0.0946,  0.1677,  0.0026, -0.1178,  0.1149, -0.1187,  0.0658,
         -0.0646, -0.0245, -0.2150, -0.1066,  0.1213, -0.0544,  0.2227,  0.1792,
         -0.0105, -0.1493,  0.1362,  0.0694],
        [-0.1445,  0.1452,  0.1358,  0.1983, -0.1253, -0.0368, -0.0043,  0.0327,
         -0.1697, -0.1587,  0.1216, -0.0524,  0.1092,  0.0127,  0.0734,  0.0492,
          0.0813,  0.1108, -0.2071,  0.1126],
        [-0.1572, -0.1687,  0.0136, -0.0381,  0.1313, -0.1295, -0.1988,  0.1627,
      

## Task

The task we consider is **language identification**,
and we are using a toy data set of 4 training examples, and 2 test examples (below).

TODO
==
After you've programmed all the TODOs below,
switch to the ep-08-04* files, which contain 500 sentences in French and 500 sentences in English.

In [4]:
train_examples = [("me gusta comer en la cafeteria".split(), "SPANISH"),
                  ("Give it to me".split(), "ENGLISH"),
                  ("No creo que sea una buena idea".split(), "SPANISH"),
                  ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_examples = [("Yo creo que si".split(), "SPANISH"),
                 ("it is lost on me".split(), "ENGLISH")]




## Encoding data : creating indices and converting data to tensors

In [5]:
# word to index
# and label to index correspondances
w2i = {}
i2w = []
label2i = {}
i2label = []

# TODO:
for example in train_examples:
  for word in example[0]: # example[0] is the sentence
    if word not in w2i:
      w2i[word] = len(w2i)
      i2w.append(word)
  label = example[1] # example[1] is the label
  if label not in label2i:
    label2i[label] = len(label2i)
    i2label.append(label)
    
# fill in the w2i, i2w, label2i and i2label
# using the training examples only
# the words in test that are unknown in train will be ignored

# instead YOU CAN USE the CountVectorizer of sklearn 


VOCAB_SIZE = len(w2i)
NB_CLASSES = len(label2i)

print("VOCAB SIZE:", VOCAB_SIZE)
print("NB CLASSES:", NB_CLASSES)
print("w2i:", w2i)
print("i2w:", i2w)
print("label2i:", label2i)


def convert_examples_to_tensors(examples, w2i, label2i):
    """ Input = 
          - list of n examples
             -- each example is a pair [sentence, class label]
             -- a sentence being a list of tokens
          - dictionary of words to indices
          - dictionary of class labels to indices
        Output = 
          - X = BOW vectors for the n examples:
              = pytorch tensor of shape ( n , vocabulary size )
                X[i, j] = number of occ of word j in sentence i
          - Y = tensor of shape  ( n ) for indices of gold labels
    """
    # separating input sentences and gold labels
    (sentences, gold_labels) = list(zip(*examples))
    
    n = len(examples)
    
    vectorizer = CountVectorizer(vocabulary=w2i) # Initialize vectorizer with vocabulary
    X_vec = vectorizer.transform([ " ".join(sentence) for sentence in sentences ]) # Convert sentences to BOW vectors
    X = torch.from_numpy(X_vec.toarray()).float() # Convert BOW vectors to tensor of shape (n, VOCAB_SIZE) and of type float
    Y = torch.LongTensor([ label2i[gold_label] for gold_label in gold_labels ]) # Convert gold labels to tensor of indices of shape (n)

    return X, Y


X_test = None
X_train = None
Y_test = None
Y_train = None

X_train, Y_train = convert_examples_to_tensors(train_examples, w2i, label2i)
X_test, Y_test = convert_examples_to_tensors(test_examples, w2i, label2i)

print("X_train", X_train.size())
print("Y_train", Y_train.size())
print("X_test", X_test.size())
print("Y_test", Y_test.size())
print(X_train[0])


VOCAB SIZE: 23
NB CLASSES: 2
w2i: {'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22}
i2w: ['me', 'gusta', 'comer', 'en', 'la', 'cafeteria', 'Give', 'it', 'to', 'No', 'creo', 'que', 'sea', 'una', 'buena', 'idea', 'is', 'not', 'a', 'good', 'get', 'lost', 'at']
label2i: {'SPANISH': 0, 'ENGLISH': 1}
X_train torch.Size([4, 23])
Y_train torch.Size([4])
X_test torch.Size([2, 23])
Y_test torch.Size([2])
tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.])


In [6]:
my_classifier = MyMLP(nb_classes=NB_CLASSES, d=VOCAB_SIZE, hidden_layer_size=10)

## Test of forward propagation (with random parameters)

In [7]:
# Test of forward propagation with the randomly initialized parameters:

with torch.no_grad():
    # X_test : shape [n , vocab_size]
    # NB: to run the forward method, use the name of the MyMLM instance
    log_probs = my_classifier(X_test)  # shape [n , num_labels]
    print("Log probabilities on test before training:\n", log_probs)

    # prediction : argmax of the log_probabilities
    # (make sure to control the axis on which the argmax is computed)
    pred_labels = torch.argmax(log_probs, dim=1) # shape n
    print("PREDICTED LABELS", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS", [ i2label[l] for l in Y_test ])


Log probabilities on test before training:
 tensor([[-0.6232, -0.7683],
        [-0.5101, -0.9174]])
PREDICTED LABELS ['SPANISH', 'SPANISH']
     GOLD LABELS ['SPANISH', 'ENGLISH']


## Training

So lets train! To do this, we pass instances through forward propagation
to get log probabilities, compute a loss function, compute the gradient of the loss
function, and then update the parameters with a gradient step. 

Loss functions are provided by Torch in the nn package. nn.NLLLoss() is the
negative log likelihood loss we want. 

Optimization functions are in torch.optim. Here, we will just use SGD.


In [8]:


# --------- the loss -------------
# negative log likelihood loss
# TODO: check its input and output https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html
loss_function = nn.NLLLoss() 

# --------- the optimizer --------
# simplest one: stochastic gradient descent
# we declare the parameters we wish to optimize  
# => here we want to optimize all the parameters of our BoWClassifier instance
optimizer = optim.SGD(my_classifier.parameters(), lr=0.1)

print("\nTraining on:")
for x in train_examples:
    print(x)
print("\n")
print(label2i)

# loop on epochs
for epoch in range(10):
    print("Epoch", epoch)
    shuffle(train_examples) # NB: here original order is lost

    # NB: here if we had more examples,
    # we should loop on mini batches (in random order)
    # since we only have 4 training examples
    # on each epoch we use a full batch of 4 training examples
    
    # Step 1:
    # get tensors for batch of examples (here : one batch = all the training data)
    X, Y = convert_examples_to_tensors(train_examples, w2i, label2i)


    # Step 2: 
    # (re)sets all parameter gradients to 0
    #  before using each batch of inputs
    my_classifier.zero_grad()
    
    # Step 3: forward propagation
    #         NB: my_classifier(X) implicitely calls my_classifier.forward(X)
    log_probs = my_classifier(X)
    
    print("LOG PROBS on train at epoch %i:\n" %epoch, log_probs)

    # optional: accuracy on this batch
    pred_labels = torch.argmax(log_probs, dim=1)
    print("PREDICTED LABELS ON BATCH", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS ON BATCH", [ i2label[l] for l in Y ])
    
    # Step 4: Compute the loss (NB: this is the loss for the full batch of inputs X)
    #         The input to the loss function is for each example in X,
    #          the log_probabilities for each class, and the gold label
    loss = loss_function(log_probs, Y) 

    
    # Step 5: Compute the gradients
    loss.backward() # partial derivatives of loss with respect to
                    # all the tensors that - were used to compute loss,
                    #                      - and have requires_grad=True
                    # after this call, all the parameters have their .grad attribute
                    # filled with the partial derivative

    # Step 6: Update the parameters
    #         NB: the optimizer instance knows what are the parameters to update
    optimizer.step()

# prediction after training:
print("\nPrediction on test, after training:")
# NB: when we are not training, we don't have to compute gradients
# => to be more efficient, we use torch.no_grad()
with torch.no_grad():
    log_probs = my_classifier(X_test)  
    print("LOG PROBS on test:\n", log_probs)
    
    pred_labels = torch.argmax(log_probs, dim=1)
    print("PREDICTED LABELS", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS", [ i2label[l] for l in Y_test ])




Training on:
(['me', 'gusta', 'comer', 'en', 'la', 'cafeteria'], 'SPANISH')
(['Give', 'it', 'to', 'me'], 'ENGLISH')
(['No', 'creo', 'que', 'sea', 'una', 'buena', 'idea'], 'SPANISH')
(['No', 'it', 'is', 'not', 'a', 'good', 'idea', 'to', 'get', 'lost', 'at', 'sea'], 'ENGLISH')


{'SPANISH': 0, 'ENGLISH': 1}
Epoch 0
LOG PROBS on train at epoch 0:
 tensor([[-0.5860, -0.8131],
        [-0.4698, -0.9812],
        [-0.6020, -0.7934],
        [-0.5899, -0.8083]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED LABELS ON BATCH ['SPANISH', 'SPANISH', 'SPANISH', 'SPANISH']
     GOLD LABELS ON BATCH ['SPANISH', 'ENGLISH', 'ENGLISH', 'SPANISH']
Epoch 1
LOG PROBS on train at epoch 1:
 tensor([[-0.6379, -0.7516],
        [-0.5843, -0.8153],
        [-0.5779, -0.8234],
        [-0.5283, -0.8906]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED LABELS ON BATCH ['SPANISH', 'SPANISH', 'SPANISH', 'SPANISH']
     GOLD LABELS ON BATCH ['ENGLISH', 'SPANISH', 'SPANISH', 'ENGLISH']
Epoch 2
LOG PROBS on train at epoch 2:
 

TODO
==

- implement the inner loop on batches (size of batch = hyperparameter)
- implement early stopping

In [9]:
# Define a batch function:
def batchify(examples, batch_size):
    for i in range(0, len(examples), batch_size):
        yield examples[i:i+batch_size]

# Test the batch function:
batches = batchify(train_examples, 2)
for batch in batches:
    X, Y = convert_examples_to_tensors(batch, w2i, label2i)
    print(X.size(), Y.size())

torch.Size([2, 23]) torch.Size([2])
torch.Size([2, 23]) torch.Size([2])


### Implement the batch size (inner loop) and the early stopping mechanism (step 7):

In [10]:

my_classifier = MyMLP(nb_classes=NB_CLASSES, d=VOCAB_SIZE, hidden_layer_size=10, batch_size=1)
# --------- the loss -------------
# negative log likelihood loss
# TODO: check its input and output https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html
loss_function = nn.NLLLoss() 

# --------- the optimizer --------
# simplest one: stochastic gradient descent
# we declare the parameters we wish to optimize  
# => here we want to optimize all the parameters of our BoWClassifier instance
optimizer = optim.SGD(my_classifier.parameters(), lr=0.1)

print("\nTraining on:")
for x in train_examples:
    print(x)
print("\n")
print(label2i)
    
# loop on epochs
best_loss = float('inf') # Initialize best loss to infinity
best_epoch = 0 # Initialize best epoch to 0

for epoch in range(10):
    print("Epoch", epoch)
    shuffle(train_examples) # NB: here original order is lost
    # loop on batches
    batches = batchify(train_examples, batch_size=my_classifier.batch_size)
    for nb_batch, batch  in enumerate(batches):
        
    # NB: here if we had more examples,
    # we should loop on mini batches (in random order)
    # since we only have 4 training examples
    # on each epoch we use a full batch of 4 training examples
    
    # Step 1:
    # get tensors for batch of examples (here : one batch = all the training data)
        X, Y = convert_examples_to_tensors(batch, w2i, label2i)

    # Step 2: 
    # (re)sets all parameter gradients to 0
    #  before using each batch of inputs
        my_classifier.zero_grad()
    # Step 3: forward propagation
    #         NB: my_classifier(X) implicitely calls my_classifier.forward(X)
        log_probs = my_classifier(X)
        print("LOG PROBS on train at batch %i:\n" %nb_batch, log_probs)
    # optional: accuracy on this batch
        pred_labels = torch.argmax(log_probs, dim=1)
        accuracy = (pred_labels == Y).sum().item() / len(Y)
        print("PREDICTED LABELS ON BATCH", [ i2label[l] for l in pred_labels ])
        print("     GOLD LABELS ON BATCH", [ i2label[l] for l in Y ])
        print(f"Accuracy on batch: {accuracy:.2%}")
    
    # Step 4: Compute the loss (NB: this is the loss for the full batch of inputs X)
    #         The input to the loss function is for each example in X,
    #          the log_probabilities for each class, and the gold label
        loss = loss_function(log_probs, Y) 

    
    # Step 5: Compute the gradients
        loss.backward() # partial derivatives of loss with respect to
                    # all the tensors that - were used to compute loss,
                    #                      - and have requires_grad=True
                    # after this call, all the parameters have their .grad attribute
                    # filled with the partial derivative

    # Step 6: Update the parameters
    #         NB: the optimizer instance knows what are the parameters to update
        optimizer.step()
        
    # Step 7: Implement early stopping:
    with torch.no_grad():
        log_probs = my_classifier(X_test) # Compute log probabilities on dev set
        loss = loss_function(log_probs, Y_test) # Compute loss on dev set
        dev_loss += loss.item()
        print(f"Best loss: {best_loss:.4}")
        print(f"Dev loss: {dev_loss:.4} at epoch {epoch}, batch {nb_batch}")
            
    if dev_loss < best_loss: # If the loss on the dev set is lower than the best loss
        best_loss = dev_loss # Update the best loss
        best_epoch = epoch # Update the best epoch
    else: # If the loss on the dev set is higher than the best loss
        print("Performance decreases, stopping training")
        print(f"Best epoch: {best_epoch}, best loss: {best_loss:.4}")
        break # Stop training       

        

# prediction after training:
print("\nPrediction on test, after training:")
# NB: when we are not training, we don't have to compute gradients
# => to be more efficient, we use torch.no_grad()
with torch.no_grad():
    log_probs = my_classifier(X_test)  
    print("LOG PROBS on test:\n", log_probs)
    
    pred_labels = torch.argmax(log_probs, dim=1)
    accuracy = (pred_labels == Y_test).sum().item() / len(Y_test)
    print("PREDICTED LABELS", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS", [ i2label[l] for l in Y_test ])
    print(f"Accuracy on test set: {accuracy:.2%}")




Training on:
(['No', 'creo', 'que', 'sea', 'una', 'buena', 'idea'], 'SPANISH')
(['me', 'gusta', 'comer', 'en', 'la', 'cafeteria'], 'SPANISH')
(['No', 'it', 'is', 'not', 'a', 'good', 'idea', 'to', 'get', 'lost', 'at', 'sea'], 'ENGLISH')
(['Give', 'it', 'to', 'me'], 'ENGLISH')


{'SPANISH': 0, 'ENGLISH': 1}
Epoch 0
LOG PROBS on train at batch 0:
 tensor([[-0.5083, -0.9201]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED LABELS ON BATCH ['SPANISH']
     GOLD LABELS ON BATCH ['ENGLISH']
Accuracy on batch: 0.00%
LOG PROBS on train at batch 1:
 tensor([[-0.6386, -0.7508]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED LABELS ON BATCH ['SPANISH']
     GOLD LABELS ON BATCH ['ENGLISH']
Accuracy on batch: 0.00%
LOG PROBS on train at batch 2:
 tensor([[-0.6010, -0.7947]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED LABELS ON BATCH ['SPANISH']
     GOLD LABELS ON BATCH ['SPANISH']
Accuracy on batch: 100.00%
LOG PROBS on train at batch 3:
 tensor([[-0.6208, -0.7711]], grad_fn=<LogSoftmaxBackward0>)
PREDICTED L

NameError: name 'dev_loss' is not defined

## Apply the architecture on the EP dataset:
### Read the data:

In [None]:
# Apply the architecture on the EP dataset:
# Define a reader for the EP EN and FR dataset:
# Because we know a priori that the EN dataset has all sentences in English,
# we can directly use ENGLISH as a gold label
def read_examples_en(filename):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                sentence = line.split()
                example = (sentence, "ENGLISH")
                examples.append(example)
    return examples

# We do the same for the FR dataset, but we use the gold label "FRENCH"
def read_examples_fr(filename):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                sentence = line.split()
                example = (sentence, "FRENCH")
                examples.append(example)
    return examples

# Read the data:
en_train_examples = read_examples_en("ep-en.tok.train")
fr_train_examples = read_examples_fr("ep-fr.tok.train")
en_test_examples = read_examples_en("ep-en.tok.test")
fr_test_examples = read_examples_fr("ep-fr.tok.test")
en_dev_examples = read_examples_en("ep-en.tok.dev")
fr_dev_examples = read_examples_fr("ep-fr.tok.dev")
print(en_train_examples[0])
print(fr_train_examples[0])
# Merge the data:
train_examples = en_train_examples + fr_train_examples
test_examples = en_test_examples + fr_test_examples
dev_examples = en_dev_examples + fr_dev_examples
print("Number of training examples:", len(train_examples))
print("Number of test examples:", len(test_examples))
print("Number of dev examples:", len(dev_examples))

(['Resumption', 'of', 'the', 'session'], 'ENGLISH')
(['Reprise', 'de', 'la', 'session'], 'FRENCH')
Number of training examples: 1000
Number of test examples: 200
Number of dev examples: 200


### Build the vocabulary:

In [None]:
# Build the vocabulary:
def build_vocab(train_examples):
  
  # word to index
  # and label to index correspondances
  w2i = {}
  i2w = []
  label2i = {}
  i2label = []

  for example in train_examples:
    for word in example[0]: # example[0] is the sentence
      if word not in w2i:
        w2i[word] = len(w2i)
        i2w.append(word)
    label = example[1] # example[1] is the label
    if label not in label2i:
      label2i[label] = len(label2i)
      i2label.append(label)
    
  # fill in the w2i, i2w, label2i and i2label
  # using the training examples only
  # the words in test that are unknown in train will be ignored

  VOCAB_SIZE = len(w2i)
  NB_CLASSES = len(label2i)
  return w2i, i2w, label2i, i2label, VOCAB_SIZE, NB_CLASSES

# Build the vocabulary:
w2i, i2w, label2i, i2label, VOCAB_SIZE, NB_CLASSES = build_vocab(train_examples)
print("VOCAB SIZE:", VOCAB_SIZE)
print("NB CLASSES:", NB_CLASSES)

# Convert examples to tensors:
# We reuse the code from the previous part:
X_train, Y_train = convert_examples_to_tensors(train_examples, w2i, label2i)
shuffle(test_examples)
shuffle(dev_examples)
X_test, Y_test = convert_examples_to_tensors(test_examples, w2i, label2i)
X_dev, Y_dev = convert_examples_to_tensors(dev_examples, w2i, label2i)
print("X_train.shape:", X_train.shape)
print("Y_train.shape:", Y_train.shape)
print("X_test.shape:", X_test.shape)
print("Y_test.shape:", Y_test.shape)
print("X_dev.shape:", X_dev.shape)
print("Y_dev.shape:", Y_dev.shape)

VOCAB SIZE: 4541
NB CLASSES: 2
X_train.shape: torch.Size([1000, 4541])
Y_train.shape: torch.Size([1000])
X_test.shape: torch.Size([200, 4541])
Y_test.shape: torch.Size([200])
X_dev.shape: torch.Size([200, 4541])
Y_dev.shape: torch.Size([200])


### Apply the architecture on the EP dataset:

In [None]:
# Initialize the model:
my_classifier = MyMLP(nb_classes=NB_CLASSES, d=VOCAB_SIZE, hidden_layer_size=10, batch_size=42)
# --------- the loss -------------
# negative log likelihood loss
# TODO: check its input and output https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html
loss_function = nn.NLLLoss() 

# --------- the optimizer --------
# simplest one: stochastic gradient descent
# we declare the parameters we wish to optimize  
# => here we want to optimize all the parameters of our BoWClassifier instance
optimizer = optim.SGD(my_classifier.parameters(), lr=0.1)

print("\nTraining on the EP dataset:")

    
# loop on epochs
best_loss = float('inf')
best_epoch = 0
for epoch in range(100):
    print("Epoch", epoch)
    shuffle(train_examples) # NB: here original order is lost
    # loop on batches
    batches = batchify(train_examples, batch_size=my_classifier.batch_size)
    for nb_batch, batch  in enumerate(batches):
    # NB: here if we had more examples,
    # we should loop on mini batches (in random order)
    # since we only have 4 training examples
    # on each epoch we use a full batch of 4 training examples
    
    # Step 1:
    # get tensors for batch of examples (here : one batch = all the training data)
        X, Y = convert_examples_to_tensors(batch, w2i, label2i)

    # Step 2: 
    # (re)sets all parameter gradients to 0
    #  before using each batch of inputs
        my_classifier.zero_grad()
    # Step 3: forward propagation
    #         NB: my_classifier(X) implicitely calls my_classifier.forward(X)
        log_probs = my_classifier(X)
        #print("LOG PROBS on train at batch %i:\n" %nb_batch, log_probs)
    # optional: accuracy on this batch
        pred_labels = torch.argmax(log_probs, dim=1)
        accuracy = (pred_labels == Y).sum().item() / len(Y)
        print("PREDICTED LABELS ON BATCH", [ i2label[l] for l in pred_labels ])
        print("     GOLD LABELS ON BATCH", [ i2label[l] for l in Y ])
        print(f"Train accuracy on batch {nb_batch}: {accuracy:.2%}")
    
    # Step 4: Compute the loss (NB: this is the loss for the full batch of inputs X)
    #         The input to the loss function is for each example in X,
    #          the log_probabilities for each class, and the gold label
        loss = loss_function(log_probs, Y) 

    
    # Step 5: Compute the gradients
        loss.backward() # partial derivatives of loss with respect to
                    # all the tensors that - were used to compute loss,
                    #                      - and have requires_grad=True
                    # after this call, all the parameters have their .grad attribute
                    # filled with the partial derivative

    # Step 6: Update the parameters
    #         NB: the optimizer instance knows what are the parameters to update
        optimizer.step()
    
    # Step 7: Implement early stopping:
    with torch.no_grad():
        log_probs = my_classifier(X_dev) # Compute log probabilities on dev set
        loss = loss_function(log_probs, Y_dev) # Compute loss on dev set
        dev_loss += loss.item()
        print(f"Best loss: {best_loss:.4}")
        print(f"Dev loss: {dev_loss:.4} at epoch {epoch}, batch {nb_batch}")
            
    if dev_loss < best_loss: # If the loss on the dev set is lower than the best loss
        best_loss = dev_loss # Update the best loss
        best_epoch = epoch # Update the best epoch
    else: # If the loss on the dev set is higher than the best loss
        print("Performance decreases, stopping training")
        print(f"Best epoch: {best_epoch}, best loss: {best_loss:.4}")
        break # Stop training
    


Training on the EP dataset:
Epoch 0
PREDICTED LABELS ON BATCH ['FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH']
     GOLD LABELS ON BATCH ['ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH']
Train accuracy on batch 0: 59.52%
PREDICTED LABELS O

In [None]:
# prediction after training:
print("\nPrediction on test, after training on full dataset:")
# NB: when we are not training, we don't have to compute gradients
# => to be more efficient, we use torch.no_grad()
with torch.no_grad():
    log_probs = my_classifier(X_test)  
    #print("LOG PROBS on test:\n", log_probs)
    
    pred_labels = torch.argmax(log_probs, dim=1)
    accuracy = (pred_labels == Y_test).sum().item() / len(Y_test)
    print("PREDICTED LABELS", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS", [ i2label[l] for l in Y_test ])
    print(f"Accuracy on test set: {accuracy:.2%}")
    
    # prediction on dev set:
    log_probs = my_classifier(X_dev)
    
    pred_labels = torch.argmax(log_probs, dim=1)
    accuracy = (pred_labels == Y_dev).sum().item() / len(Y_dev)
    print("PREDICTED LABELS", [ i2label[l] for l in pred_labels ])
    print("     GOLD LABELS", [ i2label[l] for l in Y_dev ])
    print(f"Accuracy on dev set: {accuracy:.2%}")


Prediction on test, after training on full dataset:
PREDICTED LABELS ['FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'ENGLISH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLISH', 'ENGLISH', 'FRENCH', 'ENGLIS