# Word window classification

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch
import torch.nn as nn

## Generate Data

In [2]:
sentences = ['I went to Paris last year', 
             'Toronto is the most beautiful city in the world',
             'I was born and raised in Casablanca',
             'Let us go to bali this summer',
             'I want to go to Ireland for my next vacation',
             'I am going to visit London next month']

labels = [[0, 0, 0, 1, 0, 0],
          [1, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1, 0, 0],
          [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 0, 0]]

## Preprocess Data

In [3]:
def PreProcess(sentence):
    sentence = sentence.lower()
    sentence = sentence.split()
    return sentence

In [4]:
sentences = [PreProcess(sentence) for sentence in sentences]
sentences

[['i', 'went', 'to', 'paris', 'last', 'year'],
 ['toronto', 'is', 'the', 'most', 'beautiful', 'city', 'in', 'the', 'world'],
 ['i', 'was', 'born', 'and', 'raised', 'in', 'casablanca'],
 ['let', 'us', 'go', 'to', 'bali', 'this', 'summer'],
 ['i', 'want', 'to', 'go', 'to', 'ireland', 'for', 'my', 'next', 'vacation'],
 ['i', 'am', 'going', 'to', 'visit', 'london', 'next', 'month']]

In [5]:
vocabulary = set(w for s in sentences for w in s)

In [6]:
word_to_ix = {}

i = 2
for word in vocabulary:
    word_to_ix[word] = i
    i += 1

In [7]:
word_to_ix['<pad>'] = 0
word_to_ix['<unk>'] = 1

In [8]:
word_to_ix

{'raised': 2,
 'year': 3,
 'summer': 4,
 'went': 5,
 'london': 6,
 'was': 7,
 'in': 8,
 'go': 9,
 'toronto': 10,
 'beautiful': 11,
 'born': 12,
 'let': 13,
 'most': 14,
 'want': 15,
 'going': 16,
 'my': 17,
 'vacation': 18,
 'the': 19,
 'last': 20,
 'is': 21,
 'month': 22,
 'bali': 23,
 'world': 24,
 'visit': 25,
 'next': 26,
 'i': 27,
 'to': 28,
 'us': 29,
 'casablanca': 30,
 'ireland': 31,
 'city': 32,
 'am': 33,
 'for': 34,
 'paris': 35,
 'and': 36,
 'this': 37,
 '<pad>': 0,
 '<unk>': 1}

In [26]:
def sentence_to_ix(sentence):
    indexes = []
    for word in sentence:
        # if the word is in the vocabulary return its index else return the index of <unk>
        indexes.append(word_to_ix.get(word, word_to_ix['<unk>']))
    return indexes

In [10]:
sentence_to_ix(['i', 'went', 'to', 'paris', 'last', 'year'])

[27, 5, 28, 35, 20, 3]

In [11]:
sentences = [sentence_to_ix(sentence) for sentence in sentences]

In [12]:
def window_padding(sentence, window_size, pad_index=0):
    return [pad_index] * window_size + sentence + [pad_index] * window_size

In [13]:
sentences = [window_padding(sentence, 2, pad_index=0) for sentence in sentences]
sentences

[[0, 0, 27, 5, 28, 35, 20, 3, 0, 0],
 [0, 0, 10, 21, 19, 14, 11, 32, 8, 19, 24, 0, 0],
 [0, 0, 27, 7, 12, 36, 2, 8, 30, 0, 0],
 [0, 0, 13, 29, 9, 28, 23, 37, 4, 0, 0],
 [0, 0, 27, 15, 28, 9, 28, 31, 34, 17, 26, 18, 0, 0],
 [0, 0, 27, 33, 16, 28, 25, 6, 26, 22, 0, 0]]

In [14]:
def final_padding(sentences, labels, word_to_ix):

    pad_token_ix = word_to_ix["<pad>"]

    # pad_sequence function expects the input to be a tensor, so we turn x into one
    sentences = [torch.LongTensor(sentence) for sentence in sentences]
    sentences_padded = nn.utils.rnn.pad_sequence(sentences, batch_first=True, padding_value=pad_token_ix)

    # We will also pad the labels. Before doing so, we will record the number 
    # of labels so that we know how many words existed in each example. 
    lengths = [len(label) for label in labels]
    lenghts = torch.LongTensor(lengths)

    labels = [torch.LongTensor(label) for label in labels]
    y_padded = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=pad_token_ix)

    # We are now ready to return our variables. The order we return our variables
    # here will match the order we read them in our training loop.
    return sentences_padded, y_padded, lenghts  

In [15]:
sentences = final_padding(sentences, labels, word_to_ix)[0]
labels = final_padding(sentences, labels, word_to_ix)[1]
lengths = final_padding(sentences, labels, word_to_ix)[2]

In [16]:
window_size = 2

In [17]:
# Finally; we create the windows in order to perform window word classification using unfold(dimension, size, step)

sentences = [sentence.unfold(0, 2*window_size + 1 ,1) for sentence in sentences]

sentences

[tensor([[ 0,  0, 27,  5, 28],
         [ 0, 27,  5, 28, 35],
         [27,  5, 28, 35, 20],
         [ 5, 28, 35, 20,  3],
         [28, 35, 20,  3,  0],
         [35, 20,  3,  0,  0],
         [20,  3,  0,  0,  0],
         [ 3,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]]),
 tensor([[ 0,  0, 10, 21, 19],
         [ 0, 10, 21, 19, 14],
         [10, 21, 19, 14, 11],
         [21, 19, 14, 11, 32],
         [19, 14, 11, 32,  8],
         [14, 11, 32,  8, 19],
         [11, 32,  8, 19, 24],
         [32,  8, 19, 24,  0],
         [ 8, 19, 24,  0,  0],
         [19, 24,  0,  0,  0]]),
 tensor([[ 0,  0, 27,  7, 12],
         [ 0, 27,  7, 12, 36],
         [27,  7, 12, 36,  2],
         [ 7, 12, 36,  2,  8],
         [12, 36,  2,  8, 30],
         [36,  2,  8, 30,  0],
         [ 2,  8, 30,  0,  0],
         [ 8, 30,  0,  0,  0],
         [30,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]]),
 tensor([[ 0,  0, 13, 29,  9],
         [ 0, 13, 29,  9, 28],
  

- Our preprocessing is done, we can build our model.
- Note that every new data that will be used to get a prediction should be preprocessed the same way

## Define Model

In [18]:
# Parameters
vocab_size = len(vocabulary) + 2
dim_embeddings = 5
hidden_dim = 25

In [19]:
embedding_table = nn.Embedding(vocab_size, dim_embeddings)
embedding_table.weight.requires_grad = True

hidden_layer = nn.Linear(sentences[0][0].numel()*dim_embeddings, hidden_dim)

prediction_layer = nn.Linear(hidden_dim, 1)

In [20]:
model = nn.Sequential(embedding_table, 
                      nn.Flatten(start_dim=0),
                      hidden_layer, nn.Tanh(), 
                      prediction_layer, nn.Sigmoid()
                      )

## Define Optimization Process

In [21]:
# Define dataloader

from torch.utils.data import DataLoader

loader = DataLoader(list(zip(sentences, labels)), batch_size=2)

In [22]:
# Define criterion
criterion = nn.BCELoss()

In [23]:
# Define optimizer
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.001)

## Train Model

In [24]:
for param in model.parameters():
    param.requires_grad = True

epochs = 1000
for epoch in range(epochs):   
    for batch_sentences, batch_labels in loader:
        predictions = []
        real = []
        for sentence, label in zip(batch_sentences, batch_labels):
            for window, word_label in zip(sentence, label):
                predictions.append(model(window))
                real.append(word_label)

        predictions, real = torch.stack(predictions), torch.stack(real).unsqueeze(1).float()

        optimizer.zero_grad()
        loss = criterion(predictions, real)
        loss.backward()
        optimizer.step()

## Plot train loss vs epochs

In [25]:
# plotting the loss
losses = []

for epoch in range(epochs):
    for batch_sentences, batch_labels in loader:
        predictions = []
        real = []
        for sentence, label in zip(batch_sentences, batch_labels):
            for window, word_label in zip(sentence, label):
                predictions.append(model(window))
                real.append(word_label)

        predictions, real = torch.stack(predictions), torch.stack(real).unsqueeze(1).float()

        optimizer.zero_grad()
        loss = criterion(predictions, real)
        loss.backward()
        optimizer.step()

    losses.append(loss.item())

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(losses))), y=losses, mode='lines'))
fig.update_layout(title='Loss over epochs', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

## Test on new sentence

In [29]:
test_sentence = 'I want to go to California in December'

In [30]:
# Preprocess the sentence
test_sentence = PreProcess(test_sentence)

# Convert the sentence to indexes
test_sentence = sentence_to_ix(test_sentence)

# Pad the sentence
test_sentence = window_padding(test_sentence, 2, pad_index=0)

# perform final padding
test_sentence = final_padding([test_sentence], [[0]], word_to_ix)[0]

test_sentence

tensor([[ 0,  0, 27, 15, 28,  9, 28,  1,  8,  1,  0,  0]])

In [33]:
# Create the windows
window_size = 2
test_sentence = test_sentence.unfold(1, 2*window_size + 1 ,1)



In [34]:
test_sentence

tensor([[[ 0,  0, 27, 15, 28],
         [ 0, 27, 15, 28,  9],
         [27, 15, 28,  9, 28],
         [15, 28,  9, 28,  1],
         [28,  9, 28,  1,  8],
         [ 9, 28,  1,  8,  1],
         [28,  1,  8,  1,  0],
         [ 1,  8,  1,  0,  0]]])

In [46]:
predictions = []
for i in range(test_sentence.shape[1]):
    predictions.append(model(test_sentence[:, i]))

torch.argmax(torch.stack(predictions)) 

tensor(5)

-  The word of index 5 in our test sentence is "California" which is indeed a location.