<a href="https://colab.research.google.com/github/HaojiaK/blog/blob/cs224n_Natural_Language_Processing/Word_Window_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import pprint
# Shared pretty-printer instance; no cell visible in this notebook calls it.
pp = pprint.PrettyPrinter()

# Data
# Raw training corpus: one plain-text sentence per list entry.
corpus = [
  "We always come to Paris",
  "The professor is from Australia",
  "I live in Stanford",
  "He comes from Taiwan",
  "The capital of Turkey is Ankara",
]

# Preprocessing
def preprocess_sentence(sentence):
  """Lowercase `sentence` and split it on whitespace.

  Args:
    sentence: the raw sentence as a string.

  Returns:
    A list of lowercase tokens (empty when the sentence has no tokens).
  """
  lowered = sentence.lower()
  tokens = lowered.split()
  return tokens

# Tokenize every sentence through the shared helper instead of repeating its
# body inline, so the corpus and any later input are normalized the same way.
train_sentences = [preprocess_sentence(sent) for sent in corpus]
print(train_sentences)  # inspect the result of tokenization

[['we', 'always', 'come', 'to', 'paris'], ['the', 'professor', 'is', 'from', 'australia'], ['i', 'live', 'in', 'stanford'], ['he', 'comes', 'from', 'taiwan'], ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]


In [None]:
# Lowercase tokens that count as locations.
locations = {"australia", "ankara", "paris", "stanford", "taiwan", "turkey"}

# One 0/1 label per token: 1 when the token is in `locations`, otherwise 0.
train_labels = [
  [int(token in locations) for token in sentence]
  for sentence in train_sentences
]
print(train_labels)

[[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1, 0, 1]]


In [None]:
# Every distinct token that occurs anywhere in the training sentences.
vocabulary = {token for sentence in train_sentences for token in sentence}
print(vocabulary)

{'capital', 'ankara', 'he', 'of', 'we', 'professor', 'in', 'paris', 'australia', 'always', 'to', 'i', 'live', 'stanford', 'turkey', 'from', 'comes', 'is', 'the', 'come', 'taiwan'}


In [None]:
# Add the two special tokens to the vocabulary: "<unk>" (unknown token) and
# "<pad>" (padding token).
vocabulary.update(("<unk>", "<pad>"))
def pad_window(sentence, window_size, pad_token="<pad>"):
  """Surround `sentence` with `window_size` padding tokens on each side.

  Args:
    sentence: list of tokens.
    window_size: how many `pad_token` entries to add before and after.
    pad_token: the token used as padding.

  Returns:
    A new list; `sentence` itself is not modified.
  """
  padding = [pad_token for _ in range(window_size)]
  return padding + sentence + padding

# Number of padding tokens added on each side of a sentence.
window_size = 2
# Show the first training sentence after padding.
print(pad_window(train_sentences[0], window_size=window_size))

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']


In [None]:
# index -> token: the vocabulary in sorted order, so list position is the index.
ix_to_word = sorted(vocabulary)
# token -> index: the inverse lookup of `ix_to_word`.
word_to_ix = dict(zip(ix_to_word, range(len(ix_to_word))))
print(word_to_ix)

{'<pad>': 0, '<unk>': 1, 'always': 2, 'ankara': 3, 'australia': 4, 'capital': 5, 'come': 6, 'comes': 7, 'from': 8, 'he': 9, 'i': 10, 'in': 11, 'is': 12, 'live': 13, 'of': 14, 'paris': 15, 'professor': 16, 'stanford': 17, 'taiwan': 18, 'the': 19, 'to': 20, 'turkey': 21, 'we': 22}


In [None]:
# Look up the token stored at index 1 of the sorted vocabulary list.
ix_to_word[1]

'<unk>'

In [None]:
def convert_token_to_indices(sentence, word_to_ix):
  """Translate each token of `sentence` into its vocabulary index.

  Args:
    sentence: iterable of tokens.
    word_to_ix: token -> index mapping; must contain "<unk>" whenever a
      token of `sentence` is missing from it.

  Returns:
    A list of indices, with the "<unk>" index for every unknown token.
  """
  return [
    word_to_ix[token] if token in word_to_ix else word_to_ix["<unk>"]
    for token in sentence
  ]

def _convert_token_to_indices(sentence, word_to_ix):
  return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]

# "kuwait" is not one of the vocabulary entries, so it exercises the "<unk>" fallback.
example_sentence = "we always come to kuwait".split()
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
# Map the indices back to tokens to show what survives the round trip.
restored_example = [ix_to_word[position] for position in example_indices]

print(
  f"Original sentence is : {example_sentence}",
  f"Going from words to indices: {example_indices}",
  f"Going from indices to words: {restored_example}",
  sep="\n",
)

Original sentence is : ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [None]:
# Convert every training sentence to a list of vocabulary indices.
# NOTE(review): despite the name, no padding is applied in this cell.
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
print(example_padded_indices)

[[22, 2, 6, 20, 15], [19, 16, 12, 8, 4], [10, 13, 11, 17], [9, 7, 8, 18], [19, 5, 14, 21, 12, 3]]


In [None]:
# Size of each word vector.
embedding_dim = 5
# Embedding table with one `embedding_dim`-sized row per vocabulary entry.
embeds = nn.Embedding(num_embeddings=len(vocabulary), embedding_dim=embedding_dim)
# Display the table's parameters (the weight matrix).
list(embeds.parameters())

[Parameter containing:
 tensor([[ 0.1128,  1.4686, -0.3391,  1.6743,  0.8044],
         [ 2.0078, -1.7484, -1.0447,  0.5435,  0.6409],
         [ 0.6804,  0.4305,  1.4015,  0.3133,  0.0727],
         [ 1.1641, -0.2828, -0.3186, -0.3514,  0.2487],
         [-1.2190,  2.6069,  0.3437,  0.8992, -0.8399],
         [-0.6473,  0.8299,  1.2357,  0.7291,  0.4590],
         [-1.4029,  0.4665, -0.6737, -0.8666, -0.2332],
         [ 1.9436,  0.0778,  0.8547,  2.3466, -0.2070],
         [ 1.1855,  1.8207, -0.8656, -0.9261, -0.2997],
         [ 0.9397, -0.3572, -0.1898,  1.1983, -1.1415],
         [ 2.0151,  0.3944,  0.6605, -0.2400, -0.4339],
         [ 0.7393,  0.4328, -1.8211, -0.7153,  0.1100],
         [ 0.3360,  1.2404, -0.5756,  0.0267, -0.3576],
         [-1.6264,  0.7343,  0.1250,  0.3723,  0.7326],
         [ 1.3607,  2.2061, -1.0595,  1.3854, -0.3610],
         [ 1.5852, -0.0804, -0.4942,  0.0179,  1.4699],
         [ 0.3919,  0.5853, -0.1710,  1.3005, -0.4444],
         [ 0.6841,  0.159

In [None]:
# Fetch the embedding of one word: look up its vocabulary index, wrap the
# index in a long tensor, and pass that tensor through the embedding layer.
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
print(paris_embed)

tensor([ 1.5852, -0.0804, -0.4942,  0.0179,  1.4699],
       grad_fn=<EmbeddingBackward0>)


In [None]:
# Fetch several embeddings at once by passing a tensor holding all indices.
index_paris, index_ankara = word_to_ix["paris"], word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
# One output row per index, in the same order as `indices`.
embeddings = embeds(indices_tensor)
print(embeddings)

tensor([[ 1.5852, -0.0804, -0.4942,  0.0179,  1.4699],
        [ 1.1641, -0.2828, -0.3186, -0.3514,  0.2487]],
       grad_fn=<EmbeddingBackward0>)


In [None]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
  """Collate (sentence, labels) pairs into padded tensors for one batch.

  Args:
    batch: iterable of (sentence, labels) pairs, where `sentence` is a list
      of tokens and `labels` a list of integer labels.
    window_size: number of "<pad>" tokens added on each side of a sentence.
    word_to_ix: token -> index mapping containing "<pad>" and "<unk>".

  Returns:
    A tuple (x_padded, y_padded, lengths):
      x_padded: LongTensor of token indices, one row per sentence, padded on
        the right with the "<pad>" index to the longest row.
      y_padded: LongTensor of labels, padded on the right with 0.
      lengths: LongTensor holding the number of labels of each sentence.
  """
  sentences, labels = zip(*batch)

  # Add the context-window padding around every sentence.
  border = ["<pad>"] * window_size
  windowed = [border + sentence + border for sentence in sentences]

  # Tokens -> indices, falling back to "<unk>" for unknown tokens.
  token_ids = [
    [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]
    for sentence in windowed
  ]

  # Right-pad the index sequences so they stack into a single tensor.
  pad_ix = word_to_ix["<pad>"]
  x_padded = nn.utils.rnn.pad_sequence(
    [torch.LongTensor(ids) for ids in token_ids],
    batch_first=True,
    padding_value=pad_ix,
  )

  # Record the label count of each sentence before the labels are padded.
  lengths = torch.LongTensor([len(sentence_labels) for sentence_labels in labels])

  y_padded = nn.utils.rnn.pad_sequence(
    [torch.LongTensor(sentence_labels) for sentence_labels in labels],
    batch_first=True,
    padding_value=0,
  )

  return x_padded, y_padded, lengths


In [None]:
def _custom_collate_fn(batch, window_size, word_to_ix):
  """Variant of `custom_collate_fn` that reuses the module-level helpers.

  Relies on the notebook's top-level `pad_window` and
  `convert_token_to_indices` functions instead of nested copies.

  Args:
    batch: iterable of (sentence, labels) pairs, where `sentence` is a list
      of tokens and `labels` a list of integer labels.
    window_size: number of "<pad>" tokens added on each side of a sentence.
    word_to_ix: token -> index mapping containing "<pad>" and "<unk>".

  Returns:
    A tuple (x_padded, y_padded, lengths): padded token-index tensor, label
    tensor padded with 0, and the number of labels of each sentence.
  """
  x, y = zip(*batch)
  x = [pad_window(s, window_size=window_size) for s in x]
  # Fixed: this called `convert_tokens_to_indices`, a name that exists only
  # nested inside `custom_collate_fn`; the module-level helper is
  # `convert_token_to_indices`.
  x = [convert_token_to_indices(s, word_to_ix) for s in x]

  # Right-pad the index sequences with the "<pad>" index so they stack.
  pad_token_ix = word_to_ix["<pad>"]
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # Record the label count of each sentence before the labels are padded.
  lengths = [len(label) for label in y]
  lengths = torch.LongTensor(lengths)
  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  return x_padded, y_padded, lengths

In [None]:
# Pair every tokenized sentence with its label list.
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra collate arguments so DataLoader can call it with the batch only.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Walk through one pass over the data and show what every batch contains.
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(
    f"Iteration {counter}",
    "Batched Input:",
    batched_x,
    "Batched Labels:",
    batched_y,
    "Batched Lengths:",
    batched_lengths,
    "",
    sep="\n",
  )
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0,  0],
        [ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 0]])
Batched Lengths:
tensor([6, 4])

Iteration 2
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5])



In [None]:
print("Original Tensor:", batched_x, "", sep="\n")

# Slide a window of 2 * window_size + 1 indices along dimension 1 with step 1.
chunk = batched_x.unfold(1, 2 * window_size + 1, 1)
print("Windows:", chunk, sep="\n")

Original Tensor:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0]])

Windows:
tensor([[[ 0,  0, 19, 16, 12],
         [ 0, 19, 16, 12,  8],
         [19, 16, 12,  8,  4],
         [16, 12,  8,  4,  0],
         [12,  8,  4,  0,  0]]])


In [None]:
class WordWindowClassifier(nn.Module):
  """Scores every token from the embeddings of the window centred on it.

  The model embeds each window of 2 * window_size + 1 token indices,
  concatenates the embeddings, and applies a tanh hidden layer followed by
  a sigmoid output unit, producing one probability per window.
  """

  def __init__(self, hyperparameters, vocab_size, pad_ix=0):
    """Build the layers.

    Args:
      hyperparameters: dict with keys "window_size", "embed_dim",
        "hidden_dim" and "freeze_embeddings".
      vocab_size: number of rows in the embedding table.
      pad_ix: index of the padding token, passed as `padding_idx` to the
        embedding layer.
    """
    super(WordWindowClassifier, self).__init__()

    self.window_size = hyperparameters["window_size"]
    self.embed_dim = hyperparameters["embed_dim"]
    self.hidden_dim = hyperparameters["hidden_dim"]
    self.freeze_embeddings = hyperparameters["freeze_embeddings"]

    self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
    if self.freeze_embeddings:
      # Fixed: this referenced the non-existent attribute `self.embed_layer`,
      # so freeze_embeddings=True raised AttributeError.
      self.embeds.weight.requires_grad = False

    # Fixed: this read the notebook-level global `window_size`; the layer
    # width must follow the hyperparameter this instance was built with.
    full_window_size = 2 * self.window_size + 1
    self.hidden_layer = nn.Sequential(
        nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
        nn.Tanh()
    )
    self.output_layer = nn.Linear(self.hidden_dim, 1)
    self.probabilities = nn.Sigmoid()

  def forward(self, inputs):
    """Compute one probability per window.

    Args:
      inputs: 2-D tensor (B, L) of token indices, already padded with
        `window_size` tokens on each side of every sentence.

    Returns:
      Tensor of shape (B, L - 2 * window_size) with values from the sigmoid.
    """
    B, _ = inputs.size()
    full_window_size = 2 * self.window_size + 1

    # (B, L) -> (B, L~, full_window_size): one window per step along dim 1.
    token_windows = inputs.unfold(1, full_window_size, 1)
    _, adjusted_length, _ = token_windows.size()

    assert token_windows.size() == (B, adjusted_length, full_window_size)

    # (B, L~, full_window_size) -> (B, L~, full_window_size, embed_dim)
    embedded_windows = self.embeds(token_windows)

    # Concatenate the embeddings of each window into a single vector.
    embedded_windows = embedded_windows.view(B, adjusted_length, -1)

    layer_1 = self.hidden_layer(embedded_windows)

    # (B, L~, hidden_dim) -> (B, L~, 1)
    output = self.output_layer(layer_1)

    output = self.probabilities(output)
    # Drop the trailing singleton dimension: (B, L~, 1) -> (B, L~)
    output = output.view(B, -1)

    return output



In [None]:
# Data pipeline: pair sentences with labels and batch them through the
# window-padding collate function.
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Model hyperparameters. "batch_size" and "window_size" are taken from the
# variables that configure the loader; "batch_size" was hard-coded to 4 here
# although the loader actually uses 2.
model_hyperparameters = {
    "batch_size": batch_size,
    "window_size": window_size,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
# Pass the padding index explicitly instead of relying on "<pad>" happening
# to sort to index 0.
model = WordWindowClassifier(model_hyperparameters, vocab_size, pad_ix=word_to_ix["<pad>"])

learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def loss_function(batch_outputs, batch_labels, batch_lengths):
  """Mean binary cross-entropy over the real (non-padded) tokens of a batch.

  Fixes two problems of the previous version: the mean-reduced BCE was
  divided a second time by the token count, and padded positions (label 0)
  were included in the loss.

  Args:
    batch_outputs: (batch, max_len) tensor of predicted probabilities.
    batch_labels: (batch, max_len) tensor of 0/1 labels, padded with 0.
    batch_lengths: (batch,) tensor with the true length of each sentence.

  Returns:
    Scalar tensor: the BCE summed over valid tokens divided by their count.
  """
  # Keep the per-token losses so the padded positions can be masked out.
  per_token = nn.BCELoss(reduction="none")(batch_outputs, batch_labels.float())

  # mask[b, t] is True when position t lies inside sentence b.
  positions = torch.arange(batch_outputs.size(1), device=batch_outputs.device)
  mask = positions.unsqueeze(0) < batch_lengths.to(batch_outputs.device).unsqueeze(1)

  # clamp avoids a division by zero when the batch has no valid token.
  num_tokens = mask.sum().clamp(min=1)
  loss = (per_token * mask).sum() / num_tokens

  return loss

In [None]:
def train_epoch(loss_function, optimizer, model, loader):
  """Run one optimization pass over `loader`.

  Args:
    loss_function: callable (outputs, labels, lengths) -> scalar loss tensor.
    optimizer: optimizer holding the parameters of `model`.
    model: the module being trained.
    loader: iterable yielding (inputs, labels, lengths) batches.

  Returns:
    The sum of the per-batch loss values (0 when `loader` is empty).
  """
  total_loss = 0
  for batch_inputs, batch_labels, batch_lengths in loader:
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()
    # Call the module itself rather than `.forward` so that registered
    # hooks run as part of the forward pass.
    outputs = model(batch_inputs)
    loss = loss_function(outputs, batch_labels, batch_lengths)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  return total_loss

def train(loss_function, optimizer, model, loader, num_epochs=10000):
  """Train `model` for `num_epochs` epochs, printing the loss every 100 epochs.

  Fixed: the third parameter was misspelled `moel`, so the argument was
  ignored and the body silently used the global `model` instead.

  Args:
    loss_function: callable (outputs, labels, lengths) -> scalar loss tensor.
    optimizer: optimizer holding the parameters of `model`.
    model: the module to train.
    loader: iterable yielding (inputs, labels, lengths) batches.
    num_epochs: number of passes over `loader`.
  """
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % 100 == 0:
      print(epoch_loss)

In [None]:
# Train for 1000 epochs; `train` prints the epoch loss every 100 epochs.
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.23924871534109116
0.1817886345088482
0.1397198811173439
0.09675858914852142
0.08276921324431896
0.067077761515975
0.047861034981906414
0.03553804475814104
0.03287338186055422
0.026043725665658712


In [None]:
# A held-out sentence whose first token, "she", is not in the training corpus.
test_corpus = ["She comes from Paris"]
test_sentences = [preprocess_sentence(sentence) for sentence in test_corpus]
test_labels = [[0, 0, 0, 1]]

test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
# Reuse the training collate function, driven by the settings declared above.
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)
test_loader = DataLoader(
  test_data,
  batch_size=batch_size,
  shuffle=shuffle,
  collate_fn=collate_fn,
)


In [None]:
# Inference only: disable gradient tracking, and call the module itself
# rather than `.forward` so that registered hooks run.
with torch.no_grad():
  for test_instance, labels, _ in test_loader:
    outputs = model(test_instance)
    print(labels)
    print(outputs)


tensor([[0, 0, 0, 1]])
tensor([[0.1542, 0.0419, 0.4553, 0.9657]], grad_fn=<ViewBackward0>)
