In [1]:
#!pip install -r requirements.txt
import gensim.downloader
import os
import wget

os.chdir("..")

In [2]:
# Question 1.1 Download

# This downloads under the directory "~/gensim-data" by default
# change GENSIM_DATA_DIR environment variable if you don't want this
# size is about 1.6 GB
w2v = gensim.downloader.load('word2vec-google-news-300')

In [3]:
# 3 million words
len(w2v), type(w2v)

(3000000, gensim.models.keyedvectors.KeyedVectors)

In [4]:
# Each vector is 300 long
print(w2v["compute"][:10])
w2v["compute"].shape

[ 0.22753906 -0.34570312  0.0625      0.11132812  0.17089844  0.03442383
  0.13574219  0.16699219  0.07177734 -0.07421875]


(300,)

In [5]:
# Question 1.2 Download

# Create the directory if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

conll_raw_url = "https://raw.githubusercontent.com/TheAnig/NER-LSTM-CNN-Pytorch/master/data/"
filenames = ["eng.train", "eng.testa", "eng.testb"] 

urls = {(f, f"{conll_raw_url}/{f}") for f in filenames}

for fn, url in urls:
    save_path = f"data/{fn}"

    if os.path.exists(save_path):
        print(f"{fn} already exists. Skipping")
        continue
    wget.download(url, save_path)

eng.train already exists. Skipping
eng.testa already exists. Skipping
eng.testb already exists. Skipping


# Question 1. 1

The most similar words are

    (a) student:  students, 0.729
    (b) Apple:  Apple_AAPL, 0.746
    (c) apple:  apples, 0.720

In [6]:
# (a) “student”; (b) “Apple”; (c) “apple”
words = ["student", "Apple", "apple"]

for w in words:
    most_sim_w, score = w2v.most_similar(w)[0]
    print(f"{w}:  {most_sim_w}, {score:.3f}")

student:  students, 0.729
Apple:  Apple_AAPL, 0.746
apple:  apples, 0.720


# Question 1.2

    (a1) training set has 14987 sentences
    (a2) development set has 3466 sentences
    (a3) testing set has 3684 sentences
    (a4) train tags      : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}
    (a5) development tags: {'O', 'B-MISC', 'I-MISC', 'I-LOC', 'I-PER', 'I-ORG'}
    (a6) testing tags    : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}

    (b1) 9/16 - Luo Yigang (China) beat Hwang Sun-ho (South Korea) 15-3
    (b2) (i) ['Luo Yigang', 'China', 'Hwang', 'Sun-ho', 'South Korea']


    (b2) (ii) 
    The function get_named_entities takes a sentence as input, represented as a list of (word, tag) tuples. It iterates through the sentence. 
    
    `entities` is an array that accumulates all Named Entities in the sentence.

    `current_tag` specifies the current Named Entity Type of the current Entity we are processing.

    `current_entities` is an array that accumulates word of the same entity. 
    
    If current iteration find a tag that starts with 'B-', it indicates a start of a new named entity. If there are words in the `current_entities`, it will contain an entire Named Entity and we append to the `entities` array. Then we reset `current_entities` with the current word and the `current_tag` with the current type.

    If current iteration find a tag that starts with 'I-', it indicates either a start of a new named entity or a continuation depending on the history. If `current_tag` == type of this iteration, then it means it's a continuation since this would begin with 'B-' otherwise. Thus, we append to the 'current_entities' array. 
    Otherwise, this is a start of a new named entity. Thus, if there are words in the `current_entities`, it will contain an entire Named Entity and we append to the `entities` array. Then we reset `current_entities` with the current word and the `current_tag` with the current type. 
    
    Lastly, if current iteration is an 'O' type, it indicates this is not part of a named entity. If there are words in the `current_entities`, it will contain an entire Named Entity and we append to the `entities` array. Then we reset `current_entities` with the empty list [] and the `current_tag` with None.

    At the end of this loop, if the `current_entities` is not empty, this indicates the last named entity ends the sentence, and we simply append it to the `entities` list, and we are done.






In [7]:
# File has one line

# First column is the word
# Second is POS tag
# Third is Consistuency parsing tag
# Fourth is NER tag

# The NER tagging column
TAGGING_INDEX  = 3

# Returns a 3 dim array of sentences x words x (word_value, word_category)
def process_sets(filepath):
    raw = open(filepath)
    fin, curr = [], []
    
    for r in raw:
        if r == "\n":
            fin.append(curr)
            curr = []
            continue
            
        r = r[:-1].split()

        # Some files have these which are used to divide sentences
        if r[0] == "-DOCSTART-":
            continue
        
        r = [r[i] for i in (0, TAGGING_INDEX)]  # select first and last columns
        curr.append(r)
    
    fin.append(curr)
    return fin

trainset = process_sets("data/eng.train")
devset = process_sets("data/eng.testa")   # aka validation set
testset = process_sets("data/eng.testb")

len(trainset), len(devset), len(testset)

(14987, 3466, 3684)

In [8]:
train_tags = set(w[1] for s in trainset for w in s)
dev_tags = set(w[1] for s in devset for w in s)
test_tags = set(w[1] for s in testset for w in s)

print(f"train tags: {train_tags}")
print(f"development tags: {dev_tags}")
print(f"testing tags: {test_tags}")

train tags: {'B-LOC', 'I-MISC', 'B-MISC', 'I-ORG', 'I-LOC', 'O', 'B-ORG', 'I-PER'}
development tags: {'I-MISC', 'B-MISC', 'I-ORG', 'I-LOC', 'O', 'I-PER'}
testing tags: {'B-LOC', 'I-MISC', 'B-MISC', 'I-ORG', 'I-LOC', 'O', 'B-ORG', 'I-PER'}


In [9]:
trainset[0]

[['EU', 'I-ORG'],
 ['rejects', 'O'],
 ['German', 'I-MISC'],
 ['call', 'O'],
 ['to', 'O'],
 ['boycott', 'O'],
 ['British', 'I-MISC'],
 ['lamb', 'O'],
 ['.', 'O']]

In [10]:
# get all sentences with 2 consecutive named entities
sent_eg = [] # array to store sentences with 2 consecutive tags

for sentence in trainset:
    count = 0 # count of consecutive types
    count_buffer = 0 # buffer to store running count of identical types
    current_tag = None 

    for word in sentence: # iterate through each word in sentence 
        if word[1] == 'O': # if word is not a named entity
            current_tag = None
            continue

        elif current_tag == None: # if this is the first named entity after a non-named entity or start of sentence
            current_tag = word[1]

        elif (word[1][2:] == current_tag[2:]) and count_buffer == 0: # if this is the first word that follows the same entity as the previous word
            count += 1
            count_buffer += 1

        elif word[1][2:] != current_tag[2:]: # if this is a different entity from the previous word
            current_tag = word[1]
            count_buffer = 0

        else: # if this follows the same entity as the previous word but is not the first word to do so
            continue

    if count == 2: # if there are 2 consecutive named entities, add the sentence to the array
        sent_eg.append(sentence)

In [11]:
# get random sentence with 2 consecutive named entities
print(sent_eg[0])
# print just the words 
print([word[0] for word in sent_eg[0]])

[['9/16', 'O'], ['-', 'O'], ['Luo', 'I-PER'], ['Yigang', 'I-PER'], ['(', 'O'], ['China', 'I-LOC'], [')', 'O'], ['beat', 'O'], ['Hwang', 'I-PER'], ['Sun-ho', 'I-MISC'], ['(', 'O'], ['South', 'I-LOC'], ['Korea', 'I-LOC'], [')', 'O'], ['15-3', 'O']]
['9/16', '-', 'Luo', 'Yigang', '(', 'China', ')', 'beat', 'Hwang', 'Sun-ho', '(', 'South', 'Korea', ')', '15-3']


In [12]:
# form complete named entities from sentence
def get_named_entities(sentence):
    entities = [] # array to store named entities
    current_entity = [] # holds words of a currently processed entity
    current_tag = None  # hold the the type of current named entity 

    for word, tag in sentence:
        if tag.startswith('B-'): # start of a new entity
            if current_entity:  # if the current_entity list is not empty, append it to entities
                entities.append(" ".join(current_entity)) # join the words in the current_entity list to form a named entity
            current_entity = [word] # start a new entity
            current_tag = tag[2:] # set current_tag to the type of the new entity
        
        elif tag.startswith('I-'): # either a continuation of the current entity or a different entity
            if current_tag == tag[2:]:  # Continue the same entity
                current_entity.append(word)
            else:  # Different entity found
                if current_entity:  # Save the previous entity
                    entities.append(" ".join(current_entity))
                current_entity = [word]  # Start a new entity
                current_tag = tag[2:]

        else: # 'O' found
            if current_entity:  # if current entity is not empty, it means we just finished processing an entity
                entities.append(" ".join(current_entity))
            current_entity = [] # reset current_entity
            current_tag = None # reset current_tag

    if current_entity:  # Catch remaining entity
        entities.append(" ".join(current_entity))
        
    return entities

print(get_named_entities(sent_eg[0]))

['Luo Yigang', 'China', 'Hwang', 'Sun-ho', 'South Korea']


# Question 1.3

    (a) Words present in the training set, dev set and test set, but not in the word2vec model will be assigned a vector of zeros. However, if these words in lower case is present in word2vec model, assign that instead.
    (b) LSTM network used (details in pdf)

In [13]:
os.chdir("./utils")

In [14]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from common_utils import *
from model import *
set_seed(42)

In [15]:
# check number of words not in word2vec model

fullset = trainset + devset + testset
abs_w2v, abs_w2v_lower = get_words_not_in_model(fullset, w2v)
print(len(abs_w2v), len(abs_w2v_lower))
# we see that there are 83,402 words in fullset not in the word2vec
# however, a slight decrease to 83,322 words when we convert the missing words to lower case and check again

83402 83322


Data preproccesing

In [16]:
# append <UNK> token to words not in word2vec model
w2v['<UNK>'] = np.zeros(300)
print(len(w2v))
print(w2v.key_to_index['<UNK>'])

3000001
3000000


In [17]:
word_to_ix = {"<PAD>": 3000000, "<UNK>": 3000000} # assign <PAD> and <UNK> tokens to index 3000000

for sentence in fullset:
    for word, tag in sentence:
        if word in w2v.key_to_index:
            word_to_ix[word] = w2v.key_to_index[word]
        else:
            if word.lower() in w2v.key_to_index:
                word_to_ix[word] = w2v.key_to_index[word.lower()]

tag_to_ix = {"<PAD>": 0, "O": 1, "B-MISC": 2, "B-LOC": 3, "I-MISC": 4, "I-LOC": 5, "B-ORG": 6, "I-PER": 7, "I-ORG": 8}

print(word_to_ix)
print(tag_to_ix)

{'<PAD>': 0, 'O': 1, 'B-MISC': 2, 'B-LOC': 3, 'I-MISC': 4, 'I-LOC': 5, 'B-ORG': 6, 'I-PER': 7, 'I-ORG': 8}


In [18]:
sentences, labels = separate_words_tags(trainset) # separate words and tags
train_data = NERDataset(sentences, labels, word_to_ix, tag_to_ix)

sentences, labels = separate_words_tags(devset)
dev_data = NERDataset(sentences, labels, word_to_ix, tag_to_ix)

sentences, labels = separate_words_tags(testset)
test_data = NERDataset(sentences, labels, word_to_ix, tag_to_ix)

# Creating DataLoader ; pad_collate function pads sentences to the length of the longest sentence in the batch
batch_size = 16

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)  
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)

---
### Training 
    (c) 

In [19]:
embedding_dim = 300
hidden_dim = 16
vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)
weights_matrix = torch.FloatTensor(w2v.vectors)


model = NERModel(embedding_dim=embedding_dim, hidden_dim=hidden_dim, vocab_size=vocab_size, tagset_size=tagset_size, weights_matrix=weights_matrix)

In [20]:
lr = 0.001 
optimiser = torch.optim.Adam(model.parameters(), lr=lr)
loss = torch.nn.CrossEntropyLoss()

if torch.cuda.is_available(): # nvidia gpu
    device = torch.device("cuda")
elif torch.backends.mps.is_available(): # apple gpu
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

epochs = 300 # number of epochs
early_stopper = EarlyStopper(patience=10) # initialise early stopper

# Make directory to save baseline model
model_path = "../saved_models/"
if not os.path.exists(model_path):
    os.makedirs(model_path, exist_ok=True)

# Define the device-specific path
device_type = None
if device == torch.device("cuda"):
    device_type = "cuda"
elif device == torch.device("mps"):
    device_type = "mps"
else:
    device_type = "cpu"

# Construct the full path
device_path = os.path.join(model_path, device_type)

# Create the directory if it doesn't exist
if not os.path.exists(device_path):
    os.mkdir(device_path)

In [21]:
correct = 0
loss = torch.nn.CrossEntropyLoss()
for data in dev_loader:
    inputs, labels, _ = data
    inputs, labels = inputs.to(device), labels.to(device)

    outputs = model(inputs)

    
    _, predicted = torch.max(outputs.data, 1)

    correct += (predicted == labels).sum().item()

print(correct)


86109


In [22]:
train_loss_list, val_loss_list, val_acc_list = train(model, train_loader, dev_loader, optimiser, loss, device, epochs, early_stopper, device_path) # train model

Epoch 1/300:   0%|          | 0/937 [00:00<?, ?it/s]

Epoch 1/300: 100%|██████████| 937/937 [00:12<00:00, 74.40it/s, Training loss=0.3988]


Epoch 1/300 took 13.59s | Train loss: 0.3988 | Val loss: 0.1740 | Val accuracy: 94.68% | EarlyStopper count: 0


Epoch 2/300:  85%|████████▍ | 795/937 [00:11<00:02, 69.10it/s, Training loss=0.1512]


KeyboardInterrupt: 