In [2]:
#!pip install -r requirements.txt
import gensim.downloader
import os
import wget

os.chdir("..")

In [3]:
# Question 1.1 Download

# This downloads under the directory "~/gensim-data" by default
# change GENSIM_DATA_DIR environment variable if you don't want this
# size is about 1.6 GB
w2v = gensim.downloader.load('word2vec-google-news-300')

In [4]:
# 3 million words
len(w2v), type(w2v)

(3000000, gensim.models.keyedvectors.KeyedVectors)

In [9]:
# Each vector is 300 long
print(w2v["compute"][:10])
w2v["compute"].shape

[ 0.22753906 -0.34570312  0.0625      0.11132812  0.17089844  0.03442383
  0.13574219  0.16699219  0.07177734 -0.07421875]


(300,)

In [13]:
# Question 1.2 Download

# Create the directory if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

conll_raw_url = "https://raw.githubusercontent.com/TheAnig/NER-LSTM-CNN-Pytorch/master/data/"
filenames = ["eng.train", "eng.testa", "eng.testb"] 

urls = {(f, f"{conll_raw_url}/{f}") for f in filenames}

for fn, url in urls:
    save_path = f"data/{fn}"

    if os.path.exists(save_path):
        print(f"{fn} already exists. Skipping")
        continue
    wget.download(url, save_path)

# Question 1. 1

The most similar words are

    (a) student:  students, 0.729
    (b) Apple:  Apple_AAPL, 0.746
    (c) apple:  apples, 0.720

In [10]:
# (a) “student”; (b) “Apple”; (c) “apple”
words = ["student", "Apple", "apple"]

for w in words:
    most_sim_w, score = w2v.most_similar(w)[0]
    print(f"{w}:  {most_sim_w}, {score:.3f}")

student:  students, 0.729
Apple:  Apple_AAPL, 0.746
apple:  apples, 0.720


# Question 1.2

    (a1) training set has 14987 sentences
    (a2) development set has 3466 sentences
    (a3) testing set has 3684 sentences
    (a4) train tags      : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}
    (a5) development tags: {'O', 'B-MISC', 'I-MISC', 'I-LOC', 'I-PER', 'I-ORG'}
    (a6) testing tags    : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}

    (b1) 9/16 - Luo Yigang (China) beat Hwang Sun-ho (South Korea) 15-3

In [14]:
# File has one line

# First column is the word
# Second is POS tag
# Third is Consistuency parsing tag
# Fourth is NER tag

# The NER tagging column
TAGGING_INDEX  = 3

# Returns a 3 dim array of sentences x words x (word_value, word_category)
def process_sets(filepath):
    raw = open(filepath)
    fin, curr = [], []
    
    for r in raw:
        if r == "\n":
            fin.append(curr)
            curr = []
            continue
            
        r = r[:-1].split()

        # Some files have these which are used to divide sentences
        if r[0] == "-DOCSTART-":
            continue
        
        r = [r[i] for i in (0, TAGGING_INDEX)]  # select first and last columns
        curr.append(r)
    
    fin.append(curr)
    return fin

trainset = process_sets("data/eng.train")
devset = process_sets("data/eng.testa")   # aka validation set
testset = process_sets("data/eng.testb")

len(trainset), len(devset), len(testset)

(14987, 3466, 3684)

In [15]:
train_tags = set(w[1] for s in trainset for w in s)
dev_tags = set(w[1] for s in devset for w in s)
test_tags = set(w[1] for s in testset for w in s)

print(f"train tags: {train_tags}")
print(f"development tags: {dev_tags}")
print(f"testing tags: {test_tags}")

train tags: {'O', 'I-PER', 'I-LOC', 'B-LOC', 'B-MISC', 'I-ORG', 'I-MISC', 'B-ORG'}
development tags: {'I-PER', 'O', 'I-LOC', 'B-MISC', 'I-ORG', 'I-MISC'}
testing tags: {'O', 'I-PER', 'I-LOC', 'B-LOC', 'B-MISC', 'I-ORG', 'I-MISC', 'B-ORG'}


In [20]:
trainset[0]

[['EU', 'I-ORG'],
 ['rejects', 'O'],
 ['German', 'I-MISC'],
 ['call', 'O'],
 ['to', 'O'],
 ['boycott', 'O'],
 ['British', 'I-MISC'],
 ['lamb', 'O'],
 ['.', 'O']]

In [21]:
sent_eg = [] # array to store sentences with 2 consecutive tags

for sentence in trainset:
    count = 0 # count of consecutive types
    count_buffer = 0 # buffer to store running count of identical types
    current_tag = None 

    for set in sentence: # iterate through each word in sentence 
        if set[1] == 'O': # if word is not a named entity
            current_tag = None
            continue

        elif current_tag == None: # if this is the first named entity after a non-named entity or start of sentence
            current_tag = set[1]

        elif (set[1][2:] == current_tag[2:]) and count_buffer == 0: # if this is the first word that follows the same entity as the previous word
            count += 1
            count_buffer += 1

        elif set[1][2:] != current_tag[2:]: # if this is a different entity from the previous word
            current_tag = set[1]
            count_buffer = 0

        else: # if this follows the same entity as the previous word but is not the first word to do so
            continue

    if count == 2: # if there are 2 consecutive named entities, add the sentence to the array
        sent_eg.append(sentence)

In [29]:
# get random sentence with 2 consecutive named entities
print(sent_eg[0])
# print just the words 
print([word[0] for word in sent_eg[0]])

[['9/16', 'O'], ['-', 'O'], ['Luo', 'I-PER'], ['Yigang', 'I-PER'], ['(', 'O'], ['China', 'I-LOC'], [')', 'O'], ['beat', 'O'], ['Hwang', 'I-PER'], ['Sun-ho', 'I-MISC'], ['(', 'O'], ['South', 'I-LOC'], ['Korea', 'I-LOC'], [')', 'O'], ['15-3', 'O']]
['9/16', '-', 'Luo', 'Yigang', '(', 'China', ')', 'beat', 'Hwang', 'Sun-ho', '(', 'South', 'Korea', ')', '15-3']
