In [None]:
#!pip install -r requirements.txt
import gensim.downloader
import os
import wget

# Switch the working directory to the parent folder so that the relative
# paths used below (such as "data/...") resolve from there.
# NOTE: re-running this cell moves up one more level each time.
os.chdir(os.pardir)

In [None]:
# Question 1.1 Download

# gensim stores the model under "~/gensim-data" unless the GENSIM_DATA_DIR
# environment variable points somewhere else; the download is about 1.6 GB.
W2V_MODEL_NAME = 'word2vec-google-news-300'
w2v = gensim.downloader.load(W2V_MODEL_NAME)

In [None]:
# The model holds about 3 million words
vocab_size = len(w2v)
vocab_size, type(w2v)

In [None]:
# Every vector has 300 components; show the first ten of one example word
compute_vec = w2v["compute"]
print(compute_vec[:10])
compute_vec.shape

In [None]:
# Question 1.2 Download

# Base URL of the raw CoNLL files (note that it already ends with "/").
conll_raw_url = "https://raw.githubusercontent.com/TheAnig/NER-LSTM-CNN-Pytorch/master/data/"
filenames = ["eng.train", "eng.testa", "eng.testb"]

# (filename, url) pairs. A list keeps the download order deterministic, and
# stripping the trailing slash before joining avoids a "//" in the URL.
urls = [(f, f"{conll_raw_url.rstrip('/')}/{f}") for f in filenames]

# Make sure the target directory exists before anything is saved into it.
os.makedirs("data", exist_ok=True)

for fn, url in urls:
    save_path = f"data/{fn}"

    # Skip files fetched by an earlier run so the cell can be re-executed cheaply.
    if os.path.exists(save_path):
        print(f"{fn} already exists. Skipping")
        continue
    wget.download(url, save_path)

# Question 1.1

The most similar word to each query word, with its similarity score:

    (a) student:  students, 0.729
    (b) Apple:  Apple_AAPL, 0.746
    (c) apple:  apples, 0.720

In [None]:
# (a) “student”; (b) “Apple”; (c) “apple”
words = ["student", "Apple", "apple"]

# most_similar yields (word, score) pairs; only the first pair is reported
for query in words:
    nearest, similarity = w2v.most_similar(query)[0]
    print(f"{query}:  {nearest}, {similarity:.3f}")

# Question 1.2

    (a1) training set has 14987 sentences
    (a2) development set has 3466 sentences
    (a3) testing set has 3684 sentences
    (a4) train tags      : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}
    (a5) development tags: {'O', 'B-MISC', 'I-MISC', 'I-LOC', 'I-PER', 'I-ORG'}
    (a6) testing tags    : {'O', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'B-ORG', 'I-PER', 'I-ORG'}

In [None]:
# Each data line holds one token as whitespace-separated columns.
# Column 0 is the word itself; TAGGING_INDEX (0-based) is the column holding
# the tag that we keep for every word.
TAGGING_INDEX = 3


def process_sets(filepath):
    """Parse a CoNLL-style file into a list of tagged sentences.

    Every non-blank line is split on whitespace, and the word (column 0) and
    the tag (column ``TAGGING_INDEX``) are kept. A blank line ends the current
    sentence. Lines whose first column is ``-DOCSTART-`` are skipped.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of sentences, each a list of ``[word, tag]`` pairs (sentences
        x words x (word_value, word_category)). Empty sentences are never
        emitted, so ``-DOCSTART-`` markers, consecutive blank lines and a
        trailing blank line do not inflate the sentence count.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a token line has fewer than ``TAGGING_INDEX + 1``
            columns.
    """
    sentences, current = [], []

    # The context manager closes the file even if parsing fails midway.
    with open(filepath) as raw:
        for line in raw:
            # split() drops every kind of surrounding whitespace, so "\r\n"
            # endings and a final line without a newline are handled the same.
            columns = line.split()

            if not columns:
                # Blank line: sentence boundary. Only flush a non-empty
                # sentence, otherwise document markers would add empty ones.
                if current:
                    sentences.append(current)
                    current = []
                continue

            # Some files have these, which are used to divide documents
            if columns[0] == "-DOCSTART-":
                continue

            current.append([columns[0], columns[TAGGING_INDEX]])

    # Flush the last sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return sentences

# Parse the three splits; "eng.testa" is the development (aka validation) set
trainset, devset, testset = (
    process_sets(f"data/{name}") for name in ("eng.train", "eng.testa", "eng.testb")
)

# Number of sentences in each split
tuple(len(split) for split in (trainset, devset, testset))

In [None]:
def _tag_set(dataset):
    """Return the distinct tags (second item of every word pair) in a dataset."""
    return {word[1] for sentence in dataset for word in sentence}


train_tags = _tag_set(trainset)
dev_tags = _tag_set(devset)
test_tags = _tag_set(testset)

print(f"train tags: {train_tags}")
print(f"development tags: {dev_tags}")
print(f"testing tags: {test_tags}")