# Practical 1: word2vec

Repo: https://github.com/oxford-cs-deepnlp-2017/practical-1

XML data [link](https://github.com/oxford-cs-deepnlp-2017/practical-1/blob/master/ted_en-20160408.xml)

In [43]:
# !jt -t onedork -T -N -kl -f fira
# !jt -t onedork -fs 95 -f fira -altp -tfs 11 -nfs 115 -cellw 88% -T
# !jt -t onedork -f firacode -fs 100 -nf roboto -nfs 95 -tf ptsans -tfs 105 -T -altmd -lineh 150

In [1]:
import re
import zipfile
from pathlib import Path
from random import shuffle
import lxml.etree
import urllib.request as ureq

import numpy as np

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Retrieve and/or load data

In [3]:
# Get XML data
output_freq: int = 100
def raw_hooker(chunk, max_size, total_size = -1):
    """Output streaming info about download object."""
    total_chunks: int = round((total_sz / max_sz) + 0.5)
    chunk_denom: int = (total_chunks + 1) // output_freq
    if chunk % chunk_denom == 0:
        print(chunk, max_sz, total_sz)

xml_raw_url: str = "https://github.com/oxford-cs-deepnlp-2017/practical-1/blob/master/ted_en-20160408.xml?raw=true"
res = re.search(r"^.+/(.+)\?.+$", xml_raw_url)

xml_filename: str = ""
    
if res:
    xml_filename = res.group(1)
    print(xml_filename)
    xml_local_path = Path().joinpath(xml_filename)
    if not xml_local_path.is_file():
        _fn, _hdr = ureq.urlretrieve(xml_raw_url, filename = xml_filename, reporthook=raw_hooker)
        if xml_local_path.is_file():
            print(f"Filename: {_fn}\nHeader data: {_hdr}")

ted_en-20160408.xml


In [4]:
# Retain only the subtitle test of the dataset.
xml_doc = lxml.etree.parse(xml_filename)
input_text = "\n".join(xml_doc.xpath("//content/text()"))
del xml_doc

### Preprocess data

In [5]:
idx = input_text.find("Hyowon Gweon: See this?") 
if idx:
    print(idx)
    print(input_text[idx-20:idx+150])

2017819
 baby does.
(Video) Hyowon Gweon: See this? (Ball squeaks) Did you see that? (Ball squeaks) Cool. See this one? (Ball squeaks) Wow.
Laura Schulz: Told you. (Laughs)
(Vide


In [6]:
# Remove parenthesized strings
clean_input = re.sub(r"\([^\)]*\)", "", input_text)

In [7]:
idx = clean_input.find("Hyowon Gweon: See this?") 
clean_input[idx-20:idx+150]

"hat the baby does.\n Hyowon Gweon: See this?  Did you see that?  Cool. See this one?  Wow.\nLaura Schulz: Told you. \n HG: See this one?  Hey Clara, this one's for you. You "

In [8]:
# --- Remove names of speaking characters the occur at the beginning of a line.

dialog_pattern: str = r"^([a-z ]+?:\s*)"
comp_dialog = re.compile(dialog_pattern, flags = re.I | re.M)
cleaner_input = comp_dialog.sub("", clean_input)


In [9]:
# --- Split into sentence chunks and/or just create a single string of all sentances without the fluff.

enumlist = lambda iterable: {k:v for k, v in enumerate(iterable)}

lines = re.split(r".\n+", cleaner_input)
print(f"No. lines: {len(lines):,}\n")
print("\n".join([f"{i}: {line}" for i, line in enumlist(lines[:5]).items()]))

No. lines: 48,411

0: they only do more of the same, or they only do what's new
1: To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing
2: Consider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. And what did Facit do when the electronic calculator came along? They continued doing exactly the same. In six months, they went from maximum revenue ... and they were gone. Gone
3: To me, the irony about the Facit story is hearing about the Facit engineers, who had bought cheap, small electronic calculators in Japan that they used to double-check their calculators
4: Facit did too much exploitation. But exploration can go wild, too


In [10]:
# --- Remove non-alphanumeric characters and split by whitespace.

# re.split(re.sub(r"([\W]+)", " ", lines[0]), " ")
s_dict = {}

for i, line in enumlist(lines).items():
    s_dict[i] = re.split(r"\s+", re.sub(r"([^\w]+)", " ", line.lower()))
#     sentences.append(re.split(r"\s+", re.sub(r"([^\w]+)", " ", line)))
print(f"Dictionary length: {len(s_dict):,}\n")

Dictionary length: 48,411



In [37]:
# --- Get Top 1000 words by frequency

sort_dict_by_values = lambda d, rev = False: dict(sorted(d.items(), key = lambda q: q[1], reverse=rev))

def frequencies(iterable):
    tmp_dict = dict()
    if iterable.keys():
        for k in iterable:
            for w in iterable[k]:
                if len(w) > 0:
                    if not w in tmp_dict:
                        tmp_dict[w] = 1
                    else:
                        tmp_dict[w] += 1
    return sort_dict_by_values(tmp_dict)


freq_dict = frequencies(s_dict)

# unique_counts = sorted(list(set(freq_dict.values())), reverse = True)


# unique_values = list(set(freq_dict.values()))
# value_rnk = {i:v for i, v in enumerate(sorted(unique_values, reverse=True))}
# word_rnk = {word:ct for i,(word, ct) in enumerate(freq_dict.items()) if i < TOPN}
# print(("\n".join([f"{k}: {word_rnk[k]}" for k in word_rnk if word_rnk[k] < 20]))

# print("\n".join([f"{k}: {freq_dict[k]}" for k in freq_dict if freq_dict[k] < 20]))

In [39]:

TOPN: int = 100


def top_n_words(d: dict):
    __w_dict: dict = dict(word=[], word_len=[], freq=[])
        
    ddict= dict(sorted(d.items(), key = lambda q: q[1], reverse = True))
    
    for i, (k, v) in enumerate(ddict.items(), start=1):
        if i > TOPN:
              break
        __w_dict[k] = v 
#         __w_dict["word"].append(k)
#         __w_dict["freq"].append(v)
    return __w_dict

top_n_dict = top_n_words(freq_dict)
max_width = max([len(w) for w in top_n_dict.keys()])
print(max_width)

def print_word_rank(d: dict):
    # If limit is -1, print all results.
    max_w = max([len(w) for w in d.keys()])
#     max_w += int(max_w * 0.2)

    items = [f"{i:<5}\t{k:<{max_w}}\t{v[i]}" for i, (k, v) in enumerate(d.items(), start=1)]
    print("{a:<5}\t{b:<{w}}\t{c}".format(a="Index", b="Word", c="Count", w= max_w))
    print("\n".join(items))

print_word_rank(top_n_dict)

9


IndexError: list index out of range

In [31]:
# print_top_n(word_rnk, 10)
# print("\n".join([f"{i}: {k} - {word_rnk[k]}" for k in word_rnk if word_rnk[k] < 20]))

# print_top_n(word_rnk, TOPN)
# unique_counts = sorted(list(set(freq_dict.values())), reverse = True)
# max_count = max(freq_dict.values())
# mean_count = sum(freq_dict.values()) / len(freq_dict)
# print(max_count, "\n", f"{mean_count:.3f}")

max(map(len, freq_dict.keys()))


26

In [48]:
# --- Clean up some unnecessary objects.
if clean_input:
    del clean_input
if input_text:
    del input_text