In [1]:
# Notebook to check the out-of-vocabulary rate for the 20000-word fastText vocabulary

In [2]:
# Enable the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel (mode 2 reloads all modules).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [16]:
import os
import sys
from collections import Counter
from operator import itemgetter

import numpy as np
import matplotlib.pyplot as plt

In [4]:
# The project root is the parent of the directory the notebook is run from;
# the source tree lives in its "src" sub-directory.
root = os.path.split(os.getcwd())[0]
src = root + "/src"

In [5]:
# Make both the project root and its src directory importable.
sys.path += [root, src]

In [6]:
from src import data
from src.utils import utils_io

In [9]:
# Load the pickled word -> id vocabulary mapping from the embeddings directory.
word2id = utils_io.pickle_load(
    f"{root}/embeddings/word2id_20000.p"
)

In [10]:
# Display the id the vocabulary assigns to the unknown-word token.
word2id["<unk>"]

1

In [11]:
# Build the training / validation / testing splits from the interim book text.
# NOTE(review): the positional 50 is presumably the sequence length and
# seed=50 the seed used for splitting — confirm against data.build.
training, validation, testing = data.build(
    f"{root}/data/interim/book_1.txt", 50, word2id, seed=50
)

In [27]:
# Stack the first element (the inputs) of every split into one array.
all_inputs = np.concatenate(
    [split[0] for split in (training, validation, testing)]
)

Fraction of input tokens that are unknown (`<unk>`)

In [24]:
# Total token count = size of the first axis times size of the second axis.
n_rows, n_cols = all_inputs.shape[:2]
tot_tokens = n_rows * n_cols
print(f"Total number of tokens: {tot_tokens}")

Total number of tokens: 1461450


In [25]:
# Count every input position that holds the unknown-word id.
# The id is read from the vocabulary instead of being hard-coded as 1, and the
# boolean mask is counted directly: wrapping it in np.where(mask, True, False)
# only rebuilds the same mask.
tot_unknown = np.count_nonzero(all_inputs == word2id["<unk>"])
print(f"Total number of unknown tokens {tot_unknown}")

Total number of unknown tokens 97857


In [26]:
# Share of all input tokens that are the unknown-word token.
unknown_fraction = tot_unknown / tot_tokens
print(f"Fraction of unknown {unknown_fraction}")

Fraction of unknown 0.06695884224571487


Inspect labels

In [12]:
# Stack the second element (the labels) of every split into one array.
all_labels = np.concatenate(
    [split[1] for split in (training, validation, testing)]
)

In [13]:
# Display the shape of the combined label array.
all_labels.shape

(29229,)

In [30]:
# Share of labels that are the unknown-word token.
# The id comes from the vocabulary rather than a hard-coded 1, and the boolean
# mask is counted directly instead of going through np.where(mask, True, False).
n_unknown_labels = np.count_nonzero(all_labels == word2id["<unk>"])
print(f"Fraction of unknown {n_unknown_labels / all_labels.shape[0]}")

Fraction of unknown 0.06698826507920216


Histogram of labels

In [14]:
# Tally how often each label id occurs. Counter (already imported at the top
# of the notebook) replaces the manual dict.get(...) + 1 loop; it is a dict
# subclass that keeps first-seen insertion order, so downstream .items()
# and sorting behave as before.
freq_count = Counter(all_labels)

In [24]:
# Show (label id, count) pairs, most frequent first.
sorted(freq_count.items(), key=lambda pair: pair[1], reverse=True)

[(1, 1958),
 (5, 1639),
 (3, 974),
 (2, 974),
 (7, 871),
 (9, 727),
 (11, 632),
 (23, 594),
 (163, 552),
 (674, 542),
 (8, 512),
 (54, 462),
 (34, 411),
 (10, 368),
 (26, 355),
 (15, 315),
 (22, 263),
 (88, 246),
 (572, 218),
 (31, 212),
 (147, 200),
 (19, 192),
 (45, 182),
 (21, 181),
 (63, 176),
 (51, 169),
 (18, 153),
 (74, 152),
 (61, 151),
 (42, 148),
 (38, 145),
 (220, 143),
 (78, 141),
 (33, 134),
 (456, 128),
 (57, 125),
 (98, 117),
 (17, 108),
 (48, 104),
 (228, 101),
 (93, 100),
 (84, 99),
 (46, 96),
 (66, 96),
 (50, 94),
 (91, 94),
 (105, 89),
 (72, 88),
 (201, 87),
 (70, 85),
 (62, 85),
 (2067, 83),
 (85, 83),
 (940, 83),
 (181, 83),
 (102, 80),
 (40, 80),
 (75, 79),
 (173, 77),
 (6410, 75),
 (25, 75),
 (462, 74),
 (224, 73),
 (67, 71),
 (107, 68),
 (137, 68),
 (140, 67),
 (92, 66),
 (226, 63),
 (1884, 63),
 (156, 63),
 (71, 63),
 (2411, 63),
 (86, 63),
 (58, 62),
 (450, 61),
 (100, 60),
 (17029, 59),
 (97, 58),
 (589, 58),
 (52, 57),
 (60, 57),
 (29, 57),
 (11107, 57),
 (1

In [27]:
# Invert the vocabulary so ids can be mapped back to their words.
id2word = dict(zip(word2id.values(), word2id.keys()))

In [33]:
# Translate every label id back into its word.
all_labels_words = [id2word[label_id] for label_id in all_labels]

In [34]:
# Tally how often each label word occurs. Counter (already imported at the top
# of the notebook) replaces the manual dict.get(...) + 1 loop; it is a dict
# subclass that keeps first-seen insertion order, so downstream .items()
# and sorting behave as before.
freq_count_words = Counter(all_labels_words)

In [35]:
# Show (word, count) pairs, most frequent first.
sorted(freq_count_words.items(), key=lambda pair: pair[1], reverse=True)

[('<unk>', 1958),
 ('the', 1639),
 ('<eos>', 974),
 ('<start>', 974),
 ('and', 871),
 ('to', 727),
 ('a', 632),
 ('it', 594),
 ('she', 552),
 ('i', 542),
 ('of', 512),
 ('said', 462),
 ('you', 411),
 ('in', 368),
 ('was', 355),
 ('that', 315),
 ('as', 263),
 ('her', 246),
 ('t', 218),
 ('at', 212),
 ('s', 200),
 ('on', 192),
 ('all', 182),
 ('with', 181),
 ('had', 176),
 ('but', 169),
 ('for', 153),
 ('they', 152),
 ('so', 151),
 ('be', 148),
 ('not', 145),
 ('very', 143),
 ('what', 141),
 ('this', 134),
 ('little', 128),
 ('he', 125),
 ('out', 117),
 ('is', 108),
 ('one', 104),
 ('down', 101),
 ('up', 100),
 ('there', 99),
 ('his', 96),
 ('if', 96),
 ('about', 94),
 ('then', 94),
 ('no', 89),
 ('them', 88),
 ('know', 87),
 ('like', 85),
 ('were', 85),
 ('herself', 83),
 ('would', 83),
 ('went', 83),
 ('again', 83),
 ('do', 80),
 ('have', 80),
 ('when', 79),
 ('could', 77),
 ('queen', 75),
 ('or', 75),
 ('thought', 74),
 ('off', 73),
 ('time', 71),
 ('me', 68),
 ('how', 68),
 ('see', 6