In [2]:
import logging
import os

from tqdm import tqdm

In [1]:
import sys
# Extend the module search path with two directories relative to the notebook's
# working directory. Presumably this is where the project-local `saver` and
# `data_utils_conv` modules imported below live -- confirm against the repo layout.
sys.path.append("../../imports/")
sys.path.append("../")
import saver as sv
import data_utils_conv as du

In [None]:
# Emit INFO-level log records prefixed with a timestamp; the vocabulary cells
# below report their progress through `logging.info`.
logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO)
# Create the relative 'data/' directory; exist_ok=True makes this safe to re-run.
os.makedirs('data/', exist_ok=True)

In [None]:
# Load the stored sentence corpus through the project's `saver` helper.
# NOTE(review): later cells iterate it and read `sentences.wiki.token_min_len`,
# so it is presumably a gensim-style wiki corpus wrapper -- confirm.
sentences = sv.load("wiki_sentences_sp")

In [None]:
# Sanity check: show the minimum token length configured on the underlying corpus object.
print("Minimum length of token:",sentences.wiki.token_min_len)

In [None]:
# Peek at the corpus: print the first 15 items of the first sentence only,
# then stop so the rest of the corpus is not consumed here.
for sentence in sentences:
    print(sentence[:15])
    break

In [None]:
from localgensim.gensim2 import utils
from collections import defaultdict 
from six import string_types,iteritems

In [None]:
# Module-level state shared with `_scan_vocab` (declared `global` there).
# raw_vocab: rebound by `_scan_vocab` to the word -> count mapping it builds.
raw_vocab = None
# max_vocab_size: when truthy, `_scan_vocab` prunes the counts whenever the number
# of distinct words exceeds it; None (falsy) means pruning never triggers.
max_vocab_size = None

def _scan_vocab(sentences, progress_per, trim_rule):
    """Count word occurrences over `sentences` and publish them in `raw_vocab`.

    Parameters
    ----------
    sentences : iterable
        Each item is iterated word by word and must support `len()`.
        A warning is logged if the first item is a plain string.
    progress_per : int
        A progress line is logged for every sentence whose index is a
        multiple of this value.
    trim_rule : object
        Passed through unchanged to `utils.prune_vocab`.

    Returns
    -------
    tuple
        `(total_words, corpus_count)`: the summed sentence lengths and the
        number of sentences seen (0 for an empty corpus).

    Side effects
    ------------
    Rebinds the module-level `raw_vocab` to the resulting count mapping.
    """
    global raw_vocab, max_vocab_size

    counts = defaultdict(int)
    n_words = 0
    prune_threshold = 1
    last_index = -1  # stays -1 when the corpus is empty, giving corpus_count == 0
    first_item_checked = False

    for last_index, tokens in enumerate(sentences):
        # Only the very first item is type-checked.
        if not first_item_checked:
            first_item_checked = True
            if isinstance(tokens, string_types):
                logging.warning(
                    "Each 'sentences' item should be a list of words (usually unicode strings). "
                    "First item here is instead plain %s.",
                    type(tokens)
                )

        # Progress is reported before the current sentence is counted.
        if last_index % progress_per == 0:
            logging.info(
                "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                last_index, n_words, len(counts)
            )

        for token in tokens:
            counts[token] += 1
        n_words += len(tokens)

        # Pruning only happens when a vocabulary size cap is configured; the
        # threshold handed to prune_vocab grows by one after every pruning pass.
        if max_vocab_size and len(counts) > max_vocab_size:
            utils.prune_vocab(counts, prune_threshold, trim_rule=trim_rule)
            prune_threshold += 1

    raw_vocab = counts
    return n_words, last_index + 1

def scan_vocab(sentences=None, progress_per=100000, trim_rule=None):
    """Run `_scan_vocab` over the corpus and log a summary of what was collected.

    Parameters
    ----------
    sentences : iterable
        Corpus forwarded to `_scan_vocab`.
    progress_per : int
        Progress-logging interval (in sentences) forwarded to `_scan_vocab`.
    trim_rule : object
        Forwarded unchanged to `_scan_vocab`.

    Returns
    -------
    tuple
        `(total_words, corpus_count)` exactly as returned by `_scan_vocab`.
    """
    logging.info("collecting all words and their counts")

    n_words, n_sentences = _scan_vocab(sentences, progress_per, trim_rule)

    # `raw_vocab` is the module-level mapping that `_scan_vocab` has just rebound.
    logging.info(
        "collected %i word types from a corpus of %i raw words and %i sentences",
        len(raw_vocab), n_words, n_sentences
    )

    return n_words, n_sentences

In [None]:
# Full pass over the corpus: fills the module-level `raw_vocab` as a side effect
# and returns the raw word total and the sentence count.
total_words, corpus_count = scan_vocab(sentences=sentences)

In [None]:
# Persist the raw counts now: the preparation cell further down rebinds
# `raw_vocab` to an empty mapping once it has finished with it.
sv.save(raw_vocab,"en_raw_vocab_gensim")

In [None]:
from localgensim.gensim2.models.keyedvectors import Vocab
from numpy import sqrt

In [None]:
# Settings read by the vocabulary-preparation cell below.
# min_count: frequency threshold handed to `utils.keep_vocab_item`; words below it are discarded.
min_count = 10
# sample: falsy disables downsampling; a value < 1.0 is treated as a proportion of the
# retained total, a value >= 1 as an absolute count-based shorthand.
sample = False
# Running counters of dropped words (total occurrences / distinct words).
# They are only initialised here, so re-running the preparation cell without
# re-running this one keeps accumulating into them.
drop_total = drop_unique = 0
# update: False builds a fresh vocabulary; True merges counts into existing `vocab` / `index2word`.
update = False
# trim_rule: passed unchanged to `utils.keep_vocab_item`.
trim_rule = None

In [None]:
# Filter the raw word counts into the final vocabulary.
#
# Reads:   raw_vocab, min_count, trim_rule, update, drop_unique, drop_total
# Writes:  vocab (word -> Vocab), index2word (index -> word), retain_words,
#          retain_total, drop_unique, drop_total
#
# The frequency threshold is defined before branching so that BOTH the fresh and
# the update path can use it (it used to exist only in the fresh branch, which
# made the update branch fail with a NameError on a fresh kernel).
effective_min_count = min_count

if not update:
    logging.info("Loading a fresh vocabulary")
    retain_total, retain_words = 0, []
    # Start from empty lookup structures; words below the threshold are discarded.
    index2word = []
    vocab = {}
    for word, v in iteritems(raw_vocab):
        if utils.keep_vocab_item(word, v, effective_min_count, trim_rule=trim_rule):
            retain_words.append(word)
            retain_total += v
            # A word's index is its position in index2word at insertion time.
            vocab[word] = Vocab(count=v, index=len(index2word))
            index2word.append(word)
        else:
            drop_unique += 1
            drop_total += v
    # max(..., 1) guards the percentage against an empty raw vocabulary.
    original_unique_total = len(retain_words) + drop_unique
    retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1)
    logging.info("effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
                 effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique)
    original_total = retain_total + drop_total
    retain_pct = retain_total * 100 / max(original_total, 1)
    logging.info("effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
                 effective_min_count, retain_total, retain_pct, original_total, drop_total)
else:
    # Update mode extends an existing vocabulary: `vocab` and `index2word` must
    # already be defined in the kernel (e.g. from a previous fresh run).
    logging.info("Updating model with new vocabulary")
    new_total = pre_exist_total = 0
    # Two distinct lists. A chained `a = b = []` would bind both names to the same
    # list object, so every word would be recorded as both new and pre-existing.
    new_words, pre_exist_words = [], []
    for word, v in iteritems(raw_vocab):
        if utils.keep_vocab_item(word, v, effective_min_count, trim_rule=trim_rule):
            if word in vocab:
                pre_exist_words.append(word)
                pre_exist_total += v
                vocab[word].count += v
            else:
                new_words.append(word)
                new_total += v
                vocab[word] = Vocab(count=v, index=len(index2word))
                index2word.append(word)
        else:
            drop_unique += 1
            drop_total += v
    original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique
    pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1)
    new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
    logging.info("New added %i unique words (%i%% of original %i) "
                 "and increased the count of %i pre-existing words (%i%% of original %i)",
                 len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words),
                 pre_exist_unique_pct, original_unique_total)
    retain_words = new_words + pre_exist_words
    retain_total = new_total + pre_exist_total

# Precalculate each vocabulary item's threshold for sampling.
# Reads retain_words / retain_total / raw_vocab / vocab / sample from the cells above
# and stores a per-word `sample_int` on every retained Vocab entry.
if not sample:
    # no words downsampled
    logging.info("NO DOWNSAMPLING")
    threshold_count = retain_total
elif sample < 1.0:
    # traditional meaning: set parameter as proportion of total
    threshold_count = sample * retain_total
else:
    # new shorthand: sample >= 1 means downsample all words with higher count than sample
    threshold_count = int(sample * (3 + sqrt(5)) / 2)

downsample_total, downsample_unique = 0, 0
for w in retain_words:
    # The count comes from raw_vocab (this scan only), not from vocab[w].count.
    v = raw_vocab[w]
    # Keep-probability for the word; values >= 1 mean the word is never dropped.
    word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v)
    if word_probability < 1.0:
        downsample_unique += 1
        downsample_total += word_probability * v
    else:
        # Clamp to 1.0 so the stored value never exceeds 2**32.
        word_probability = 1.0
        downsample_total += v
    # Store the probability scaled to an integer in [0, 2**32].
    vocab[w].sample_int = int(round(word_probability * 2**32))


# The raw counts are released here by rebinding the name to an empty mapping.
# Consequence: re-running the filtering cell after this point iterates an empty
# raw_vocab unless the scan cell is run again first.
logging.info("deleting the raw counts dictionary of %i items", len(raw_vocab))
raw_vocab = defaultdict(int)

logging.info("sample=%g downsamples %i most-common words", sample, downsample_unique)
# max(..., 1) avoids a division by zero when nothing was retained.
logging.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
             downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total)

# return from each step: words-affected, resulting-corpus-size, extra memory estimates
# (in this notebook the summary is just kept in a module-level dict)
report_values = {
    'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique,
    'downsample_total': int(downsample_total), 'num_retained_words': len(retain_words)
}

In [None]:
# Persist the filtered vocabulary (word -> Vocab) and its index -> word list.
sv.save(vocab,"en_vocab_min10_gensim")
sv.save(index2word,"en_index2word_min10_gensim")

In [None]:
# Number of distinct words kept after filtering.
len(vocab)

In [5]:
# Load the hyper-parameter dictionary from ../hp.json (path relative to the working directory).
hparams = dict()
with open ("../hp.json", "r") as jfile:
    # NOTE(review): eval() executes whatever code the file contains. If hp.json only
    # holds literals, ast.literal_eval (Python-style literals) or json.load (strict
    # JSON) would be a safer way to read it -- confirm the file's syntax before switching.
    hparams=eval(jfile.read())

In [11]:
# Compare the CoNLL dataset vocabulary against the pre-trained vocabulary and
# report how many dataset words (and singulars) the pre-training covers.

# Words known to the pre-trained vocabulary.
# NOTE(review): this loads "en_vocab_gensim", not the "en_vocab_min10_gensim" file
# written earlier in this notebook -- presumably a vocabulary built elsewhere
# without the min_count filter; confirm which one is intended.
vocab_pre = set(sv.load("en_vocab_gensim").keys())

processing_word = du.get_processing_word(lowercase=hparams['lowercase'])

dev   = du.CoNLLDataset(hparams['dev_filename'], processing_word)
test  = du.CoNLLDataset(hparams['test_filename'], processing_word)
train = du.CoNLLDataset(hparams['train_filename'], processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags, singulars = du.get_vocabs([train,test,dev])
print("Total Singulars are: " + str(len(singulars)))

in_pre = vocab_pre & vocab_words
print("Words found in pre-training are: "+str(len(in_pre)))
not_in_pre = vocab_words - in_pre
print("Words NOT found in pre-training are: "+str(len(not_in_pre)))

# Singulars that the pre-trained vocabulary does not cover. A set difference
# replaces the former copy-then-remove loop; it yields the same set and cannot
# raise KeyError if `singulars` ever contains a repeated entry.
sni = set(singulars) - in_pre
print("Singulars not in pre-training are: "+str(len(sni)))

Building vocab...
- done. 26869 tokens
Total Singulars are: 18350
Words found in pre-training are: 18094
Words NOT found in pre-training are: 8775
Singulars not in pre-training are: 6946
