In [1]:
# Mount Google Drive under ./gdrive; a later cell copies the dataset from
# ./gdrive/My Drive/... into the working directory.
from google.colab import drive

drive.mount('./gdrive')

Drive already mounted at ./gdrive; to attempt to forcibly remount, call drive.mount("./gdrive", force_remount=True).


In [None]:
! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip ngrok-stable-linux-amd64.zip

! cp ./gdrive/My\ Drive/IU/AML/Labs/Lab4/wikipedia_sample_tiny.txt .

--2019-02-22 18:30:40--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.203.53.176, 52.207.111.186, 52.201.75.180, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.203.53.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.1’


2019-02-22 18:30:41 (9.58 MB/s) - ‘ngrok-stable-linux-amd64.zip.1’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Hyperparameters shared by the cells below.
N = 5000  # passed as Loader(prune=N): number of most frequent words kept
CSIZE = 4  # context_size for next_batch: words taken on each side of the central word
LR = 0.001  # learning rate given to the LazyAdam optimizer
EPOCHS = 5  # number of passes over the contexts in Word2Vec.train
NEGATIVES = 5  # k for next_batch: negative samples drawn per central word
EMBEDDING_DIM = 50  # number of columns of the W1 / W2 embedding matrices
CONTEXTS_BATCH = 20  # n_contexts for next_batch; batch size ~= n_contexts * (2 * CSIZE + NEGATIVES)
BUFFER_LENGTH = 50000  # number of words pre-sampled per negative-sampling buffer (redefined in the Data Loader cell)

# Vocabulary

In [None]:
import numpy as np

from collections import Counter


class Voc:
    """Vocabulary built incrementally from space-separated sentences.

    State created by ``_initialize_data``:

    * ``text``         -- every token added with ``text=True``, in input order.
    * ``text_ids``     -- initialised empty; not populated by this class.
    * ``num_words``    -- number of distinct words currently indexed.
    * ``word2count``   -- ``Counter`` mapping word -> number of occurrences.
    * ``word2index``   -- word -> integer id, assigned in first-seen order.
    * ``word2discard`` -- initialised empty; not populated by this class.
    """

    def __init__(self):
        self._initialize_data()

    def _initialize_data(self, text=True):
        """Reset the vocabulary tables.

        :param text: when True also clear the stored token stream; ``prune``
                     passes False so the corpus text survives a rebuild.
        """
        if text:
            self.text = []
            self.text_ids = []
        self.num_words = 0
        self.word2count = Counter()
        self.word2index = {}
        self.word2discard = {}

    def add_sentence(self, sentence):
        """Register every space-separated token of ``sentence``.

        Empty tokens (produced by blank lines and by leading, trailing or
        repeated spaces) are skipped so that ``''`` never becomes a word.
        """
        for word in sentence.split(' '):
            if word:
                self.add_word(word)

    def add_word(self, word, count=1, text=True):
        """Count ``word`` and give it an index the first time it is seen.

        :param word: token to register
        :param count: number of occurrences to add to ``word2count``
        :param text: when True the token is also appended to ``self.text``
        """
        if text:
            self.text.append(word)
        self.word2count[word] += count
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.num_words += 1

    def prune(self, nth_keep):
        """Keep only the ``nth_keep`` most frequent words.

        The tables are rebuilt from scratch, so the surviving words are
        re-indexed in descending frequency order. ``self.text`` is untouched.
        """
        keep_words = self.word2count.most_common(nth_keep)
        self._initialize_data(text=False)
        for word, count in keep_words:
            self.add_word(word, count, text=False)

    def store_words_freqs(self):
        """Cache words and their counts (most frequent first) as parallel sequences.

        Sets ``self.words`` (tuple), ``self.frequencies`` (numpy array) and
        ``self.frequencies_sum``. An empty vocabulary yields empty sequences
        and a sum of 0 instead of failing on the ``zip`` unpacking.
        """
        voc_freqs = self.word2count.most_common()
        if voc_freqs:
            self.words, frequencies = zip(*voc_freqs)
        else:
            self.words, frequencies = (), ()
        self.frequencies = np.array(frequencies)
        self.frequencies_sum = self.frequencies.sum()

    def store_central_context(self):
        """Build ``self.context``: one ``(text_position, word, word_id)`` tuple
        for every token of ``self.text`` that is still in the vocabulary."""
        self.context = [(i, word, self.word2index[word])
                        for i, word in enumerate(self.text)
                        if word in self.word2index]

# Data Loader

In [None]:
import re
import csv
import logging
import unicodedata
import numpy as np


# Number of words drawn per call to Loader._generate_random_buffer.
# NOTE(review): the hyperparameter cell near the top of the notebook assigns the same name.
BUFFER_LENGTH = 50000


class Loader:
    """Reads a text corpus into a ``Voc`` and serves skip-gram training batches
    made of positive (context) and negative (randomly sampled) word pairs."""

    def __init__(self, datafile, preprocess=False, prune=0):
        """
        Loads data into a VOC object

        :param datafile: Path to the dataset file
        :param preprocess: Apply normalization, ascii... to the data
        :param prune: Number of most frequent words to keep (0 keeps all)
        """
        self.datafile = datafile
        self.preprocess = preprocess
        self.prune = prune

        # Create VOC from data
        self.voc = self._load_data()

        # Initialize negative samples random buffer
        self.random_buffer = self._generate_random_buffer(first=True)
        self.index_buffer = 0

    # Batching Methods

    def next_batch(self, n_contexts, context_size, k):
        """Generate training batches over every central word of the corpus.

        :param n_contexts: number of central words targeted per batch
        :param context_size: words taken on each side of the central word
        :param k: negative samples requested per central word
        :return: generator of ``(contexts_consumed, central_ids, sample_ids,
                 labels)``; labels are 1.0 for context words and 0.0 for
                 negative samples. A batch is emitted once it holds at least
                 ``n_contexts * (2 * context_size + k)`` pairs; a final partial
                 batch is emitted only when it is non-empty.
        """
        batch_size = n_contexts * (2 * context_size + k)
        logging.debug('Generating epoch batches (batch_size ~= {})...'.format(batch_size))

        x_central_batch, x_samples_batch, y_batch = [], [], []
        i = 0
        for i, (word_pos_text, word, word_id) in enumerate(self.voc.context, start=1):
            # Positive samples: words inside the window around the central word
            positives = self._get_context_words(word_pos_text, context_size)
            # Negative samples: words drawn from the pre-sampled random buffer
            negatives = self._get_negative_samples(k, word)

            x_central_batch.extend([word_id] * (len(positives) + len(negatives)))
            x_samples_batch.extend(positives)
            x_samples_batch.extend(negatives)
            y_batch.extend([1.0] * len(positives))
            y_batch.extend([0.0] * len(negatives))

            # Yield batches when size >= BATCH_SIZE
            if len(y_batch) >= batch_size:
                yield i, x_central_batch, x_samples_batch, y_batch
                x_central_batch, x_samples_batch, y_batch = [], [], []

        # Only flush a remainder that actually holds samples; an empty batch
        # would make the consumer compute a mean over zero elements.
        if y_batch:
            yield i, x_central_batch, x_samples_batch, y_batch

    def _get_context_words(self, pos, cs):
        """Return the ids of the in-vocabulary words located at most ``cs``
        positions before or after text position ``pos`` (``pos`` excluded)."""
        begin = max(0, pos - cs)
        end = min(pos + cs + 1, len(self.voc.text))
        window = self.voc.text[begin:pos] + self.voc.text[pos + 1:end]
        return [self.voc.word2index[cw] for cw in window if cw in self.voc.word2index]

    def _get_negative_samples(self, k, c):
        """Take the next ``k`` words from the random buffer as negative samples.

        :param k: number of words to consume from the buffer
        :param c: central word; samples equal to it are dropped, so fewer than
                  ``k`` ids may be returned
        :return: list of word ids
        """
        if self.index_buffer + k > len(self.random_buffer):
            # Replace the exhausted buffer. Extending it instead would grow it
            # on every refill while, with the index reset to 0, the same
            # leading samples would be replayed forever.
            self.random_buffer = self._generate_random_buffer()
            self.index_buffer = 0

        k_negative_samples = self.random_buffer[self.index_buffer:self.index_buffer + k]
        self.index_buffer += k

        return [self.voc.word2index[nw] for nw in k_negative_samples if nw != c]

    def _generate_random_buffer(self, first=False):
        """Sample ``BUFFER_LENGTH`` words from the smoothed unigram distribution.

        :param first: when True, (re)compute the sampling weights: unigram
                      probabilities raised to the power 0.75, renormalised
        :return: list of sampled words
        """
        if first:
            unigram = self.voc.frequencies / self.voc.frequencies_sum
            modified_unigram = np.power(unigram, 0.75)
            self.modified_unigram_weighs = modified_unigram / modified_unigram.sum()
        return np.random.choice(self.voc.words, BUFFER_LENGTH, p=self.modified_unigram_weighs).tolist()

    # MetaData Methods

    def generate_metadata_projector(self):
        """Write ``metadata.tsv`` with one vocabulary word per row, in index order."""
        # Written as UTF-8 to match how the corpus is read in _read_data.
        with open('metadata.tsv', mode='w', newline='', encoding='utf-8') as f:
            tsv_output = csv.writer(f, delimiter='\t')
            for word in self.voc.word2index:
                tsv_output.writerow([word])

    # Loader Methods

    def _load_data(self):
        """Populates and returns a VOC object"""
        logging.info('Start preparing training data...')
        data = self._read_data()
        voc = Voc()

        logging.info('Counting words...')
        for line in data:
            voc.add_sentence(line)

        logging.info('Counted words: {}'.format(voc.num_words))

        if self.prune > 0:
            logging.info('Pruning Vocabulary...')
            voc.prune(self.prune)
            logging.info('Counted words (after prune): {}'.format(voc.num_words))

        logging.info('Storing words frequencies...')
        voc.store_words_freqs()

        logging.info('Storing word contexts...')
        voc.store_central_context()

        return voc

    def _read_data(self):
        """Reads the data file and returns its lines (normalized when
        ``self.preprocess`` is set)"""
        with open(self.datafile, mode='r', encoding='utf-8') as f:
            lines = f.read().strip().split('\n')
        if self.preprocess:
            lines = [Loader._normalize_string(s) for s in lines]
        return lines

    @staticmethod
    def _normalize_string(s):
        """Lowercase, trim, and remove non-letter characters"""
        s = Loader._unicode_to_ascii(s.lower().strip())
        # Detach sentence punctuation so it becomes its own token
        s = re.sub(r'([.!?])', r' \1', s)
        # Anything that is not a letter or . ! ? collapses to a single space
        s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
        s = re.sub(r'\s+', r' ', s).strip()
        return s

    @staticmethod
    def _unicode_to_ascii(s):
        """Strip combining marks (category 'Mn') after NFD decomposition"""
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

# Word2Vec

In [None]:
import os
import shutil
import logging
import tensorflow as tf

class Word2Vec:
    """Word2Vec model (Skip-gram)"""

    def __init__(self, data):
        """
        :param data: loader object exposing ``voc`` (vocabulary) and
                     ``next_batch`` (batch generator)
        """
        self.data = data
        self.voc_size = data.voc.num_words

        # Initialize graph
        tf.reset_default_graph()
        self.build_graph()

    def train(self):
        """Train the model with the data and specified Hyperparameters.

        Every 1000 optimisation steps the averaged embedding is refreshed, and
        a loss summary plus a checkpoint are written under ``./tf_summary``.
        """
        sess = tf.Session()
        writer = None
        try:
            saver = tf.train.Saver()

            if os.path.exists('./tf_summary'):
                logging.debug('Removing old CG visualizations and checkpoints...')
                shutil.rmtree('./tf_summary', ignore_errors=True)

            writer = tf.summary.FileWriter('./tf_summary', graph=sess.graph)

            logging.info('Training started!')

            sess.run(tf.global_variables_initializer())

            # next_batch reports progress as a count of processed contexts, so
            # the progress bar target is expressed in the same unit.
            total_contexts = len(self.data.voc.context)

            # Stays None until the first logging step so that short runs
            # (fewer than 1000 steps) do not hit an unbound name below.
            loss = None
            step = 0
            global_step = 0
            for epoch in range(1, EPOCHS + 1):
                print('Epoch {}'.format(epoch))
                progress = tf.keras.utils.Progbar(target=total_contexts,
                                                  stateful_metrics=['batch loss'],
                                                  width=40, interval=10)

                batches = self.data.next_batch(CONTEXTS_BATCH, CSIZE, NEGATIVES)
                for last_word_idx, i_central, i_samples, targets in batches:
                    if not targets:
                        # Nothing to learn from; the mean loss would be NaN
                        continue

                    step += 1
                    d = {self.x_central: i_central,
                         self.x_samples: i_samples,
                         self.y: targets}

                    sess.run(self.optimize, feed_dict=d)

                    if step % 1000 == 0:
                        # Copy the current averaged weights into the Embedding
                        # variable before it gets checkpointed.
                        _, loss, summ = sess.run(
                            [self.refresh_embedding, self.loss, self.summary], feed_dict=d)
                        progress.update(last_word_idx, values=[
                            ('batch loss', loss), ('epoch loss', loss)])

                        logging.debug('Saving Summary {}'.format(global_step))
                        writer.add_summary(summ, global_step=global_step)

                        logging.debug('Saving Checkpoint {}'.format(global_step))
                        saver.save(sess, 'tf_summary/model.ckpt', global_step=global_step)

                        global_step += 1

                final_values = None if loss is None else [
                    ('batch loss', loss), ('epoch loss', loss)]
                progress.update(total_contexts, values=final_values)
        finally:
            # Release the event file and the session even if training fails
            if writer is not None:
                writer.close()
            sess.close()

    def build_graph(self):
        """Build the TensorFlow Computational Graph.

        Sets ``x_central``, ``x_samples``, ``y`` (placeholders), ``loss``,
        ``optimize``, ``embedding``, ``refresh_embedding`` and ``summary`` on
        the instance; returns nothing.
        """
        # Placeholders
        self.x_central = tf.placeholder(tf.int32, shape=[None], name='X_central')
        self.x_samples = tf.placeholder(tf.int32, shape=[None], name='X_samples')
        self.y = tf.placeholder(tf.float32, shape=[None], name='Labels')

        # Weight Matrices
        # NOTE(review): `name` is given to the initializer op, not to tf.Variable,
        # so the variables keep default names; left as-is to preserve checkpoint keys.
        W1 = tf.Variable(tf.random_uniform(shape=(self.voc_size, EMBEDDING_DIM),
                                           minval=-1, maxval=1, name='W_in'))
        W2 = tf.Variable(tf.random_uniform(shape=(self.voc_size, EMBEDDING_DIM),
                                           minval=-1, maxval=1, name='W_out'))

        # Lookups
        central_lookup = tf.nn.embedding_lookup(W1, self.x_central)
        samples_lookup = tf.nn.embedding_lookup(W2, self.x_samples)

        # Row-wise dot product between central and sample vectors
        mult_lookups = tf.multiply(central_lookup, samples_lookup)
        sum_lookups = tf.reduce_sum(mult_lookups, axis=1)

        #  Loss and Optimization

        sigmoid_ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y,
                                                             logits=sum_lookups)
        self.loss = tf.reduce_mean(sigmoid_ce)
        tf.summary.scalar('loss', self.loss)

        optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=LR)
        self.optimize = optimizer.minimize(self.loss)

        # Final matrix
        embedding_matrix = tf.nn.l2_normalize(0.5 * (W1 + W2), axis=1)
        # A Variable only takes its initial value from `embedding_matrix`; it
        # does not follow W1/W2 afterwards. `refresh_embedding` must be run to
        # copy the trained weights in before saving a checkpoint.
        self.embedding = tf.Variable(embedding_matrix, trainable=False, name='Embedding')
        self.refresh_embedding = tf.assign(self.embedding, embedding_matrix)

        self.summary = tf.summary.merge_all()
        logging.info('CG built...')

# Main

In [None]:
# Show INFO-level messages from the root logger; logging.debug calls in the
# classes above stay hidden at this level.
logging.getLogger().setLevel(logging.INFO)    

In [None]:
# Start TensorBoard in the background on port 6006, reading the directory that
# Word2Vec.train writes its summaries and checkpoints to.
LOG_DIR = './tf_summary'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR)
)

# Open an ngrok HTTP tunnel to the TensorBoard port, also in the background.
get_ipython().system_raw('./ngrok http 6006 &')

# Query ngrok's local API (port 4040) and print the first tunnel's public URL.
# NOTE(review): ngrok was launched asynchronously just above; this may need a
# re-run if the tunnel is not up yet -- confirm.
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

In [None]:
# Build the vocabulary from the sample corpus, keeping the N most frequent
# words, then write metadata.tsv (one vocabulary word per row).
data = Loader('wikipedia_sample_tiny.txt', prune=N)
data.generate_metadata_projector()

In [None]:
# Build the skip-gram graph for the loaded vocabulary and train it for EPOCHS epochs.
model = Word2Vec(data)
model.train()