In [1]:
import tqdm

In [2]:
import numpy as np
import unicodedata
import os
import pickle

In [3]:
import gensim
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


In [4]:
import unicodedata
import string
import re
import random
import time
import math
from konlpy.tag import  Kkma

In [5]:
# Vocabulary indices reserved for the sentence markers; Lang.index2word
# pre-populates 0 -> "SOS" and 1 -> "EOS" to match these values.
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary for one language: word/index lookups plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first word that gets registered receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # The two marker entries above are already part of the vocabulary.
        self.n_words = len(self.index2word)

    def index_words(self, sentence):
        """Register every token obtained by splitting ``sentence`` on single spaces."""
        for token in sentence.split(' '):
            self.index_word(token)

    def index_word(self, word):
        """Count one occurrence of ``word``; assign the next free index if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [6]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return ``s`` in NFD form with all nonspacing marks (category 'Mn') removed.

    Decomposing first splits an accented letter into its base letter plus
    combining marks, so dropping the marks leaves the bare base letter.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Trim, strip combining accents, space out . ! ?, and replace other non-letter characters with a space (case is kept)
def normalize_string(s):
    """Clean one sentence for whitespace tokenization.

    The string is stripped of surrounding whitespace and passed through
    ``unicode_to_ascii``; then each of ``.``, ``!`` and ``?`` gets a space
    inserted before it, and every run of characters other than ASCII letters
    and those three marks is replaced by a single space. Letter case is
    left unchanged.
    """
    text = unicode_to_ascii(s.strip())
    # Separate sentence punctuation from the preceding word.
    text = re.sub(r"([.!?])", r" \1", text)
    # Everything that is not a letter or one of . ! ? becomes one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", text)

In [7]:
def read_langs(lang1, lang2, reverse=False):
    """Read sentence pairs from ``./data/<lang1>-<lang2>.txt``.

    Each line of the file is split on tabs and every field is passed through
    ``normalize_string``.

    Args:
        lang1: Name of the first language in the file name / first column.
        lang2: Name of the second language in the file name / second column.
        reverse: When true, the fields of every line are reversed and the
            roles of the two languages are swapped.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects
        are freshly created (no words indexed yet) and ``pairs`` is a list of
        lists of normalized strings.
    """
    print("Reading lines...")

    # Read the file and split into lines. The context manager closes the
    # handle, and the explicit encoding keeps accented text from being
    # decoded with a platform-dependent default.
    with open('./data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [8]:
# Bounds on the number of space-separated tokens per sentence: filter_pair
# keeps a pair only when MIN_LENGTH <= count < MAX_LENGTH holds for both sides.
MIN_LENGTH = 0
MAX_LENGTH = 50

def filter_pair(p):
    """Return True when both sentences of pair ``p`` have an acceptable length.

    Length is the number of pieces produced by splitting on single spaces and
    must satisfy ``MIN_LENGTH <= length < MAX_LENGTH``.
    """
    source_len = len(p[0].split(' '))
    if not (MIN_LENGTH <= source_len < MAX_LENGTH):
        return False

    target_len = len(p[1].split(' '))
    return MIN_LENGTH <= target_len < MAX_LENGTH

def filter_pairs(pairs):
    """Return a new list holding only the pairs accepted by ``filter_pair``."""
    return list(filter(filter_pair, pairs))

In [9]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    """Load, length-filter and index a parallel corpus.

    Reads the pairs with ``read_langs``, drops those rejected by
    ``filter_pairs``, then registers the first sentence of every remaining
    pair in the input vocabulary and the second in the output vocabulary.

    Returns:
        ``(input_lang, output_lang, pairs)`` with both vocabularies populated
        and ``pairs`` being the filtered list.
    """
    input_lang, output_lang, all_pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filter_pairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Indexing words...")
    for kept in kept_pairs:
        input_lang.index_words(kept[0])
        output_lang.index_words(kept[1])

    return input_lang, output_lang, kept_pairs

# reverse=True swaps every pair, so input_lang is built for 'fra' and
# output_lang for 'eng' (each pair becomes [French, English]).
input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)

# Print an example pair
print(random.choice(pairs))

Reading lines...
Read 142224 sentence pairs
Trimmed to 142219 sentence pairs
Indexing words...
['Tom ne savait pas ou Marie etait allee skier .', 'Tom didn t know where Mary had gone skiing .']


In [10]:
# Second element of every pair; with the reversed pairs loaded above this is the English side.
eng = [x[1] for x in pairs]

In [11]:
# Dump the English sentences, one per line.
with open('./nmt_data/mono.en', 'w') as f:
    for sentence in eng:
        f.write(sentence + '\n')

In [12]:
def create_avg_embeddings(word2vec_name='ko_vec', fpath_L='./data/raw/ko.train', fpath_U='./data/raw/ko.train.mono'):
    """Compute averaged word-vector sentence embeddings and pickle them.

    Every line of ``fpath_L`` (labeled) and ``fpath_U`` (unlabeled) is
    tokenized, each token is looked up in the word2vec model, and the
    sentence embedding is the mean of those vectors. Tokens missing from the
    model are represented by a zero vector: they add nothing to the sum but
    still count in the divisor. A sentence that yields no tokens gets an
    all-zero row (in both the labeled and the unlabeled matrix).

    Args:
        word2vec_name: Base name of the binary word2vec file loaded from
            ``../funnies/graph2graph/data/utils/<word2vec_name>.bin``.
            ``'ko_vec'`` selects Kkma tokenization; any other value uses
            ``word_tokenize``.
        fpath_L: Path of the labeled sentences, one sentence per line.
        fpath_U: Path of the unlabeled sentences, one sentence per line.

    Side effects:
        Writes ``./data/graph/labeled.pickle`` and
        ``./data/graph/unlabeled.pickle`` (creating the directory if needed),
        each holding a float32 array with one row per sentence and
        ``model.vector_size`` columns. Prints a running sentence counter.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format('../funnies/graph2graph/data/utils/%s.bin' % word2vec_name, binary=True)

    use_kkma = word2vec_name == 'ko_vec'
    # Only construct the Korean analyzer when it is actually going to be used.
    kkma = Kkma() if use_kkma else None

    # the elements of both matrices below constitute the nodes of our graph
    with open(fpath_L, 'r', encoding='utf-8') as f, open(fpath_U, 'r', encoding='utf-8') as g:
        # rstrip('\n') removes only the line terminator, so a final line
        # without a trailing newline keeps its last character.
        ss_L = [line.rstrip('\n') for line in f]
        ss_U = [line.rstrip('\n') for line in g]

    # matrix of labeled embeddings
    L = np.zeros((len(ss_L), model.vector_size), dtype='float32')

    # matrix of unlabeled embeddings
    U = np.zeros((len(ss_U), model.vector_size), dtype='float32')

    def word2vec(w):
        """
        Return the model vector for ``w``, or a zero vector when ``w`` is not
        in the model (a stopword, punctuation, some odd symbol...). This lets
        the text be embedded without normalizing it first.
        """
        try:
            return model.word_vec(w)
        except KeyError:
            # Only the out-of-vocabulary case is silenced; any other error
            # (or a keyboard interrupt) propagates to the caller.
            return np.zeros(model.vector_size)

    def tokenize(s):
        """Split a sentence into tokens with the tokenizer matching the model."""
        if use_kkma:
            return [x for x, _ in kkma.pos(s)]
        return word_tokenize(s)

    def sentence_embedding(s):
        """Average the word vectors of ``s``; zeros when there are no tokens."""
        # embedding for a sentence is calculated as average of the embeddings of all words
        # this is not ideal but is shown to work reasonably well in literature
        # if you need something a bit more sophisticated, look into Doc2Vec algorithms
        vectors = [word2vec(w) for w in tokenize(s)]
        if not vectors:
            # np.mean of an empty list would produce NaN.
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    def embed_into(sentences, matrix):
        """Fill ``matrix`` row by row with the embeddings of ``sentences``."""
        for idx, s in enumerate(sentences):
            matrix[idx] = sentence_embedding(s)
            print(str(idx), end='\r')

    # Make sure the output directory exists before the long computation starts.
    os.makedirs('./data/graph', exist_ok=True)

    embed_into(ss_L, L)
    print()

    # The labeled matrix is saved before the unlabeled pass begins.
    with open('./data/graph/labeled.pickle', 'wb') as f:
        pickle.dump(L, f)

    embed_into(ss_U, U)

    with open('./data/graph/unlabeled.pickle', 'wb') as f:
        pickle.dump(U, f)

In [None]:
# Embed with the GoogleNews vectors: train.en is passed as the labeled file, mono.en as the unlabeled one.
create_avg_embeddings('GoogleNews-vectors-negative300', './nmt_data/train.en', './nmt_data/mono.en')

131140