In [1]:
import os
from collections import Counter

In [2]:
from collections import defaultdict
def voc_count(corpus):
    d = defaultdict(int)
    for p in corpus:
        for sent in p:
            for t in sent:
                d[t] += 1
    return d

In [3]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import string

#load list of stop words
with open('./snowball_stopwords.txt','rb') as sw:
    stop_words = [line.strip() for line in sw]

#load punctuations 
punctuations = string.punctuation

#extra characters
extra = []

def pre_process_par(par):
    """
    input: 
       a paragraph
    output:
       list of sentences. Each sentence is a list of tokens.
    """
    output = []
    # make the par lowecase
    par = par.lower()
    
    # split sentences
    sent_par = sent_tokenize(par)
    
    # tokenize and clean all sentences
    for sent in sent_par:
        
        #tokenize each sentence
        tokens = word_tokenize(sent)
        
        # remove repetitve words in a sentenece
        tokens = list(set(tokens))
        
        #remove stop words and clean texts
        tokens = [tok for tok in tokens if 
                                          (tok not in stop_words) and 
                                          (tok not in punctuations) and 
                                          (tok not in extra)]
        
        # put it in the output
        output.append(tokens)
    return output

In [4]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
rng = np.random.RandomState(seed=1)

cn = 0
word2vec = {}
with open(w2v_path,'rb') as w2v:
    content = w2v.read().strip()
    for line in content.split('\n'):
        cn +=1
        line = line.strip().split()
        v = line[0]
        
        vector = line[1:]
        vector = np.matrix(vector,dtype='float32')
        
        word2vec[v] = vector

In [31]:
def overlap(ds, word2vec):
    voc_in_word2vec = set(word2vec.keys())
    num_voc_in_word2vec = len(voc_in_word2vec)
    print "num_voc_in_word2vec: %d"%num_voc_in_word2vec

    voc_in_ds = []
    for par in ds:
        for sent in par:
            voc_in_ds += sent
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    print "remove duplicates ... "
    voc_in_ds = set(voc_in_ds)
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    intersection = set(voc_in_word2vec).intersection(voc_in_ds)
    count_overlap = len(intersection)
    print "count_overlap: %d"%count_overlap
    
    precent_overlap = 100*(count_overlap / float(num_voc_in_ds))
    print "precent_overlap: %.2f%%"%precent_overlap

In [32]:
def statistics_ds(ds):
    print "data size size (# paragraphs): %d"%len(ds)
    
    par_lens = [len(par) for par in ds]
    avg_par_len = np.average(par_lens)
    print "avg_par_len: %.2f"%avg_par_len
    
    min_par_len = np.min(par_lens)
    print "min_par_len: %2.f"%min_par_len
    
    max_par_len = np.max(par_lens)
    print "max_par_len: %2.f"%max_par_len

    std_par_len = np.std(par_lens)
    print "std_par_len: %2.f"%std_par_len


    
    sent_lens = [len(sent) for sent in par for par in ds]
    
    avg_sent_len = np.average(sent_lens)
    print "avg_sent_len: %.2f"%avg_sent_len
    
    min_sent_len = np.min(sent_lens)
    print "min_sent_len: %2.f"%min_sent_len

    max_sent_len = np.max(sent_lens)
    print "max_sent_len: %2.f"%max_sent_len

    std_sent_len = np.std(sent_lens)
    print "std_sent_len: %2.f"%std_sent_len
    


In [6]:
import codecs
with codecs.open('./Hansard/hansard.en.original.10K.out', 'rb','utf8') as orig:
    orig_paragraphs = []
    for line in orig:
        if line != '\n' and len(line)>0:
            orig_paragraphs.append(line.strip())
print "# paragraphs in original: %d"%len(orig_paragraphs)

# paragraphs in original: 5000


In [7]:
from joblib import Parallel, delayed
original_pars = []
original_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), orig_paragraphs))

[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1488 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 3648 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   50.5s finished


In [8]:
statistics_ds(original_pars)

data size size (# paragraphs): 5000
avg_par_len: 67.95
min_par_len: 13
max_par_len: 703
std_par_len: 38
avg_sent_len: 10.14
min_sent_len:  1
max_sent_len: 30
std_sent_len:  6


In [9]:
with codecs.open('./Hansard/hansard.en.translated.from.fr.10K.out','rb','utf8') as tran:
    tran_paragraphs = []
    for line in tran:
        if line != '\n' and len(line)>0:
            tran_paragraphs.append(line.strip())

print "# paragraphs in translated: %d"%len(tran_paragraphs)

# paragraphs in translated: 5000


In [10]:
from joblib import Parallel, delayed
translated_pars = []
translated_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), tran_paragraphs))

[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 4464 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   48.8s finished


In [11]:
statistics_ds(translated_pars)

data size size (# paragraphs): 5000
avg_par_len: 69.86
min_par_len:  8
max_par_len: 334
std_par_len: 37
avg_sent_len: 9.79
min_sent_len:  1
max_sent_len: 32
std_sent_len:  6


In [33]:
overlap(original_pars,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 3205261
remove duplicates ... 
num_voc_in_ds: 43142
count_overlap: 38897
precent_overlap: 90.16%


In [34]:
overlap(translated_pars,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 3359961
remove duplicates ... 
num_voc_in_ds: 42795
count_overlap: 36754
precent_overlap: 85.88%
