In [2]:
import os
from collections import Counter

In [3]:
from collections import defaultdict
def voc_count(corpus):
    d = defaultdict(int)
    for p in corpus:
        for sent in p:
            for t in sent:
                d[t] += 1
    return d

In [4]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import string

#load list of stop words
with open('./snowball_stopwords.txt','rb') as sw:
    stop_words = [line.strip() for line in sw]

#load punctuations 
punctuations = string.punctuation

#extra characters
extra = []

def pre_process_par(par):
    """
    input: 
       a paragraph
    output:
       list of sentences. Each sentence is a list of tokens.
    """
    output = []
    # make the par lowecase
    par = par.lower()
    
    # split sentences
    sent_par = sent_tokenize(par)
    
    # tokenize and clean all sentences
    for sent in sent_par:
        
        #tokenize each sentence
        tokens = word_tokenize(sent)
        
        # remove repetitve words in a sentenece
        tokens = list(set(tokens))
        
        #remove stop words and clean texts
        tokens = [tok for tok in tokens if 
                                          (tok not in stop_words) and 
                                          (tok not in punctuations) and 
                                          (tok not in extra)]
        if len(tokens)>0:
            # put it in the output
            output.append(tokens)
    return output

In [5]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
rng = np.random.RandomState(seed=1)

cn = 0
word2vec = {}
with open(w2v_path,'rb') as w2v:
    content = w2v.read().strip()
    for line in content.split('\n'):
        cn +=1
        line = line.strip().split()
        v = line[0]
        
        vector = line[1:]
        vector = np.matrix(vector,dtype='float32')
        
        word2vec[v] = vector

In [6]:
def overlap(ds, word2vec):
    voc_in_word2vec = set(word2vec.keys())
    num_voc_in_word2vec = len(voc_in_word2vec)
    print "num_voc_in_word2vec: %d"%num_voc_in_word2vec

    voc_in_ds = []
    for par in ds:
        for sent in par:
            voc_in_ds += sent
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    print "remove duplicates ... "
    voc_in_ds = set(voc_in_ds)
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    intersection = set(voc_in_word2vec).intersection(voc_in_ds)
    count_overlap = len(intersection)
    print "count_overlap: %d"%count_overlap
    
    precent_overlap = 100*(count_overlap / float(num_voc_in_ds))
    print "precent_overlap: %.2f%%"%precent_overlap

In [7]:
def statistics_ds(ds):
    print "data size size (# paragraphs): %d"%len(ds)
    
    par_lens = [len(par) for par in ds]
    avg_par_len = np.average(par_lens)
    print "avg_par_len: %.2f"%avg_par_len
    
    min_par_len = np.min(par_lens)
    print "min_par_len: %2.f"%min_par_len
    
    max_par_len = np.max(par_lens)
    print "max_par_len: %2.f"%max_par_len

    std_par_len = np.std(par_lens)
    print "std_par_len: %2.f"%std_par_len


    
    sent_lens = []
    for par in ds:
        for sent in par:
            sent_lens.append(len(sent))
            
    avg_sent_len = np.average(sent_lens)
    print "avg_sent_len: %.2f"%avg_sent_len
    
    min_sent_len = np.min(sent_lens)
    print "min_sent_len: %2.f"%min_sent_len

    max_sent_len = np.max(sent_lens)
    print "max_sent_len: %2.f"%max_sent_len

    std_sent_len = np.std(sent_lens)
    print "std_sent_len: %2.f"%std_sent_len
    


In [8]:
import codecs
with codecs.open('./Hansard/hansard.en.original.10K.out', 'rb','utf8') as orig:
    orig_paragraphs = []
    for line in orig:
        if line != '\n' and len(line)>0:
            orig_paragraphs.append(line.strip())
print "# paragraphs in original: %d"%len(orig_paragraphs)

# paragraphs in original: 5000


In [9]:
from joblib import Parallel, delayed
original_pars = []
original_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), orig_paragraphs))

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1152 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 2160 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 3456 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   31.4s finished


In [10]:
statistics_ds(original_pars)

data size size (# paragraphs): 5000
avg_par_len: 67.74
min_par_len: 13
max_par_len: 696
std_par_len: 38
avg_sent_len: 9.46
min_sent_len:  1
max_sent_len: 93
std_sent_len:  6


In [11]:
with codecs.open('./Hansard/hansard.en.translated.from.fr.10K.out','rb','utf8') as tran:
    tran_paragraphs = []
    for line in tran:
        if line != '\n' and len(line)>0:
            tran_paragraphs.append(line.strip())

print "# paragraphs in translated: %d"%len(tran_paragraphs)

# paragraphs in translated: 5000


In [12]:
from joblib import Parallel, delayed
translated_pars = []
translated_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), tran_paragraphs))

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 4272 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   39.7s finished


In [13]:
statistics_ds(translated_pars)

data size size (# paragraphs): 5000
avg_par_len: 69.65
min_par_len:  8
max_par_len: 330
std_par_len: 36
avg_sent_len: 9.65
min_sent_len:  1
max_sent_len: 113
std_sent_len:  6


In [14]:
overlap(original_pars,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 3205261
remove duplicates ... 
num_voc_in_ds: 43142
count_overlap: 38897
precent_overlap: 90.16%


In [15]:
overlap(translated_pars,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 3359961
remove duplicates ... 
num_voc_in_ds: 42795
count_overlap: 36754
precent_overlap: 85.88%


In [16]:
for par in translated_pars:
    for sent in par:
        if len(sent) == 1:
            print sent

[u'flawed']
[u'hon']
[u'hon']
[u'bill']
[u'important']
[u'clarification']
[u'explain']
[u'hon']
[u'yet']
[u'working']
[u'issue']
[u'war']
[u'reality']
[u'think']
[u'unacceptable']
[u'said']
[u'palestine']
[u'perhaps']
[u'hon']
[u'signal']
[u'unacceptable']
[u'crisis']
[u'must']
[u'unacceptable']
[u'know']
[u'can']
[u'hon']
[u'surprised']
[u'significant']
[u'right']
[u'came']
[u'duty']
[u'response']
[u'good']
[u'later']
[u'hypocrisy']
[u'hon']
[u'inconceivable']
[u'hon']
[u'desirable']
[u'cities']
[u'cities']
[u'reality']
[u'means']
[u'means']
[u'security']
[u'start']
[u'priority']
[u'investment']
[u'hon']
[u'infrastructure']
[u'housing']
[u'things']
[u'hon']
[u'hon']
[u'need']
[u'pity']
[u'keeping']
[u'expect']
[u'unusual']
[u'overlapping']
[u'amendments']
[u'applaud']
[u'enough']
[u'done']
[u'build']
[u'stayed']
[u'enough']
[u'hon']
[u'proof']
[u'disappointing']
[u'true']
[u'ridiculous']
[u'problem']
[u'hon']
[u'appreciate']
[u'incredible']
[u'hon']
[u'hon']
[u'nothing']
[u'need']
[u'

[u'favour']
[u'hon']
[u'hon']
[u'hon']
[u'included']
[u'hon']
[u'agreement']
[u'sad']
[u'hon']
[u'hon']
[u'hon']
[u'beyond']
[u'difficult']
[u'capacity']
[u'distinction']
[u'hon']
[u'late']
[u'works']
[u'ridiculous']
[u'elected']
[u'support']
[u'doubt']
[u'lovely']
[u'answer']
[u'hon']
[u'proud']
[u'hear']
[u'hon']
[u'gems']
[u'contemptuous']
[u'hon']
[u'hon']
[u'hon']
[u'disagree']
[u'revelation']
[u'case']
[u'government']
[u'government']
[u'modernization']
[u'choice']
[u'happening']
[u'knees']
[u'hon']
[u'hon']
[u'think']
[u'hon']
[u'go']
[u'prepare']
[u'resource']
[u'know']
[u'job']
[u'going']
[u'forget']
[u'case']
[u'making']
[u'means']
[u'responsibility']
[u'situation']
[u'hon']
[u'problem']
[u'stake']
[u'true']
[u'true']
[u'nonsense']
[u'bring']
[u'fair']
[u'unacceptable']
[u'complicated']
[u'imagine']
[u'research']
[u'drugs']
[u'funding']
[u'money']
[u'waiting']
[u'much']
[u'1991']
[u'headed']
[u'wonderful']
[u'full']
[u'multitude']
[u'unfortunate']
[u'concerns']
[u'ridiculous']

[u'easy']
[u'key']
[u'anyway']
[u'member']
[u'hon']
[u'hon']
[u'hon']
[u'answer']
[u'thumbs']
[u'ready']
[u'regain']
[u'erosion']
[u'evolution']
[u'act']
[u'scandalous']
[u'outrage']
[u'first']
[u'disaster']
[u'member']
[u'address']
[u'know']
[u'important']
[u'work']
[u'brilliant']
[u'unbelievable']
[u'hon']
[u'hope']
[u'changed']
[u'says']
[u'agree']
[u'problem']
[u'start']
[u'simply']
[u'think']
[u'certainly']
[u'imagine']
[u'refused']
[u'refused']
[u'know']
[u'rub']
[u'happened']
[u'reality']
[u'changed']
[u'happened']
[u'reality']
[u'reality']
[u'detriment']
[u'alone']
[u'reality']
[u'course']
[u'course']
[u'course']
[u'immoral']
[u'know']
[u'related']
[u'clothes']
[u'hon']
[u'lost']
[u'problem']
[u'represent']
[u'bogus']
[u'interesting']
[u'done']
[u'butchered']
[u'want']
[u'hon']
[u'undemocratic']
[u'serious']
[u'painful']
[u'done']
[u'happened']
[u'intent']
[u'happening']
[u'simple']
[u'mean']
[u'fundamental']
[u'recognize']
[u'survived']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'm

[u'mad']
[u'hon']
[u'hon']
[u'addressing']
[u'instigator']
[u'sponsorships']
[u'virtue']
[u'dreaming']
[u'hon']
[u'used']
[u'minimum']
[u'hon']
[u'hon']
[u'hon']
[u'agree']
[u'talk']
[u'plan']
[u'assessed']
[u'answer']
[u'think']
[u'government']
[u'mean']
[u'happening']
[u'discussion']
[u'neighbour']
[u'simple']
[u'none']
[u'course']
[u'wanted']
[u'stopping']
[u'paradox']
[u'concerned']
[u'congratulate']
[u'phobia']
[u'implications']
[u'say']
[u'thank']
[u'concerned']
[u'see']
[u'reality']
[u'case']
[u'reality']
[u'army']
[u'work']
[u'choice']
[u'message']
[u'hon']
[u'statistics']
[u'examples']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'pleased']
[u'hon']
[u'yes']
[u'globalization']
[u'hon']
[u'values']
[u'principle']
[u'cop-out']
[u'mean']
[u'hon']
[u'hon']
[u'exists']
[u'important']
[u'good']
[u'government']
[u'goes']
[u'government']
[u'told']
[u'words']
[u'hon']
[u'money']
[u'market']
[u'agreement']
[u'hon']
[u'hon']
[u'false']
[u'wrong']
[u'untrue']
[u'expectations']
[u'acceptable']
[u

[u'need']
[u'magnificent']
[u'responsible']
[u'matter']
[u'hon']
[u'reality']
[u'reality']
[u'favour']
[u'important']
[u'hon']
[u'happening']
[u'think']
[u'sure']
[u'now']
[u'problem']
[u'journalists']
[u'right']
[u'reality']
[u'proved']
[u'reality']
[u'simple']
[u'reality']
[u'big']
[u'reality']
[u'troubling']
[u'simple']
[u'solution']
[u'hard']
[u'saying']
[u'said']
[u'hon']
[u'hon']
[u'difference']
[u'accepted']
[u'duty']
[u'important']
[u'doubt']
[u'possible']
[u'agreement']
[u'pleased']
[u'accepted']
[u'lucky']
[u'understand']
[u'important']
[u'unanimity']
[u'scandalous']
[u'government']
[u'decide']
[u'eludes']
[u'simple']
[u'happen']
[u'oblige']
[u'interpret']
[u'philosophy']
[u'understand']
[u'suggest']
[u'unfortunate']
[u'abandoned']
[u'ndp']
[u'important']
[u'case']
[u'everybody']
[u'visible']
[u'choice']
[u'happened']
[u'told']
[u'agree']
[u'reality']
[u'saying']
[u'money']
[u'deal']
[u'reality']
[u'done']
[u'reality']
[u'reality']
[u'simple']
[u'true']
[u'difficult']
[u'acce

[u'reality']
[u'simple']
[u'outrageous']
[u'listening']
[u'benefiting']
[u'anything']
[u'doable']
[u'wrong']
[u'constituency']
[u'hon']
[u'hon']
[u'published']
[u'printed']
[u'hope']
[u'illogical']
[u'sure']
[u'minimum']
[u'consensus']
[u'happens']
[u'question']
[u'permissive']
[u'learned']
[u'hon']
[u'devoid']
[u'want']
[u'hon']
[u'never']
[u'extraordinary']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'message']
[u'check']
[u'matters']
[u'matters']
[u'pleasure']
[u'important']
[u'hon']
[u'hon']
[u'problem']
[u'duty']
[u'hon']
[u'thanks']
[u'cfia']
[u'hon']
[u'bill']
[u'case']
[u'choice']
[u'case']
[u'consequences']
[u'hon']
[u'money']
[u'important']
[u'interesting']
[u'wonder']
[u'coming']
[u'simple']
[u'can']
[u'important']
[u'happened']
[u'nothing']
[u'nothing']
[u'happened']
[u'enough']
[u'reality']
[u'objectives']
[u'know']
[u'yes']
[u'yes']
[u'yes']
[u'true']
[u'complicated']
[u'mean']
[u'want']
[u'department']
[u'achieve']
[u'interested']
[u'important']
[u'hon']
[u'today']
[u

[u'jealous']
[u'just']
[u'accord']
[u'prejudicial']
[u'reinstated']
[u'happened']
[u'inshore']
[u'debate']
[u'hon']
[u'know']
[u'choice']
[u'proud']
[u'principles']
[u'qualms']
[u'arguments']
[u'understand']
[u'know']
[u'sentence']
[u'issue']
[u'problem']
[u'problem']
[u'example']
[u'incredible']
[u'still']
[u'contradiction']
[u'reason']
[u'mean']
[u'worries']
[u'confusing']
[u'discipline']
[u'hon']
[u'say']
[u'pleased']
[u'reality']
[u'directives']
[u'agree']
[u'understand']
[u'hon']
[u'consequences']
[u'know']
[u'investigation']
[u'said']
[u'problem']
[u'kept']
[u'unacceptable']
[u'happened']
[u'talking']
[u'unacceptable']
[u'piecemeal']
[u'hon']
[u'good']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'hon']
[u'happened']
[u'works']
[u'complicated']
[u'agree']
[u'think']
[u'agree']
[u'solutions']
[u'worries']
[u'needed']
[u'hon']
[u'hon']
[u'right']
[u'result']
[u'hon']
[u'speaker']
[u'hon']
[u'right']
[u'works']
[u'important']
[u'hon']
[u'needed']
[u'protection']
[u'job']
[u'philosophy']
[u

[u'problem']
[u'families']
[u'shameful']
[u'done']
[u'imagine']
[u'help']
[u'wonderful']
[u'consistency']
[u'logic']
[u'hon']
[u'unbelievable']
[u'crime']
[u'region']
[u'unfortunate']
[u'cynical']
[u'absolutely']
[u'hon']
[u'hon']
[u'responsibility']
[u'worth']
[u'go']
[u'hon']
[u'really']
[u'might']
[u'case']
[u'hon']
[u'goodness']
[u'examples']
[u'thing']
[u'searched']
[u'continued']
[u'sad']
[u'proud']
[u'right']
[u'hon']
[u'thank']
[u'complicated']
[u'problem']
[u'sure']
[u'works']
[u'complicated']
[u'happening']
[u'war']
[u'principle']
[u'naive']
[u'absurd']
[u'results']
[u'case']
[u'worked']
[u'paradox']
[u'tortuous']
[u'done']
[u'issue']
[u'want']
[u'problem']
[u'bring']
[u'results']
[u'incredible']
[u'come']
[u'astonished']
[u'false']
[u'believe']
[u'problem']
[u'real']
[u'surprise']
[u'montreal']
[u'misappropriated']
[u'change']
[u'reality']
[u'clear']
[u'country']
[u'clear']
[u'think']
[u'elements']
[u'happened']
[u'deficit']
[u'votes']
[u'digress']
[u'course']
[u'centuries']