## Import general packages

In [1]:
import os
from collections import Counter

In [2]:
from collections import defaultdict
def voc_count(corpus):
    d = defaultdict(int)
    for p in corpus:
        for sent in p:
            for t in sent:
                d[t] += 1
    return d

In [3]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import string

#load list of stop words
with open('./snowball_stopwords.txt','rb') as sw:
    stop_words = [line.strip() for line in sw]

#load punctuations 
punctuations = string.punctuation

#extra characters
extra = []

def pre_process_par(par):
    """
    input: 
       a paragraph
    output:
       list of sentences. Each sentence is a list of tokens.
    """
    output = []
    # make the par lowecase
    par = par.lower()
    
    # split sentences
    sent_par = sent_tokenize(par)
    
    # tokenize and clean all sentences
    for sent in sent_par:
        
        #tokenize each sentence
        tokens = word_tokenize(sent)
        
        # remove repetitve words in a sentenece
        tokens = list(set(tokens))
        
        #remove stop words and clean texts
        tokens = [tok for tok in tokens if 
                                          (tok not in stop_words) and 
                                          (tok not in punctuations) and 
                                          (tok not in extra)]
        
        # put it in the output
        output.append(tokens)
    return output

In [4]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
rng = np.random.RandomState(seed=1)

cn = 0
word2vec = {}
with open(w2v_path,'rb') as w2v:
    content = w2v.read().strip()
    for line in content.split('\n'):
        cn +=1
        line = line.strip().split()
        v = line[0]
        
        vector = line[1:]
        vector = np.matrix(vector,dtype='float32')
        
        word2vec[v] = vector

In [5]:
def sentence_connection(sent1, sent2):
    word_connections = []
    for w2 in sent2:
        max_connection = (w2,None,-1)
        try:
            w2_vec = word2vec[w2]
        except:
            word2vec[w2] = rng.uniform(low=-0.2, high=+0.2, size=(300,))
            w2_vec = word2vec[w2]
            
        for w1 in sent1:
            try:
                w1_vec = word2vec[w1]
            except:
                word2vec[w1] = rng.uniform(low=-0.2, high=+0.2, size=(300,))
                w1_vec = word2vec[w1]
        
            # compute the cosine value
            cosine_w2_w1 =  np.abs(1 - spatial.distance.cosine(w2_vec, w1_vec))
            
            if cosine_w2_w1 >= max_connection[-1]:
                max_connection = (w2,w1,cosine_w2_w1)

            # append max_connection to word_connections
            word_connections.append(max_connection)
        
    # pick up the word connection with maximum weight
    output = (None,None,-1)
    for item in word_connections:
        if item[-1] > output[-1]:
            output = item
            
    # return output
    return output

In [6]:
from collections import defaultdict
from copy import deepcopy
from scipy import spatial
def create_graph(list_of_sents):
    
    # get the number of sentences or nodes
    n = len(list_of_sents)
    
    #initialize the adjacent list for the graph representation of this text
    adj_list = [[] for i in range(n)]

    #for each sentence compute its weight with each previouse sentence   
    for i in range(1, n):            
            sent_i = list_of_sents[i]
            
            for j in range(0,i):
                sent_j = list_of_sents[j]
                

                # compute the connection between sent_i and sent_j sen_j preceds sent_i
                edge_ij = sentence_connection(sent_j, sent_i)
                
                
                # check for the threshold
                weight = edge_ij[-1]
                #if weight>=0.9:
                adj_list[i].append((j,weight))
                

    # return adj_list
    return adj_list
        
    

In [8]:
dump_text = "Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
#dump_text = "Mohsen likes briliant people . Ali tries hard . Mohsen is a successful mohsen person . Mohsen is smart . "
dump_text = pre_process_par(dump_text)
print dump_text

#sentence_connection(dump_text[1],dump_text[2])

adj_list = create_graph(dump_text)
#print word_connections
g = adj_list
for k, v in enumerate(g):
    print "%d -->  %s"%(k,v)
#x = word2vec['angela'].shape
#np.abs(1 - spatial.distance.cosine(x, x))

[['us', 'barak', 'president'], ['chancellor', 'angela', 'germany'], ['angela', 'france', 'met', 'barak']]
0 -->  []
1 -->  [(0, 0.47426620103463102)]
2 -->  [(0, 0.99999991593103033), (1, 0.99999996671336)]


In [32]:
def save(graph_set, path, weighted=False):
    output_content = []
    for graph_name,  adj_list in enumerate(graph_set):
        output_content.append('XP')
        output_content.append('% '+str(graph_name))
        #output_content.append('t # %d'%graph_name)
        num_nodes = len(adj_list)

        for n in range(num_nodes):
            output_content.append('v %s a'%str(n))

        for i, edges in enumerate(adj_list):
            if i>0:
                for j in edges:
                    #Note: we computed edges backward, but we should 
                    # save them forward to be compatible with NAACL16
                    source = j[0] 
                    target = i
                    if target > source:
                        if source>0 and target>0:
                            if weighted:
                                output_content.append('d %s %s %s'%(str(source),str(target),str(j[1])))
                            else:
                                output_content.append('d %s %s 1'%(str(source),str(target)))
                    else:
                        raise ValueError("Backward eadge?")

    with open(path,'wb') as out:
        out.write('\n'.join(output_content))

In [10]:
import codecs
with codecs.open('./Hansard/hansard.en.original.10K.out', 'rb','utf8') as orig:
    orig_paragraphs = []
    for line in orig:
        if line != '\n' and len(line)>0:
            orig_paragraphs.append(line.strip())
print "# paragraphs in original: %d"%len(orig_paragraphs)

# paragraphs in original: 5000


In [11]:
from joblib import Parallel, delayed
original_pars = []
original_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), orig_paragraphs))

[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 2000 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 4880 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   45.0s finished


In [12]:
from joblib import Parallel, delayed
graphs = []
graphs = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
                 map(delayed(create_graph), original_pars))

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 74.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 112.8min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed: 159.2min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 208.2min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 227.6min finished


In [None]:
# import pickle
# with open('original_pars_graphs_lcg.pickle','wb') as out:
#     pickle.dump(graphs, out)   

In [33]:
save(graphs,'./orig_lcg_weighted_graph_set.g', weighted=True)

In [16]:
with codecs.open('./Hansard/hansard.en.translated.from.fr.10K.out','rb','utf8') as tran:
    tran_paragraphs = []
    for line in tran:
        if line != '\n' and len(line)>0:
            tran_paragraphs.append(line.strip())

print "# paragraphs in translated: %d"%len(tran_paragraphs)

# paragraphs in translated: 5000


In [17]:
from joblib import Parallel, delayed
translated_pars = []
translated_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(pre_process_par), tran_paragraphs))

[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 3088 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   43.3s finished


In [18]:
from joblib import Parallel, delayed
translated_graphs = []
translated_graphs = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
             map(delayed(create_graph), translated_pars))

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 28.0min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 51.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 83.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 118.5min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed: 159.9min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 211.3min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 230.3min finished


In [37]:
save(translated_graphs,'./trans_lcg_weighted_graph_set.g', weighted=True)

In [None]:
# import pickle
# with open('trans_pars_graphs_lcg.pickle','wb') as out:
#     pickle.dump(translated_graphs, out)   

Let's read the data first. We have two corpora: 
         - original
         - translated
Each dataset consists of some paragraphs. Paragraphs are seperated by an empty line.

Now we have a list of paragraphs. We need to pre-process each paragraph and remove all stop words from them.

We should process and clean all pragraphs in both corpora

We need to load word vectors

Let's take a paragraph and build its graph. In this regard we define a function that takes a list of sentenecs (tokenized) as an input text and returns a graph (in ?????? format).

In [20]:
dump_text = "Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
dump_text = "Mohsen likes Ali people . Mohsen tries Ali hard . Mohsen is a successful person . Mohsen is smart . "
dump_text = pre_process_par(dump_text)
print dump_text
adj_list = create_graph(dump_text)

g = adj_list
for k, v in enumerate(g):
    print "%d -->  %s"%(k,v)

[['people', 'mohsen', 'likes', 'ali'], ['tries', 'hard', 'mohsen', 'ali'], ['successful', 'mohsen', 'person'], ['mohsen', 'smart']]
0 -->  []
1 -->  [(0, 1.0000000580073984)]
2 -->  [(0, 1.0000000580073984), (1, 1.0000000580073984)]
3 -->  [(0, 1.0000000580073984), (1, 1.0000000580073984), (2, 1.0000000580073984)]


In [None]:
len()

In [None]:
#word_connections, adj_list = graphs[0]
#for k,v in enumerate(adj_list):
#    if len(v)>1:
#        print k,v

In [None]:
#g = graphs[0][1]
#for k, v in enumerate(g):
#    print "%d --> %d %s"%(k,v[0],v[1:])

Here we construct graphs for translationese