## Import general packages

In [1]:
import os
from collections import Counter

In [2]:
from collections import defaultdict
def voc_count(corpus):
    """
    Count token occurrences over a whole corpus.

    input:
       corpus: iterable of paragraphs; each paragraph is an iterable of
               sentences and each sentence an iterable of tokens.
    output:
       defaultdict(int) mapping every token to the number of times it occurs.
    """
    counts = defaultdict(int)
    # Flatten paragraph -> sentence -> token into a single stream of tokens.
    all_tokens = (token
                  for paragraph in corpus
                  for sentence in paragraph
                  for token in sentence)
    for token in all_tokens:
        counts[token] += 1
    return counts

In [3]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import string

# Stop words: one entry per line of the stop-word file, surrounding whitespace trimmed.
with open('./snowball_stopwords.txt','rb') as sw:
    stop_words = []
    for line in sw:
        stop_words.append(line.strip())

# Punctuation characters, taken from the standard `string` module.
punctuations = string.punctuation

# Additional tokens to discard during cleaning (none configured).
extra = []

def pre_process_par(par):
    """
    Split a paragraph into cleaned, tokenized sentences.

    input:
       a paragraph (string)
    output:
       list of sentences. Each sentence is a list of tokens: lowercased,
       with repeated tokens, stop words, punctuation tokens and `extra`
       tokens removed. Surviving tokens keep the order of their first
       occurrence in the sentence.
    """
    output = []
    # make the paragraph lowercase
    par = par.lower()

    # split into sentences, then tokenize and clean each sentence
    for sent in sent_tokenize(par):
        kept = []
        seen = set()
        for tok in word_tokenize(sent):
            # remove repeated words in a sentence, keeping the first occurrence
            # so the token order is stable (a plain set() would scramble it)
            if tok in seen:
                continue
            seen.add(tok)

            # remove stop words and the extra tokens
            if tok in stop_words or tok in extra:
                continue

            # remove tokens made only of punctuation characters; checking
            # per character also catches multi-character tokens such as
            # "..." or "--", which a substring test against the punctuation
            # string lets through
            if all(ch in punctuations for ch in tok):
                continue

            kept.append(tok)

        # put it in the output (possibly an empty list)
        output.append(kept)
    return output

In [4]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
rng = np.random.RandomState(seed=1)

# number of embedding lines parsed
cn = 0
# maps a token to its embedding, stored as a single-row float32 np.matrix
word2vec = {}
with open(w2v_path,'rb') as w2v:
    # Stream the file line by line. Reading it whole and splitting on '\n'
    # keeps the full text in memory twice and leaves it bound for the
    # rest of the session.
    for line in w2v:
        line = line.strip().split()
        if not line:
            # blank line: nothing to parse
            continue
        cn +=1

        # first field is the token, the remaining fields are the vector
        v = line[0]
        vector = line[1:]
        vector = np.matrix(vector,dtype='float32')

        word2vec[v] = vector

In [None]:
from copy import deepcopy
def apply_rule4(graph):
    """
    Rule 4: add transitive edges to a copy of `graph`.

    `graph` is a list indexed by sentence; each entry is a list of edges
    (target_sentence_id, source_word, target_word, weight). For every
    non-transitive edge that points at a real sentence (target id != -1),
    each edge of that target sentence whose source word equals this edge's
    target word yields a new edge
    (its_target_id, source_word, its_target_word, 'trans')
    on the current sentence.

    The input is deep-copied and left untouched; the extended copy is returned.
    """
    extended = deepcopy(graph)
    for outgoing in extended:
        # `outgoing` grows while it is iterated; the appended edges are all
        # marked 'trans' and are skipped by the first check below.
        for edge in outgoing:
            if edge[-1] == 'trans':
                continue
            via_id, head_word, via_word = edge[0], edge[1], edge[2]
            if via_id == -1:
                continue
            # Snapshot the matching onward edges before extending, in case
            # the intermediate sentence's list is the one being extended.
            onward = [hop for hop in extended[via_id] if hop[1] == via_word]
            outgoing.extend((hop[0], head_word, hop[2], 'trans') for hop in onward)

    return extended

In [None]:
from collections import defaultdict
from copy import deepcopy
from scipy import spatial
def _word_vector(word):
    """Return the embedding of `word`, creating and caching a random one if unseen."""
    try:
        return word2vec[word]
    except KeyError:
        # Out-of-vocabulary word: draw a random vector once and store it in
        # the module-level table so later lookups reuse it.
        # NOTE(review): when this runs in joblib worker processes each worker
        # presumably mutates its own copy of word2vec, so the random vectors
        # may differ between workers -- confirm.
        word2vec[word] = rng.uniform(low=-0.2, high=+0.2, size=(300,))
        return word2vec[word]


def create_graph(list_of_sents):
    """
    Build the backward-linked sentence graph of one text.

    input:
       list_of_sents: list of sentences, each a list of tokens.
    output:
       adjacency list with one entry per sentence. Each entry is a list of
       edges (target_sentence_index, word_in_this_sentence,
       word_in_target_sentence, weight), where the target is an earlier
       sentence. A sentence with no edge keeps the placeholder
       (-1, None, None, 0.0); sentence 0 always does.

    The weight is |1 - cosine distance| of the two word vectors, rounded to
    4 decimals.
       Rule 1: each word is linked to its most similar word among all earlier
               sentences (on ties the one encountered last wins).
       Rule 2: a sentence keeps only the link(s) with the largest weight.
       Rule 3: a link tying with that largest weight is added only if the
               sentence has no edge to the same target sentence yet.
    """
    # get the number of sentences or nodes
    n = len(list_of_sents)

    # Initialize the adjacency list. Every sentence needs its OWN list:
    # `[[...]] * n` would repeat one shared list n times, so a Rule 3 append
    # on a placeholder entry would show up in every sentence.
    adj_list = [[(-1, None, None, 0.0)] for _ in range(n)]
    max_weight = [0.0] * n

    for i in range(1, n):
        for w in list_of_sents[i]:
            w_vec = _word_vector(w)

            # best link found so far for word w: (sentence index, word, weight)
            best_j, best_v, best_sim = -1.0, None, -1.0
            for j, sent_j in enumerate(list_of_sents[:i]):
                for v in sent_j:
                    v_vec = _word_vector(v)
                    sim = np.abs(1 - spatial.distance.cosine(w_vec, v_vec))
                    sim = np.round(sim, 4)
                    if sim >= best_sim:  # Rule 1
                        best_j, best_v, best_sim = j, v, sim

            # Rule 2
            if best_sim > max_weight[i]:
                adj_list[i] = [(best_j, w, best_v, best_sim)]
                max_weight[i] = best_sim
            # Rule 3
            elif best_sim == max_weight[i]:
                already_linked = any(edge[0] == best_j for edge in adj_list[i])
                if not already_linked:
                    adj_list[i].append((best_j, w, best_v, best_sim))
    #Rule 4
    #adj_list = apply_rule4(adj_list)
    return adj_list
        
    

In [None]:
def save(graph_set, path):
    output_content = []
    for graph_name,  adj_list in enumerate(graph_set):
        output_content.append('XP')
        output_content.append('% '+str(graph_name))
        #output_content.append('t # %d'%graph_name)
        num_nodes = len(adj_list)

        for n in range(num_nodes):
            output_content.append('v %s a'%str(n))

        for i, edges in enumerate(adj_list):
            if i>0:
                for j in edges:
                    #Note: we computed edges backward, but we should 
                    # save them forward to be compatible with NAACL16
                    source = j[0] 
                    target = i
                    if target > source:
                        if source>0 and target>0:
                            output_content.append('d %s %s 1'%(str(source),str(target)))
                    else:
                        raise ValueError("Backward eadge?")

    with open(path,'wb') as out:
        out.write('\n'.join(output_content))



In [None]:
import codecs
# Every line of the original corpus except a bare newline becomes one
# paragraph, with surrounding whitespace trimmed.
with codecs.open('./Hansard/hansard.en.original.10K.out', 'rb','utf8') as orig:
    orig_paragraphs = [line.strip() for line in orig
                       if not (line == '\n' or len(line) == 0)]
print("# paragraphs in original: %d" % len(orig_paragraphs))

In [None]:
from joblib import Parallel, delayed
# Clean and tokenize every original paragraph, one joblib task per paragraph.
original_pars = []
original_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
    delayed(pre_process_par)(paragraph) for paragraph in orig_paragraphs)

In [None]:
from joblib import Parallel, delayed
# Build one graph per pre-processed original paragraph, one joblib task each.
graphs = []
graphs = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
    delayed(create_graph)(sentences) for sentences in original_pars)

In [None]:
import pickle
# Serialize the graphs of the original corpus to disk.
with open('original_pars_graphs.pickle','wb') as pickle_file:
    pickle.dump(graphs, pickle_file)

In [None]:
# Export the original-corpus graphs as a text graph set.
save(graph_set=graphs, path='./orig_graph_set.g')

In [None]:
# Every line of the translated corpus except a bare newline becomes one
# paragraph, with surrounding whitespace trimmed.
with codecs.open('./Hansard/hansard.en.translated.from.fr.10K.out','rb','utf8') as tran:
    tran_paragraphs = [line.strip() for line in tran
                       if not (line == '\n' or len(line) == 0)]

print("# paragraphs in translated: %d" % len(tran_paragraphs))

In [None]:
from joblib import Parallel, delayed
# Clean and tokenize every translated paragraph, one joblib task per paragraph.
translated_pars = []
translated_pars = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
    delayed(pre_process_par)(paragraph) for paragraph in tran_paragraphs)

In [None]:
from joblib import Parallel, delayed
# Build one graph per pre-processed translated paragraph, one joblib task each.
translated_graphs = []
translated_graphs = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")(
    delayed(create_graph)(sentences) for sentences in translated_pars)

In [None]:
import pickle
# Serialize the graphs of the translated corpus to disk.
with open('trans_pars_graphs.pickle','wb') as pickle_file:
    pickle.dump(translated_graphs, pickle_file)

In [None]:
# Export the translated-corpus graphs as a text graph set.
save(graph_set=translated_graphs, path='./trans_graph_set.g')

In [None]:
#dump_text = "Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
#dump_text = "Mohsen likes briliant people . Ali tries hard . Mohsen is a successful mohsen person . Mohsen is smart . "
#dump_text = pre_process_par(dump_text)
#print dump_text
#adj_list = create_graph(dump_text)
#print word_connections
#g = adj_list
#for k, v in enumerate(g):
#    print "%d -->  %s"%(k,v)

In [None]:
#import pickle
#with open('original_pars_graphs.pickle','rb') as out:
#    graphs  = pickle.load(out) 

Let's read the data first. We have two corpora: 
         - original
         - translated
Each dataset consists of some paragraphs. Paragraphs are separated by an empty line.

Now we have a list of paragraphs. We need to pre-process each paragraph and remove all stop words from them.

We should process and clean all paragraphs in both corpora.

We need to load word vectors

Let's take a paragraph and build its graph. To do this, we define a function that takes a list of tokenized sentences as the input text and returns a graph (in adjacency-list format).

In [None]:
# Two toy paragraphs; the second assignment overwrites the first, so only the
# "Mohsen" text is actually processed.
dump_text = "Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
dump_text = "Mohsen likes Ali people . Mohsen tries Ali hard . Mohsen is a successful person . Mohsen is smart . "
dump_text = pre_process_par(dump_text)
print(dump_text)
adj_list = create_graph(dump_text)

# Show the edges of each sentence, one sentence per line.
g = adj_list
for sent_idx, sent_edges in enumerate(g):
    print("%d -->  %s" % (sent_idx, sent_edges))

In [None]:
#word_connections, adj_list = graphs[0]
#for k,v in enumerate(adj_list):
#    if len(v)>1:
#        print k,v

In [None]:
#g = graphs[0][1]
#for k, v in enumerate(g):
#    print "%d --> %d %s"%(k,v[0],v[1:])

Here we construct graphs for translationese