## Import general packages

In [1]:
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
# Seeded generator; create_graph draws fallback vectors for unknown words from it.
rng = np.random.RandomState(seed=1)

# Build ``word2vec``: token -> 1 x D float32 np.matrix.
# Each non-empty line of the file is expected to be a token followed by its
# whitespace-separated vector components.
cn = 0  # number of embedding lines parsed
word2vec = {}
with open(w2v_path,'rb') as w2v:
    # Stream the file line by line so the full text of the embedding table is
    # never held in memory at once.
    for line in w2v:
        fields = line.strip().split()
        if not fields:
            # Blank line: nothing to parse (indexing fields[0] would fail).
            continue
        cn += 1
        word2vec[fields[0]] = np.matrix(fields[1:], dtype='float32')

In [3]:
from copy import deepcopy
def apply_rule4(graph):
    """Return a deep copy of ``graph`` extended with transitive edges (Rule 4).

    ``graph`` is an adjacency list: entry ``s`` holds the edges of sentence
    ``s`` and every edge is a tuple whose first three items are
    ``(target_sentence, source_word, target_word)`` and whose last item is
    the weight.

    For every non-transitive edge ``s -> t`` that ends on word ``x`` of
    sentence ``t``, each edge of ``t`` that starts from ``x`` and points to
    ``(u, y)`` yields a new edge ``(u, source_word, y, 'trans')`` on ``s``.
    Edges whose target sentence is ``-1`` are ignored.  The input graph is
    left untouched.
    """
    extended = deepcopy(graph)
    for src_id, src_edges in enumerate(extended):
        # src_edges grows while it is being walked; the edges added below are
        # tagged 'trans' and therefore skipped when the walk reaches them.
        for edge in src_edges:
            if edge[-1] == 'trans':
                continue
            hop_id, origin_word, hop_word = edge[0], edge[1], edge[2]
            if hop_id == -1:
                continue
            # Snapshot the matches first: hop_id may refer to src_edges itself.
            inherited = [
                (onward[0], origin_word, onward[2], 'trans')
                for onward in extended[hop_id]
                if onward[1] == hop_word
            ]
            src_edges.extend(inherited)

    return extended

In [4]:
from collections import defaultdict
from copy import deepcopy
from scipy import spatial
def _lookup_vector(word):
    """Return the embedding of ``word`` as a flat 1-D array.

    Words missing from the module-level ``word2vec`` table get a random
    300-dimensional vector drawn from the module-level ``rng``; that vector is
    stored in ``word2vec`` so later lookups of the same word agree.
    """
    try:
        vec = word2vec[word]
    except KeyError:
        vec = rng.uniform(low=-0.2, high=+0.2, size=(300,))
        word2vec[word] = vec
    # word2vec can hold 1 x D np.matrix rows (as stored by the loader) as well
    # as 1-D arrays (the fallback above); flatten so both are handled alike.
    return np.asarray(vec).ravel()


def create_graph(list_of_sents):
    """Build the sentence graph of a text as an adjacency list.

    Parameters
    ----------
    list_of_sents : sequence of sequences of words
        The text, one tokenised sentence per entry.

    Returns
    -------
    list of lists of tuples
        Entry ``i`` holds the edges of sentence ``i``.  An edge is
        ``(j, w, v, weight)``: word ``w`` of sentence ``i`` is linked to word
        ``v`` of the earlier sentence ``j`` with the given similarity.  A
        sentence that never receives an edge keeps the placeholder
        ``(-1, None, None, 0.0)``; sentence 0 always does.

    The similarity of two words is ``abs(1 - cosine_distance)`` of their
    vectors, rounded to 4 decimals.

    * Rule 1: every word keeps only its most similar word(s) among all
      earlier sentences (ties are all kept).
    * Rule 2: a sentence keeps the edges of the word with the highest such
      similarity.
    * Rule 3: a word that ties with the sentence's current best adds its
      edges only if none of the sentences it points to is already linked.

    Side effect: unknown words are added to the module-level ``word2vec``.
    """
    n = len(list_of_sents)

    # One independent edge list per sentence.  (``[[...]] * n`` would make all
    # entries share a single list, so an in-place ``+=`` on one sentence would
    # leak into every other sentence.)
    adj_list = [[(-1, None, None, 0.0)] for _ in range(n)]
    max_weight = [0.0] * n

    for i in range(1, n):
        previous_sents = list_of_sents[:i]

        for w in list_of_sents[i]:
            w_vec = _lookup_vector(w)

            # Rule 1: best-matching earlier word(s) for w, as (j, v, weight).
            best_weight = 0.0
            best_links = []
            for j, sent_j in enumerate(previous_sents):
                for v in sent_j:
                    v_vec = _lookup_vector(v)
                    cosine_w_v = np.abs(1 - spatial.distance.cosine(w_vec, v_vec))
                    cosine_w_v = np.round(cosine_w_v, 4)
                    if cosine_w_v > best_weight:
                        best_links = [(j, v, cosine_w_v)]
                        best_weight = cosine_w_v
                    elif cosine_w_v == best_weight:
                        best_links.append((j, v, cosine_w_v))

            candidate_edges = [(link[0], w, link[1], link[2]) for link in best_links]

            # Rule 2: a strictly better word replaces the sentence's edges.
            if best_weight > max_weight[i]:
                adj_list[i] = candidate_edges
                max_weight[i] = best_weight

            # Rule 3: on a tie, add the edges unless one of the sentences they
            # point to is already connected to sentence i.
            elif best_weight == max_weight[i]:
                linked_sents = set(link[0] for link in best_links)
                already_linked = any(edge[0] in linked_sents for edge in adj_list[i])
                if not already_linked:
                    adj_list[i] += candidate_edges

    #Rule 4
    #adj_list = apply_rule4(adj_list)
    return adj_list
        
    

In [5]:
def save(graph_set, path):
    """Write all graphs of ``graph_set`` to ``path`` in a line-based text format.

    Parameters
    ----------
    graph_set : iterable of adjacency lists
        Each adjacency list has one entry per sentence; every edge is a tuple
        whose first item is the index of the earlier sentence it points to.
    path : str
        Destination file; it is overwritten.

    For every graph the output contains ``XP``, ``% <graph index>``, one
    ``v <node> a`` line per sentence and one ``d <source> <target> 1`` line
    per written edge.  Lines are joined with ``\\n`` (no trailing newline).

    Raises
    ------
    ValueError
        If an edge of sentence ``i`` does not point to a sentence with a
        smaller index.
    """
    output_content = []
    for graph_name, adj_list in enumerate(graph_set):
        output_content.append('XP')
        output_content.append('% ' + str(graph_name))
        #output_content.append('t # %d'%graph_name)

        for n in range(len(adj_list)):
            output_content.append('v %s a' % str(n))

        for i, edges in enumerate(adj_list):
            if i == 0:
                # The edges of the first sentence are never written.
                continue
            for edge in edges:
                # Note: edges were computed backward (from a sentence to an
                # earlier one) but are saved forward (earlier -> later) to be
                # compatible with NAACL16.
                source = edge[0]
                target = i
                if not target > source:
                    raise ValueError("Backward eadge?")
                # NOTE(review): ``source > 0`` skips the -1 placeholder but
                # also drops every edge whose earlier sentence is index 0.
                # Behaviour kept as is; confirm whether ``source >= 0`` was
                # intended.
                if source > 0:
                    output_content.append('d %s %s 1' % (str(source), str(target)))

    # The handle is binary, so hand it bytes: on Python 3 writing a str to a
    # 'wb' handle raises TypeError.  The content is ASCII, so on Python 2 the
    # encode call leaves the bytes unchanged.
    with open(path, 'wb') as out:
        out.write('\n'.join(output_content).encode('utf-8'))



In [6]:
import sys
def drawProgressBar(shell_out, 
                    begin, k, out_of, end, barLen =25):
    """Render a one-line text progress bar and return it.

    shell_out -- when equal to True the bar is also written to stdout
    begin     -- text placed before the counter
    k         -- number of completed items
    out_of    -- total number of items
    end       -- text placed after the percentage
    barLen    -- number of characters inside the brackets

    A carriage return is always written to stdout, even when the bar itself
    is not printed.
    """
    percent = k/float(out_of)
    sys.stdout.write("\r")

    # Index of the '>' head: everything before it is done, after it pending.
    head = int(barLen * percent)
    progress = "".join(
        "=" if pos < head else (">" if pos == head else "_")
        for pos in range(barLen)
    )

    text = "%s%d/%d[%s](%.2f%%)%s"%(begin,k,out_of,progress,percent * 100, end)
    if shell_out== True:
        sys.stdout.write(text)
        sys.stdout.flush()
    return text

In [7]:
import pickle
# Load the pickled dataset into ``talks``; later cells iterate it with
# ``.items()`` and read the 'gender' and 'content' fields of every entry.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file when it comes from a trusted source.
with open("./ted-gender-annotated/dataset.pkl",'rb') as f:
    talks = pickle.load(f)

In [8]:
# Split the talk contents into two lists according to the 'gender' field.
# Every talk whose gender is not exactly 'male' ends up in female_texts.
male_texts, female_texts = [], []

for talk_id, talk in talks.items():
    bucket = male_texts if talk['gender'] == 'male' else female_texts
    bucket.append(talk['content'])

print("number of male texts:%d"%len(male_texts))
print("number of female texts:%d"%len(female_texts))

number of male texts:1012
number of female texts:344


In [9]:
from joblib import Parallel, delayed
# Build one graph per male text, spreading the create_graph calls over
# worker processes (n_jobs=-1).
male_graphs = []
male_graph_runner = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")
male_graphs = male_graph_runner(
    delayed(create_graph)(text) for text in male_texts
)

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 1012 out of 1012 | elapsed: 44.8min finished


In [10]:
# Write the graphs built from male_texts to disk with save().
save(male_graphs,'./ted-gender-annotated/male_graphset.g')

In [11]:
from joblib import Parallel, delayed
# Build one graph per female text, spreading the create_graph calls over
# worker processes (n_jobs=-1).
female_graphs = []
female_graph_runner = Parallel(n_jobs=-1, verbose=-1, backend="multiprocessing")
female_graphs = female_graph_runner(
    delayed(create_graph)(text) for text in female_texts
)

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 344 out of 344 | elapsed: 22.1min finished


In [13]:
# Write the graphs built from female_texts to disk with save().
save(female_graphs,'./ted-gender-annotated/female_graphset.g')

In [None]:
#dump_text = "Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
#dump_text = "Mohsen likes briliant people . Ali tries hard . Mohsen is a successful mohsen person . Mohsen is smart . "
#dump_text = pre_process_par(dump_text)
#print dump_text
#adj_list = create_graph(dump_text)
#print word_connections
#g = adj_list
#for k, v in enumerate(g):
#    print "%d -->  %s"%(k,v)

In [None]:
#import pickle
#with open('original_pars_graphs.pickle','rb') as out:
#    graphs  = pickle.load(out) 

Let's read the data first. We have two corpora: 
         - original
         - translated
Each dataset consists of a number of paragraphs. Paragraphs are separated by an empty line.

Now we have a list of paragraphs. We need to pre-process each paragraph and remove all stop words from them.

We should process and clean all paragraphs in both corpora.

We need to load word vectors

Let's take a paragraph and build its graph. To do so, we define a function that takes a list of tokenized sentences as the input text and returns its graph as an adjacency list.

In [None]:
# Smoke test: build the graph of a tiny three-sentence text and print the
# edge list of every sentence.
# NOTE(review): pre_process_par is not defined in the cells shown here; it
# must be available before this cell is run.
dump_text = " Barak is the US president . Angela is the chancellor of Germany . Barak and Angela met each other in France ."
#dump_text = "Mohsen likes Ali people . Mohsen tries Ali hard . Mohsen is a successful person . Mohsen is smart . "

dump_text = pre_process_par(dump_text)
print(dump_text)

g = adj_list = create_graph(dump_text)
for k, v in enumerate(g):
    print("%d -->  %s" % (k, v))

In [None]:
#word_connections, adj_list = graphs[0]
#for k,v in enumerate(adj_list):
#    if len(v)>1:
#        print k,v

In [None]:
#g = graphs[0][1]
#for k, v in enumerate(g):
#    print "%d --> %d %s"%(k,v[0],v[1:])

Here we construct graphs for translationese