In [1]:
import json
import numpy as np
import pandas as pd
import math
from operator import add
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import MWETokenizer
from string import punctuation

# Loading word embeddings

In [None]:
from gensim.models import KeyedVectors

# importing wordembedding and building the language model
de_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.multi.en.vec')
en_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.multi.de.vec')
zh_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.zh.align.vec')

# Getting the tokens 
en_words = []
for word in en_model.vocab:
    en_words.append(word)

# Printing out number of tokens available
print("Number of English Tokens: {}".format(len(en_words)))

de_words = []
for word in de_model.vocab:
    de_words.append(word)

# Printing out number of tokens available
print("Number of German Tokens: {}".format(len(de_words)))

zh_words = []
for word in zh_model.vocab:
    zh_words.append(word)

# Printing out number of tokens available
print("Number of Chinese Tokens: {}".format(len(zh_words)))


# Data Preprocessing functions

In [None]:
def traditionalToSimplified(file):
    i = 0
    for el in file:
        file[i] = ''.join(HanziConv.toSimplified(el))
        i += 1
        
def chineseTokenize(file):
    i = 0
    for el in file:
        file[i] = ' '.join(jieba.cut(el, cut_all=False, HMM=True))
        i += 1

        
from nltk.tokenize import MWETokenizer
def MWEtokenize(el):
    i = 0
    tokenizer = MWETokenizer(('barack','obama'))
    tokenizer.add_mwe([('new','york'),('hong', 'kong'), ('los', 'angeles'), ('san', 'francisco'),('united', 'kingdom')])
    el = tokenizer.tokenize(el.split())
    return el
        
def tokenize(file):
    i = 0
    for el in file:
        tokenizerOne = WordPunctTokenizer()
        el = tokenizerOne.tokenize(str(el.lower()))
        el = ' '.join([word for word in el if word not in punctuation])
        file[i] = MWEtokenize(el)
        i += 1
        

punctuation = punctuation + str('；')+  str("：《》「 」“”[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")+str('编辑')+str('%.[')+str('(/')+str(');[')+str('"),')+str(').')+str('.[')+str(',[')+str('][')+str('("')+str('."[')+str('—'+str('."'))+str('.,')



# Document-vector building function

In [None]:
## input: 
    #1. list of preprocessing documents
    #2. language model
    #3. entity name
    #4. language: En/De
    #5. representation of aspects: headline/content
## output: for each documnet, a document vector will be produced

def getDocVectors(content, language_model, termname ,language, aspect):
    
    language_model = language_model

    words = []
    for word in language_model.vocab:
        words.append(word)

    doc_vectors = list()
    
    for i in range(len(content)):
        #print(content[i])
        vectorSum = [0.0000]*300
        l = 0
        for el in content[i]:
            if el in words:
                #print(list(vectors[el]))
                vectorSum = list(map(add, list(language_model[el]), vectorSum))
                l+=1
            #else:
                #print(el)
        #print(vectorSum)
        for m in range(len(vectorSum)):
            if vectorSum[m] != 0:
                vectorSum[m] = float(vectorSum[m])/l ###average the vector sum
            else:
                vectorSum[m] = vectorSum[m]

        doc_vectors.append(vectorSum)
    
    doc_vectors_final = []
    for vec in doc_vectors:
        doc = []
        for dim in vec:
            
            doc.append(float(dim))
        
        doc_vectors_final.append(doc)
            
        
    with open(language+'_'+termname+'_'+aspect+".json", 'w') as f:
        json.dump(doc_vectors_final, f)
  
    
    #return doc_vectors_final 


# run the function! get and save the document vectors!

In [83]:
# define the entity list
#entity_list = ['United_Kingdom', 'Italy', 'Asia','Europe']

#entity_list =  ['Russia','singapore','India', 'Israel','Brazil','Philippines', 'New_York_City',
#'London','Singapore','Hong_Kong','Dubai','Los_Angeles','Paris','Chicago','Washington,_D.C.','San_Francisco',
#'Mumbai','Rome','Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston','Barcelona','Peking']

entity_list = ['Bracak_Obama', 'Donald_Trump']

for entity in entity_list:
    # loading source data
    with open('/home/hahou/WikiDataCrawling/English Corpus/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
    with open('/home/hahou/WikiDataCrawling/German Corpus/source_de_'+entity+'.json') as json_data:
        source_de = json.load(json_data)
    #with open('/home/hahou/WikiDataCrawling/Chinese Corpus/source_de_'+entity+'.json') as json_data:
        #source_zh = json.load(json_data)    
    
    # exact headlies
    en_headline = list(source_en.keys())
    de_headline = list(source_de.keys())
    zh_headline = list(source_zh.keys())

    #extract context
    en_content = []
    de_content = []
    zh_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    for el in de_headline:
        de_content.append(''.join(source_de[el]))
    for el in zh_headline:
        zh_content.append(''.join(source_de[el]))
        
    # preprocessing for en and de
    tokenize(en_content)
    tokenize(de_content)
    tokenize(en_headline)
    tokenize(de_headline)
    
    
    #preprocessing for zh
    traditionalToSimplified(zh_content)
    chineseTokenize(zh_content)
    removePunctuation(zh_content)

    traditionalToSimplified(zh_headline)
    chineseTokenize(zh_headline)
    removePunctuation(zh_headline)    
    
    
    # get the document vectors for headlines of entry page section
    getDocVectors(en_headline, en_model, entity,'En', 'Headline')
    getDocVectors(de_headline, de_model, entity,'De', 'Headline')
    getDocVectors(zh_headline, zh_model, entity,'zh', 'Headline')
    
    # get the document vectors for Contents of entry page section
    getDocVectors(en_content, en_model, entity,'En', 'Content')
    getDocVectors(de_content, de_model, entity,'De', 'Content')
    getDocVectors(zh_content, zh_model, entity,'zh', 'Content')
   

FileNotFoundError: [Errno 2] No such file or directory: '/home/hahou/WikiDataCrawling/English Corpus/source_en_Bracak_Obama.json'

# Appendix

# run functions seperately for each entity

In [3]:
# loading source data
with open('/home/hahou/WikiDataCrawling/English Corpus/source_en_United_States.json') as json_data:
    source_en = json.load(json_data)

with open('/home/hahou/WikiDataCrawling/German Corpus/source_de_United_States.json') as json_data:
    source_de = json.load(json_data)


In [4]:
# exact headlies
en_headline = list(source_en.keys())
de_headline = list(source_de.keys())

#extract context
en_content = []
de_content = []
for el in en_headline:
    en_content.append(''.join(source_en[el]))
for el in de_headline:
    de_content.append(''.join(source_de[el]))

In [6]:
# tokenizetion
tokenize(en_content)
tokenize(de_content)

tokenize(en_headline)
tokenize(de_headline)

In [7]:
en_headline

[['united_states'],
 ['etymology'],
 ['history'],
 ['geography', 'climate', 'and', 'environment'],
 ['demographics'],
 ['government', 'and', 'politics'],
 ['law', 'enforcement', 'and', 'crime'],
 ['economy'],
 ['infrastructure'],
 ['culture']]

In [8]:
'''en_content = convert2Common_texts(en_content)
de_content = convert2Common_texts(de_content)
en_content = toLowerCase(en_content)
de_content = toLowerCase(de_content)

en_headline = convert2Common_texts(en_headline)
de_headline = convert2Common_texts(de_headline)
en_headline = toLowerCase(en_headline)
de_headline = toLowerCase(de_headline)'''

'en_content = convert2Common_texts(en_content)\nde_content = convert2Common_texts(de_content)\nen_content = toLowerCase(en_content)\nde_content = toLowerCase(de_content)\n\nen_headline = convert2Common_texts(en_headline)\nde_headline = convert2Common_texts(de_headline)\nen_headline = toLowerCase(en_headline)\nde_headline = toLowerCase(de_headline)'

In [9]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    squaresum = 0
    for i in range(vector.shape[0]):
        squaresum += vector[i]* vector[i]
        
    return squaresum

In [10]:
# input: two 1* n sparse matrixs, or single vector from query_vect/ train_tfidf matrixs
# return the cosine sim of two vectors, type: float

def getCosineSimilarity(query_vector, doc_vector):
    squaresum_query = getSquareSum(query_vector)
    squaresum_doc= getSquareSum(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector)/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector)
        
 

In [73]:
# get the document vectors for headlines of entry page section
getDocVectors(en_headline, en_model, 'United_States','En', 'Headline')
getDocVectors(de_headline, de_model, 'United_States','De', 'Headline')

In [78]:
# get the document vectors for Contents of entry page section
getDocVectors(en_content, en_model, 'United_States','En', 'Content')
getDocVectors(de_content, de_model, 'United_States','De', 'Content')

In [64]:
def saveDocVector(DV, termname, language, aspect):

    with open( language+termname+aspect+".json", 'w') as f:
        json.dump(DV, f)
        
def saveDocVector_headline_De(DV, termname):

    with open( 'De_model_headline'+termname+".json", 'w') as f:
        json.dump(DV, f)

In [65]:
saveDocVector_En(DV_en_content, 'United_States', 'En', 'Headline')
saveDocVector_De(DV_de_content, 'United_States')

In [None]:
getCosineSimilarity(DV_en_content[0],DV_de_content[0])

In [None]:
## input:
# assume we want to link wiki-text in deutsch(arrow language) with the wiki-text in english(target):
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# en_model: english language model
# de_model: deutsch language
# arrow_la: arrow language of the wiki-text
# target_la: target language of the wiki-text
# termname: entry name of the wiki-text

## return: RUN file for Trec_eval

def getWordEmbeddingRun(en_content, de_content, DV_en_content, DV_de_content, target_la, arrow_la, termname):    
    
    ### get the similarity of query with each doc leader
    
    #get document vector for each aspect
    DV_en_content = DV_en_content
    DV_de_content = DV_de_content
    
    
    result = []
    for j in range(len(DV_de_content)):
        print (DV_de_content[j][:10])
        
        sims = []
        results_ID = []
        results = []
        for i in range(len(DV_en_content)):
            s = getCosineSimilarity(DV_de_content[j], DV_en_content[i])
            #if type(s)is not None:
            print(s)
            sims.append(s)
        
        for x in range(len(sims)):
            if sims[x] >0:
                results_ID.append(x)
                results.append(sims[x])
        
        #print(results, results_ID)
        
        if len(results) >0 and len(results_ID)>0:
            results, results_ID= zip(*sorted(zip(results, results_ID), reverse=True))      
       
        
        for m in range(len(results)):
            result.append([arrow_la+'_'+termname+"_h"+str(j), 0, target_la+'_'+termname+'_h'+str(results_ID[m]), m, results[m], 'wordembedding'])

    df = pd.DataFrame(result)
    if len(de_content[0]) > 10 and len(en_content[0])>10 :
        df.to_csv(termname+'_'+arrow_la+'_'+target_la+'_text_WE.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0]) < 10 and len(en_content[0])<10:
        df.to_csv(termname+'_'+arrow_la+'_'+target_la+'_headline_WE.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0]) < 10 and len(en_content[0])>10:
        df.to_csv(termname+'_'+arrow_la+'_'+target_la+'_headline_text_WE.txt', header=None, index=None, sep=' ', mode='a')
    else:
        df.to_csv(termname+'_'+arrow_la+'_'+target_la+'_text_headline_WE.txt', header=None, index=None, sep=' ', mode='a')     
    
    print(df)

In [None]:
# do not run
#DV_en_content = getDocVectors(en_content, en_model)

In [None]:
#DV_de_content = getDocVectors(de_content, de_model)

In [None]:
getWordEmbeddingRun(en_content, de_headline, DV_en_content, DV_de_content,  'en', 'de', 'China')

In [None]:
getWordEmbeddingRun(en_content, de_content, DV_en_content, DV_de_content, 'en', 'de', 'China')

In [None]:
getWordEmbeddingRun(en_headline, de_headline, DV_en_content, DV_de_content, 'en', 'de', 'China')

In [None]:
getWordEmbeddingRun(en_headline, de_content, DV_en_content, DV_de_content, 'en', 'de', 'China')

In [None]:
trs_headline