In [1]:
import json
import numpy as np
import pandas as pd
import math


In [2]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    squaresum = 0
    for i in range(len(vector)):
        squaresum += vector[i]* vector[i]
        
    return squaresum

In [3]:
# input: two 1* n sparse matrixs, or single vector from query_vect/ train_tfidf matrixs
# return the cosine sim of two vectors, type: float

def getCosineSimilarity(query_vector, doc_vector):
    squaresum_query = getSquareSum(query_vector)
    squaresum_doc= getSquareSum(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector)/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector)

In [4]:
## input:
# assume we want to link wiki-text in deutsch(arrow language) with the wiki-text in english(target):
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# en_model: english language model
# de_model: deutsch language
# arrow_la: arrow language of the wiki-text
# target_la: target language of the wiki-text
# termname: entry name of the wiki-text

## return: RUN file for Trec_eval

def getWordEmbeddingRun(en_content, de_content, DV_en_content, DV_de_content,  arrow_la, target_la, termname, vector_source):    
    
    ### get the similarity of query with each doc leader
    
    #get document vector for each aspect
    DV_en_content = DV_en_content
    DV_de_content = DV_de_content
    
    
    result = []
    for j in range(len(DV_en_content)):
        #print (DV_de_content[j][:10])
        
        sims = []
        results_ID = []
        results = []
        for i in range(len(DV_de_content)):
            s = getCosineSimilarity(DV_en_content[j], DV_de_content[i])
            #if type(s)is not None:
            #print(s)
            sims.append(s)
        
        for x in range(len(sims)):
            if sims[x] >0:
                results_ID.append(x)
                results.append(sims[x])
        
        #print(results, results_ID)
        
        if len(results) >0 and len(results_ID)>0:
            results, results_ID= zip(*sorted(zip(results, results_ID), reverse=True))      
       
        
        for m in range(len(results)):
            result.append([arrow_la+'_'+termname+"_h"+str(j), 0, target_la+'_'+termname+'_h'+str(results_ID[m]), m, results[m], 'wordembedding'])
    
    #run for multiple entities
    df = pd.DataFrame(result)
    if len(de_content[0].split()) > 10 and len(en_content[0].split())>10 :
        df.to_csv(vector_source+'_'+arrow_la+'_'+target_la+'_text_WE.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0].split()) < 10 and len(en_content[0].split())<10:
        df.to_csv(vector_source+'_'+arrow_la+'_'+target_la+'_headline_WE.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0].split()) < 10 and len(en_content[0].split())>10:
        df.to_csv(vector_source+'_'+arrow_la+'_'+target_la+'_headline_text_WE.txt', header=None, index=None, sep=' ', mode='a')
    else:
        df.to_csv(vector_source+'_'+arrow_la+'_'+target_la+'_text_headline_WE.txt', header=None, index=None, sep=' ', mode='a')     
    
    print(df)

# WE_MUSE

In [65]:
# enter the entity_name:
'''entity_list = ['United_States','China','France', 'Germany', 'Japan', 'Asia', 'Europe', 'Italy', 
               'United_Kingdom','Canada','Russia','India', 'Israel','Brazil','Philippines', 
               'New_York_City','London','Singapore','Hong_Kong','Dubai','Los_Angeles','Paris',
               'Chicago','Washington,_D.C.','San_Francisco',
               'Mumbai','Rome','Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston',
               'Barcelona','Peking', 'Barack_Obama', 'Donald_Trump']'''


entity_list = ['United_States']

#enter the word vector model: (could be 'muse', 'align', 'bp')
vector_source = 'MUSE'

for entity in entity_list:
    
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity+'.json') as json_data:
        source_de = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
        
        
    # exact headlies
    en_headline = list(source_en.keys())
    de_headline = list(source_de.keys())

    #extract context
    en_content = []
    de_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    for el in de_headline:
        de_content.append(''.join(source_de[el]))
    

    #load document vectors
    
    print('loading document vectors...')
    
    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_MUSE/'+vector_source+'_En_'+entity+'_Content'+'.json') as json_data:
        DV_En_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_MUSE/'+vector_source+'_En_'+entity+'_Headline'+'.json') as json_data:
        DV_En_Headline = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_MUSE/'+vector_source+'_De_'+entity+'_Content'+'.json') as json_data:
        DV_De_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_MUSE/'+vector_source+'_De_'+entity+'_Headline'+'.json') as json_data:
        DV_De_Headline = json.load(json_data)
   



  
    ###cross-lingual linking of aspects, Word Embedding--RCSLS
    print('cross-lingual aspects linking results:')
    
    #1. aspects representation is headline-headline
    getWordEmbeddingRun(en_headline, de_headline, DV_En_Headline, DV_De_Headline, 'en', 'de', entity, vector_source)
    
    #2. aspects representation is content-content
    getWordEmbeddingRun(en_content, de_content, DV_En_Content, DV_De_Content,  'en', 'de', entity, vector_source)
    
    #3. aspects representation is headline-content
    getWordEmbeddingRun(en_content, de_headline, DV_En_Content, DV_De_Headline,  'en', 'de', entity, vector_source)
    
    #4. aspects representation is content-headline
    getWordEmbeddingRun(en_headline, de_content, DV_En_Headline, DV_De_Content,  'en', 'de', entity, vector_source)
                
    

loading document vectors...
cross-lingual aspects linking results:
                      0  1                    2  3         4              5
0   en_United_States_h0  0  de_United_States_h0  0  0.682934  wordembedding
1   en_United_States_h0  0  de_United_States_h4  1  0.199907  wordembedding
2   en_United_States_h0  0  de_United_States_h5  2  0.190947  wordembedding
3   en_United_States_h0  0  de_United_States_h2  3  0.187442  wordembedding
4   en_United_States_h0  0  de_United_States_h3  4  0.182061  wordembedding
5   en_United_States_h0  0  de_United_States_h7  5  0.180614  wordembedding
6   en_United_States_h0  0  de_United_States_h6  6  0.178186  wordembedding
7   en_United_States_h0  0  de_United_States_h1  7  0.160420  wordembedding
8   en_United_States_h0  0  de_United_States_h8  8  0.131717  wordembedding
9   en_United_States_h1  0  de_United_States_h1  0  0.596347  wordembedding
10  en_United_States_h1  0  de_United_States_h2  1  0.400548  wordembedding
11  en_United_States_

# WE_RCSLS EN_DE

In [17]:
entity_list = ['United_States','China','France', 'Germany', 'Japan', 'Asia', 'Europe', 'Italy', 
               'United_Kingdom','Canada','Russia','India', 'Israel','Brazil','Philippines', 
               'New_York_City','London','Singapore','Barack_Obama', 'Donald_Trump','Dubai','Los_Angeles','Paris',
               'Chicago','Washington,_D.C.','San_Francisco',
               'Mumbai','Rome','Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston',
               'Barcelona','Peking']

# define the entity list
#entity_list = ['Canada','Russia','singapore','India', 'Israel','Brazil']
#entity_list = ['Asia', 'Italy', 'United_Kingdom']

#entity_list = ['China','France', 'Germany', 'Japan'] 

#entity_list = ['Barack_Obama', 'Donald_Trump']

#entity_list = ['United_States'] 'China','United_States','China','France', 'Germany', 'Japan', 'Asia', 'Europe', 'Italy', 
               #'United_Kingdom','Canada','Russia','India', 'Israel','Brazil','Philippines', 
               #'New_York_City','London','Singapore','Hong_Kong',


vector_source = 'align'

for entity in entity_list:
    
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity+'.json') as json_data:
        source_de = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
        
        
    # exact headlies
    en_headline = list(source_en.keys())
    de_headline = list(source_de.keys())

    #extract context
    en_content = []
    de_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    for el in de_headline:
        de_content.append(''.join(source_de[el]))
    

    #load document vectors
    
    print('loading document vectors...')
    
    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_En_'+entity+'_Content'+'.json') as json_data:
        DV_En_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_En_'+entity+'_Headline'+'.json') as json_data:
        DV_En_Headline = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_De_'+entity+'_Content'+'.json') as json_data:
        DV_De_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_De_'+entity+'_Headline'+'.json') as json_data:
        DV_De_Headline = json.load(json_data)
   



  
    ###cross-lingual linking of aspects, Word Embedding--RCSLS
    print('cross-lingual aspects linking results')
    #1. aspects representation is headline-headline
    getWordEmbeddingRun(en_headline, de_headline, DV_En_Headline, DV_De_Headline, 'en', 'de', entity, vector_source)
    
    #2. aspects representation is content-content
    getWordEmbeddingRun(en_content, de_content, DV_En_Content, DV_De_Content,  'en', 'de', entity, vector_source)
    
    #3. aspects representation is headline-content
    getWordEmbeddingRun(en_content, de_headline, DV_En_Content, DV_De_Headline,  'en', 'de', entity, vector_source)
    
    #4. aspects representation is content-headline
    getWordEmbeddingRun(en_headline, de_content, DV_En_Headline, DV_De_Content,  'en', 'de', entity, vector_source)
                
    

loading document vectors...
cross-lingual aspects linking results
                      0  1                    2  3         4              5
0   en_United_States_h0  0  de_United_States_h0  0  0.381292  wordembedding
1   en_United_States_h0  0  de_United_States_h4  1  0.021407  wordembedding
2   en_United_States_h1  0  de_United_States_h1  0  0.233545  wordembedding
3   en_United_States_h1  0  de_United_States_h2  1  0.152140  wordembedding
4   en_United_States_h1  0  de_United_States_h4  2  0.112786  wordembedding
5   en_United_States_h1  0  de_United_States_h8  3  0.059229  wordembedding
6   en_United_States_h2  0  de_United_States_h4  0  0.528069  wordembedding
7   en_United_States_h2  0  de_United_States_h1  1  0.227289  wordembedding
8   en_United_States_h2  0  de_United_States_h2  2  0.210836  wordembedding
9   en_United_States_h2  0  de_United_States_h8  3  0.165871  wordembedding
10  en_United_States_h2  0  de_United_States_h5  4  0.147655  wordembedding
11  en_United_States_h

FileNotFoundError: [Errno 2] No such file or directory: '/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/align_En_Dubai_Content.json'

# WE_RCSLS EN_ZH

In [5]:
entity_list = ['United_States','China','France', 'Germany', 'Japan', 'Asia', 'Europe', 'Italy', 
              'United_Kingdom','Canada','Russia','India', 'Israel','Brazil','Philippines', 
             'New_York_City','London','Singapore','Barack_Obama', 'Donald_Trump']

#missing: EN_ZH 'Hong_Kong', 'Dubai','Los_Angeles','Paris',
'''
entity_list = [
               'Chicago','Washington,_D.C.','San_Francisco',
               'Mumbai','Rome','Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston',
               'Barcelona','Peking', 'Barack_Obama', 'Donald_Trump']'''

# define the entity list
#entity_list = ['Canada','Russia','singapore','India', 'Israel','Brazil']
#entity_list = ['Asia', 'Europe', 'Italy', 'United_Kingdom']

#entity_list = ['China','France', 'Germany', 'Japan'] 

#entity_list = ['Barack_Obama', 'Donald_Trump']

#entity_list = ['United_States']


vector_source = 'Align'

for entity in entity_list:
    
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_zh_'+entity+'.json') as json_data:
        source_zh = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
        
        
    # exact headlies
    en_headline = list(source_en.keys())
    zh_headline = list(source_zh.keys())

    #extract context
    en_content = []
    zh_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    for el in zh_headline:
        zh_content.append(''.join(source_zh[el]))
    

    #load document vectors
    
    print('loading document vectors...')
    
    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_En_'+entity+'_Content'+'.json') as json_data:
        DV_En_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_En_'+entity+'_Headline'+'.json') as json_data:
        DV_En_Headline = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_zh_'+entity+'_Content'+'.json') as json_data:
        DV_zh_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_Align/'+vector_source+'_zh_'+entity+'_Headline'+'.json') as json_data:
        DV_zh_Headline = json.load(json_data)
   



  
    ###cross-lingual linking of aspects, Word Embedding--RCSLS
    print('cross-lingual aspects linking results')
    #1. aspects representation is headline-headline
    getWordEmbeddingRun(en_headline, zh_headline, DV_En_Headline, DV_zh_Headline, 'en', 'zh', entity, vector_source)
    
    #2. aspects representation is content-content
    getWordEmbeddingRun(en_content, zh_content, DV_En_Content, DV_zh_Content,  'en', 'zh', entity, vector_source)
    
    #3. aspects representation is headline-content
    getWordEmbeddingRun(en_content, zh_headline, DV_En_Content, DV_zh_Headline,  'en', 'zh', entity, vector_source)
    
    #4. aspects representation is content-headline
    getWordEmbeddingRun(en_headline, zh_content, DV_En_Headline, DV_zh_Content,  'en', 'zh', entity, vector_source)
                
    

loading document vectors...
cross-lingual aspects linking results
                       0  1                     2   3         4              5
0    en_United_States_h0  0   zh_United_States_h0   0  0.333090  wordembedding
1    en_United_States_h0  0   zh_United_States_h1   1  0.330093  wordembedding
2    en_United_States_h0  0  zh_United_States_h13   2  0.315872  wordembedding
3    en_United_States_h0  0   zh_United_States_h9   3  0.307869  wordembedding
4    en_United_States_h0  0   zh_United_States_h8   4  0.295692  wordembedding
5    en_United_States_h0  0   zh_United_States_h7   5  0.286303  wordembedding
6    en_United_States_h0  0   zh_United_States_h4   6  0.277914  wordembedding
7    en_United_States_h0  0  zh_United_States_h11   7  0.276671  wordembedding
8    en_United_States_h0  0  zh_United_States_h12   8  0.269418  wordembedding
9    en_United_States_h0  0   zh_United_States_h3   9  0.268940  wordembedding
10   en_United_States_h0  0   zh_United_States_h2  10  0.265973  

# BP EN_ZH

In [59]:
# enter the entity_name:
entity_list = ['United_States']

#enter the word vector model: (could be 'muse', 'align', 'bp')
vector_source = 'BP'

for entity in entity_list:
    
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_zh_'+entity+'.json') as json_data:
        source_zh = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
        
        
    # exact headlies
    en_headline = list(source_en.keys())
    zh_headline = list(source_zh.keys())

    #extract context
    en_content = []
    zh_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    for el in zh_headline:
        zh_content.append(''.join(source_zh[el]))
    

    #load document vectors
    
    print('loading document vectors...')
    
    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_BP/'+vector_source+'_En_'+entity+'_Content'+'.json') as json_data:
        DV_En_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_BP/'+vector_source+'_En_'+entity+'_Headline'+'.json') as json_data:
        DV_En_Headline = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_BP/'+vector_source+'_zh_'+entity+'_Content'+'.json') as json_data:
        DV_zh_Content = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/DocumentVector/WE_BP/'+vector_source+'_zh_'+entity+'_Headline'+'.json') as json_data:
        DV_zh_Headline = json.load(json_data)
   

              
    

  
    ###cross-lingual linking of aspects, Word Embedding--RCSLS
    print('cross-lingual aspects linking results:')
    
    #1. aspects representation is headline-headline EN_ZH BP
    getWordEmbeddingRun(en_headline, zh_headline, DV_En_Headline, DV_zh_Headline, 'en', 'zh', entity, vector_source)
    
    #2. aspects representation is content-content
    getWordEmbeddingRun(en_content, zh_content, DV_En_Content, DV_zh_Content,  'en', 'zh', entity, vector_source)
    
    #3. aspects representation is headline-content
    getWordEmbeddingRun(en_content, zh_headline, DV_En_Content, DV_zh_Headline,  'en', 'zh', entity, vector_source)
    
    #4. aspects representation is content-headline
    getWordEmbeddingRun(en_headline, zh_content, DV_En_Headline, DV_zh_Content,  'en', 'zh', entity, vector_source)
  

loading document vectors...
cross-lingual aspects linking results:
                      0  1                     2   3         4              5
0   en_United_States_h0  0   zh_United_States_h8   0  0.009221  wordembedding
1   en_United_States_h0  0   zh_United_States_h7   1  0.008815  wordembedding
2   en_United_States_h0  0  zh_United_States_h11   2  0.005950  wordembedding
3   en_United_States_h0  0  zh_United_States_h13   3  0.005532  wordembedding
4   en_United_States_h0  0   zh_United_States_h0   4  0.005532  wordembedding
5   en_United_States_h0  0   zh_United_States_h9   5  0.002268  wordembedding
6   en_United_States_h0  0   zh_United_States_h1   6  0.000637  wordembedding
7   en_United_States_h2  0   zh_United_States_h7   0  0.002628  wordembedding
8   en_United_States_h3  0  zh_United_States_h13   0  0.031977  wordembedding
9   en_United_States_h3  0   zh_United_States_h0   1  0.031977  wordembedding
10  en_United_States_h3  0   zh_United_States_h7   2  0.029159  wordembeddi