In [13]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")+['edit']
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from googletrans import Translator
from collections import defaultdict

In [15]:
import json
import math

# base knowledge:
1. for each wikipedia term, there is a term name
2. for each wikipedia term, its English page was crawled as source text in language a -"en": source_a
3. for each wikipedia term, its German page was crawled as source text in language b -"de": source_b
4. for each source page in language b -"de", it will be translated into language a, saved as translation_b_to_a
5. links between source_a and source_b are to be represented by the links between source-a and translation_b_to_a



# getTranslationRun Function

In [16]:
### translate the text in the format of dictionary from src language into dest language

##input:
# text: text in dictionary format {}
# src_b: source language b of the text
# dest_a: target language a that the text is translated into
# termname: term name of the text

def getTranslationRun(text, src_b, dest_a, termname):
    """Translate every headline and its section text into ``dest_a``.

    Parameters
    ----------
    text : dict
        Maps each headline (str) to the text below it. The visible callers
        pass the dict loaded from a crawled source JSON file.
    src_b : str
        Source language code. Accepted for interface compatibility but not
        forwarded to the translator (no ``src`` argument is passed).
    dest_a : str
        Language code the text is translated into.
    termname : str
        Term name of the page. Currently unused.

    Returns
    -------
    collections.defaultdict
        Translated headline -> list of translated text pieces.

    Side effects
    ------------
    Prints the resulting dictionary.
    """
    translator = Translator()
    dict_trans = defaultdict(list)
    for headline, body in text.items():
        translated_headline = translator.translate(text=headline, dest=dest_a)
        translated_body = translator.translate(text=body, dest=dest_a)
        # A single (non-list) result cannot be iterated; wrap it so a section
        # stored as one string is handled like a one-element list.
        if not isinstance(translated_body, list):
            translated_body = [translated_body]
        for piece in translated_body:
            dict_trans[translated_headline.text].append(piece.text)

    print(dict_trans)
    return dict_trans

In [17]:
import requests


# data preprocessing Function

In [18]:
# data preprocessing function

def toLowerList(List):
    """Lower-case every string of ``List`` in place and return that same list."""
    for position, item in enumerate(List):
        List[position] = item.lower()
    return List

def tokenizeDVSM(file):
    """Tokenize each entry of ``file`` in place.

    Every entry is converted to ``str``, split by ``WordPunctTokenizer`` and
    stored back as the tokens joined by single spaces. Returns nothing.
    """
    splitter = WordPunctTokenizer()
    for position, entry in enumerate(file):
        file[position] = ' '.join(splitter.tokenize(str(entry)))

def removeStopwords(file):
    """Drop every whitespace-separated word found in ``esw`` from each entry, in place.

    ``esw`` is the module-level stop-word list. Returns nothing.
    """
    for position, entry in enumerate(file):
        kept = [token for token in entry.split() if token not in esw]
        file[position] = ' '.join(kept)

# Extend the punctuation string imported from ``string`` with the
# multi-character punctuation tokens seen in the crawled text.
# NOTE: this rebinds the module-level name, so running the statement again
# appends the same suffix a second time.
punctuation = punctuation + ''.join((
    '%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—', '."', '.,',
))
def removePunctuation(file):
    """Remove punctuation tokens from each entry of ``file``, in place.

    ``punctuation`` is a single string, so ``token not in punctuation`` is a
    substring test: a whitespace-separated token is dropped when it occurs
    anywhere inside that string. Returns nothing.
    """
    for position, entry in enumerate(file):
        file[position] = ' '.join(
            token for token in entry.split() if token not in punctuation
        )

def stemming(file):
    """Lower-case each entry of ``file`` and Porter-stem every word, in place.

    Words are split on whitespace and re-joined with single spaces.
    Returns nothing.
    """
    stemmer = PorterStemmer()
    for position, entry in enumerate(file):
        file[position] = ' '.join(map(stemmer.stem, entry.lower().split()))

def removeNumbers(file):
    """Drop every all-digit word from each entry of ``file``, in place.

    A word is removed only when ``str.isdigit()`` is true for the whole word,
    so mixed tokens such as ``b3`` are kept. Returns nothing.
    """
    for position, entry in enumerate(file):
        kept = filter(lambda token: not token.isdigit(), entry.split())
        file[position] = ' '.join(kept)

        
def _lemmatize_pos(file, pos):
    """Lower-case each entry of ``file`` and lemmatize every word as ``pos``, in place."""
    lemmatizer = WordNetLemmatizer()
    for position, entry in enumerate(file):
        file[position] = ' '.join(
            lemmatizer.lemmatize(word, pos=pos) for word in entry.lower().split()
        )


def lemmatize_n(file):
    """Lemmatize every word of each entry with ``pos='n'``, in place. Returns nothing."""
    _lemmatize_pos(file, 'n')


def lemmatize_v(file):
    """Lemmatize every word of each entry with ``pos='v'``, in place. Returns nothing."""
    _lemmatize_pos(file, 'v')


def lemmatize_a(file):
    """Lemmatize every word of each entry with ``pos='a'``, in place. Returns nothing."""
    _lemmatize_pos(file, 'a')


def lemmatize_r(file):
    """Lemmatize every word of each entry with ``pos='r'``, in place. Returns nothing."""
    _lemmatize_pos(file, 'r')

In [19]:
#preprocessing: tokenize, remove punctuation, stopwords and numbers, then lemmatize

def preprocessing(content):
    """Clean ``content`` (a list of strings) in place; returns nothing.

    The steps run in this fixed order: tokenization, punctuation removal,
    stop-word removal, number removal, then lemmatization as noun, verb,
    adjective and adverb.
    """
    pipeline = (
        tokenizeDVSM,
        removePunctuation,
        removeStopwords,
        removeNumbers,
        lemmatize_n,   # nouns
        lemmatize_v,   # verbs
        lemmatize_a,   # adjectives
        lemmatize_r,   # adverbs
    )
    for step in pipeline:
        step(content)

# DTM, Query-vector building

In [20]:
##### build functions to generate document-term matrix

## get DTM, weighted by tfidf, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text

def get_DTM_tfidf(file):
    """Build the tf-idf weighted document-term matrix of ``file``.

    ``file`` is an iterable of document strings. The result is whatever
    ``TfidfTransformer().fit_transform`` returns for the count matrix.
    """
    # First pass only discovers the vocabulary; its count matrix is discarded.
    vocab_finder = CountVectorizer()
    vocab_finder.fit_transform(file)
    vocabulary = list(vocab_finder.vocabulary_.keys())

    # Second pass counts terms against that explicit vocabulary list
    # (presumably so the columns line up with get_QueryVector_tfidf, which
    # derives its vocabulary the same way -- confirm before simplifying).
    counts = CountVectorizer(vocabulary=vocabulary).fit_transform(file)

    return TfidfTransformer().fit_transform(counts)


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    """Build the raw term-count document-term matrix of ``file``.

    ``file`` is an iterable of document strings.
    """
    # First pass only discovers the vocabulary; its count matrix is discarded.
    vocab_finder = CountVectorizer()
    vocab_finder.fit_transform(file)
    vocabulary = list(vocab_finder.vocabulary_.keys())

    # Second pass counts terms against that explicit vocabulary list.
    return CountVectorizer(vocabulary=vocabulary).fit_transform(file)


In [21]:
###Generate query vector for each query

# get_QueryVector_tfidf helps to get the tiidf weighted query vector, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text, train_query_file.text


def get_QueryVector_tfidf(queryFile, docFile):
    """Build query vectors over the vocabulary of ``docFile``.

    Parameters
    ----------
    queryFile : str or iterable of str
        A single query string, or a collection of query documents.
    docFile : iterable of str
        Documents whose vocabulary defines the vector columns.

    Returns
    -------
    numpy.ndarray or matrix
        * For a single string: a 1-D float array with one entry per
          vocabulary term, set when the term occurs in the query and scaled
          to unit length. A query sharing no term with the vocabulary gives
          an all-zero vector (previously this divided by a zero norm and
          produced NaNs).
        * Otherwise: the result of ``TfidfTransformer().fit_transform`` on
          the query counts. Note that the transformer is fitted on the
          queries themselves, not on ``docFile``.
    """
    # The vocabulary comes from the documents only; the counts are discarded.
    vocab_finder = CountVectorizer()
    vocab_finder.fit_transform(docFile)
    vocabulary = list(vocab_finder.vocabulary_.keys())

    if isinstance(queryFile, str):
        query_terms = set(queryFile.split())
        presence = np.array(
            [1.0 if term in query_terms else 0.0 for term in vocabulary]
        )
        norm = np.linalg.norm(presence)
        if norm == 0:
            # No overlap with the vocabulary: keep the zero vector as is.
            return presence
        return presence / norm

    query_counts = CountVectorizer(
        analyzer="word", vocabulary=vocabulary
    ).fit_transform(queryFile)
    return TfidfTransformer().fit_transform(query_counts)



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    """Build unweighted query vectors over the vocabulary of ``docFile``.

    For a single query string the result is a plain list with one 0/1 entry
    per vocabulary term (1 when the term occurs among the whitespace-split
    query words). For any other input the result is the count matrix that
    ``CountVectorizer`` produces for the query documents.
    """
    # The vocabulary comes from the documents only; the counts are discarded.
    vocab_finder = CountVectorizer()
    vocab_finder.fit_transform(docFile)
    vocabulary = list(vocab_finder.vocabulary_.keys())

    if type(queryFile) is not str:
        counter = CountVectorizer(analyzer="word", vocabulary=vocabulary)
        return counter.fit_transform(queryFile)

    query_terms = queryFile.split()
    return [1 if term in query_terms else 0 for term in vocabulary]

# cosine similarity Function

In [22]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    """Return the sum of squared entries of row 0 of a 2-D ``vector``.

    Entries are read as ``vector[0, col]`` for every column in
    ``vector.shape[1]``.
    """
    return sum(
        vector[0, col] * vector[0, col] for col in range(vector.shape[1])
    )

In [23]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def _as_flat_array(vector):
    """Return ``vector`` as a dense 1-D float array.

    Objects exposing ``toarray()`` are densified first; anything else goes
    through ``np.asarray``.
    """
    if hasattr(vector, "toarray"):
        vector = vector.toarray()
    return np.asarray(vector, dtype=float).ravel()


# input: a 1 x n sparse row (e.g. from query_vect / train_tfidf), or a dense vector
def getSquareSum(vector):
    """Return the sum of the squared entries of ``vector`` as a float."""
    flat = _as_flat_array(vector)
    return float(np.dot(flat, flat))


# input: two 1 x n sparse rows, or two dense vectors of equal length
# return the cosine sim of two vectors, type: float
def getCosineSimilarity(query_vector, doc_vector):
    """Return the cosine similarity of ``query_vector`` and ``doc_vector``.

    This cell used to define the function twice, and the second definition
    failed on the 1 x n sparse rows that getDocumentVectorSpaceModelRun
    passes in. This single definition accepts both sparse rows and dense
    vectors.

    When either vector has zero length the plain dot product (0.0) is
    returned instead of dividing by zero.
    """
    query = _as_flat_array(query_vector)
    doc = _as_flat_array(doc_vector)
    dot_product = float(np.dot(query, doc))
    norm_product = math.sqrt(getSquareSum(query)) * math.sqrt(getSquareSum(doc))
    if norm_product > 0:
        return dot_product / norm_product
    return dot_product

# getVectorSpaceModelRun Function

Link headlines in the source text with headlines in the translated text through cosine similarity.

In [24]:
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# we link the headlines of the wikipedia item in different languages through translation text and source text
# further more, link of headlines is predicted through similarity between the text vectors below the headlines
# la1: language code used to label the query aspects (the rows of the first argument)
# la2: language code used to label the document aspects (the rows of the second argument)
# termname: term name of the text

# return: RUN file for Trec_eval

def getDocumentVectorSpaceModelRun(trs_content, src_content, la1, la2, termname):
    """Rank the entries of ``src_content`` for every entry of ``trs_content``.

    The first argument provides the queries and the second the documents;
    both are lists of preprocessed strings. For each query, every document
    with a cosine similarity above zero is kept, ordered by descending score.

    Each output row is
    ``[<la1>_<termname>_h<query index>, 0, <la2>_<termname>_h<doc index>,
    rank, score, 'cosinesims']``.

    Side effects: the rows are appended (``mode='a'``) to a space-separated
    file in the working directory whose suffix depends on whether the first
    query and first document have more or fewer than 10 words, and the
    DataFrame is printed. Returns nothing.
    """
    queryVector = get_QueryVector_tfidf(trs_content, src_content)
    src_DTM = get_DTM_tfidf(src_content)

    run_rows = []
    for query_idx in range(queryVector.shape[0]):
        scores = [
            getCosineSimilarity(queryVector[query_idx], src_DTM[doc_idx,])
            for doc_idx in range(src_DTM.shape[0])
        ]

        # Keep positive scores only; sort by score, then document index, descending.
        ranked = sorted(
            ((score, doc_idx) for doc_idx, score in enumerate(scores) if score > 0),
            reverse=True,
        )

        query_id = la1+'_'+termname+"_h"+str(query_idx)
        for rank, (score, doc_idx) in enumerate(ranked):
            doc_id = la2+'_'+termname+'_h'+str(doc_idx)
            run_rows.append([query_id, 0, doc_id, rank, score, 'cosinesims'])

    df = pd.DataFrame(run_rows)

    # Entries with more than 10 words count as text, fewer than 10 as headline;
    # every remaining combination falls through to the last suffix.
    prefix = 'DWSM_'+termname+'_'+la1+'_'+la2
    if len(trs_content[0].split()) > 10 and len(src_content[0].split()) > 10:
        target = prefix+'_text.txt'
    elif len(trs_content[0].split()) < 10 and len(src_content[0].split()) < 10:
        target = prefix+'_headline.txt'
    elif len(trs_content[0].split()) < 10 and len(src_content[0].split()) > 10:
        target = prefix+'_headline_text.txt'
    else:
        target = prefix+'_text_headline.txt'
    df.to_csv(target, header=None, index=None, sep=' ', mode='a')

    print(df)

# Make Functions Run!
1. data loading as source_a and source_b
2. translating source_b into translation_b_to_a
3. data preprocessing 
4. make baseline run(headlines)
5. make cross link run(contexts)



In [25]:
from gensim.models import KeyedVectors

# importing word vector for WVSM
# Loads word vectors in word2vec text format from an absolute,
# machine-specific path.
# NOTE(review): `en_model` is not referenced again in the visible part of this
# notebook -- presumably it is meant to be passed to getDocVectorsWVSMFinal
# as `language_model`; confirm.
en_model = KeyedVectors.load_word2vec_format('/Users/hailianhou/Desktop/MasterThesis/Wiki Data/wiki-news-300d-1M.vec')

In [30]:
# 'United_States','Germany', 'Japan','China', 'France','Italy', 'Canada', 'United_Kingdom', 
#'Asia','Europe', 'Russia','singapore','India', 'Israel','Brazil','Philippines'
# 'New_York_City','London','Singapore','Hong_Kong','Dubai','Los_Angeles','Paris','Chicago','San_Francisco','Mumbai','Rome',
#'Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston','Barcelona','Peking'

# Wikipedia term handled by the single-entity cells below; it is inserted into
# the source JSON file names and into the run labels / output file names.
entity_name = 'United_States'

In [31]:
# loading source data
# The crawled pages are JSON files; read them explicitly as UTF-8 so the
# Chinese text does not depend on the platform's default encoding.
with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_zh_'+entity_name+'.json', encoding='utf-8') as json_data:
    source_zh = json.load(json_data)

with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity_name+'.json', encoding='utf-8') as json_data:
    source_en = json.load(json_data)

In [32]:
## input:
# source_zh: dict loaded from the Chinese source JSON above
## output:
# translation_zh_to_en: dict mapping each translated headline to a list of translated text pieces
# NOTE: the recorded run of this cell ended in a JSONDecodeError, so
# translation_zh_to_en was not created in that session.

translation_zh_to_en = getTranslationRun(source_zh, 'zh', 'en', entity_name)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [33]:
# rename to the language-neutral names used by the following cells:
#   source_a           -> English source page
#   source_b           -> Chinese source page
#   translation_b_to_a -> Chinese page translated into English

source_a = source_en
source_b = source_zh

translation_b_to_a = translation_zh_to_en


NameError: name 'translation_zh_to_en' is not defined

In [758]:
# extract headlines (the dictionary keys)
src_headline = list(source_a.keys())
trs_headline = list(translation_b_to_a .keys())

# extract content: one string per headline, built by concatenating the
# pieces stored under that headline
# NOTE(review): ''.join inserts no separator, so the last word of one piece
# and the first word of the next can fuse -- confirm the pieces carry their
# own whitespace, or join with ' '.
src_content = []
trs_content = []
for el in src_headline:
    src_content.append(''.join(source_a[el]))
for el in trs_headline:
    trs_content.append(''.join(translation_b_to_a[el]))

In [759]:
# preprocessing() rewrites each list in place, so after this cell the four
# lists hold cleaned, space-joined tokens instead of the raw text.
preprocessing(src_content)
preprocessing(trs_content)

# The headline lists are overwritten as well; the original headline strings
# are no longer available from these variables afterwards.
preprocessing(src_headline)
preprocessing(trs_headline)

In [760]:
# en-zh aspect links, content vs. content:
# the src_content entries are the queries, the trs_content entries the documents.
# NOTE: the output recorded below (Amsterdam, 'de') comes from an earlier run
# with different inputs.

getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'zh', entity_name)

                    0  1                2  3         4           5
0     en_Amsterdam_h0  0  de_Amsterdam_h0  0  0.538282  cosinesims
1     en_Amsterdam_h0  0  de_Amsterdam_h7  1  0.528043  cosinesims
2     en_Amsterdam_h0  0  de_Amsterdam_h3  2  0.468488  cosinesims
3     en_Amsterdam_h0  0  de_Amsterdam_h1  3  0.333967  cosinesims
4     en_Amsterdam_h0  0  de_Amsterdam_h6  4  0.289278  cosinesims
5     en_Amsterdam_h0  0  de_Amsterdam_h4  5  0.271678  cosinesims
6     en_Amsterdam_h0  0  de_Amsterdam_h5  6  0.214613  cosinesims
7     en_Amsterdam_h0  0  de_Amsterdam_h2  7  0.154069  cosinesims
8     en_Amsterdam_h0  0  de_Amsterdam_h8  8  0.081040  cosinesims
9     en_Amsterdam_h1  0  de_Amsterdam_h3  0  0.206090  cosinesims
10    en_Amsterdam_h1  0  de_Amsterdam_h0  1  0.110817  cosinesims
11    en_Amsterdam_h1  0  de_Amsterdam_h4  2  0.071878  cosinesims
12    en_Amsterdam_h1  0  de_Amsterdam_h5  3  0.052722  cosinesims
13    en_Amsterdam_h1  0  de_Amsterdam_h7  4  0.051014  cosine

In [761]:
# en-zh aspect links, headline vs. headline:
# the src_headline entries are the queries, the trs_headline entries the documents.

getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'zh', entity_name)

                 0  1                2  3         4           5
0  en_Amsterdam_h0  0  de_Amsterdam_h0  0  1.000000  cosinesims
1  en_Amsterdam_h2  0  de_Amsterdam_h3  0  0.920794  cosinesims
2  en_Amsterdam_h3  0  de_Amsterdam_h2  0  0.920794  cosinesims
3  en_Amsterdam_h5  0  de_Amsterdam_h5  0  0.920794  cosinesims
4  en_Amsterdam_h6  0  de_Amsterdam_h7  0  0.677373  cosinesims
5  en_Amsterdam_h7  0  de_Amsterdam_h6  0  0.920794  cosinesims
6  en_Amsterdam_h8  0  de_Amsterdam_h4  0  0.920794  cosinesims


In [762]:
# en-zh aspect links, content vs. headline:
# the src_content entries are the queries, the trs_headline entries the documents.

getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'zh', entity_name)

                   0  1                2  3         4           5
0    en_Amsterdam_h0  0  de_Amsterdam_h0  0  0.988362  cosinesims
1    en_Amsterdam_h0  0  de_Amsterdam_h7  1  0.083340  cosinesims
2    en_Amsterdam_h0  0  de_Amsterdam_h1  2  0.080272  cosinesims
3    en_Amsterdam_h2  0  de_Amsterdam_h0  0  0.998465  cosinesims
4    en_Amsterdam_h2  0  de_Amsterdam_h1  1  0.049702  cosinesims
5    en_Amsterdam_h3  0  de_Amsterdam_h0  0  1.000000  cosinesims
6    en_Amsterdam_h4  0  de_Amsterdam_h0  0  0.998861  cosinesims
7    en_Amsterdam_h4  0  de_Amsterdam_h6  1  0.043939  cosinesims
8    en_Amsterdam_h5  0  de_Amsterdam_h0  0  0.893462  cosinesims
9    en_Amsterdam_h5  0  de_Amsterdam_h5  1  0.347134  cosinesims
10   en_Amsterdam_h5  0  de_Amsterdam_h1  2  0.162431  cosinesims
11   en_Amsterdam_h5  0  de_Amsterdam_h3  3  0.074839  cosinesims
12   en_Amsterdam_h5  0  de_Amsterdam_h6  4  0.054419  cosinesims
13   en_Amsterdam_h6  0  de_Amsterdam_h0  0  0.945846  cosinesims
14   en_Am

In [763]:
# en-zh aspect links, headline vs. content:
# the src_headline entries are the queries, the trs_content entries the documents.

getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'zh', entity_name)

                   0  1                2  3         4           5
0    en_Amsterdam_h0  0  de_Amsterdam_h1  0  0.632870  cosinesims
1    en_Amsterdam_h0  0  de_Amsterdam_h0  1  0.407440  cosinesims
2    en_Amsterdam_h0  0  de_Amsterdam_h7  2  0.406263  cosinesims
3    en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.276485  cosinesims
4    en_Amsterdam_h0  0  de_Amsterdam_h3  4  0.209986  cosinesims
5    en_Amsterdam_h0  0  de_Amsterdam_h5  5  0.172987  cosinesims
6    en_Amsterdam_h0  0  de_Amsterdam_h4  6  0.124671  cosinesims
7    en_Amsterdam_h0  0  de_Amsterdam_h8  7  0.084833  cosinesims
8    en_Amsterdam_h0  0  de_Amsterdam_h2  8  0.072041  cosinesims
9    en_Amsterdam_h2  0  de_Amsterdam_h5  0  0.056425  cosinesims
10   en_Amsterdam_h5  0  de_Amsterdam_h5  0  0.047657  cosinesims
11   en_Amsterdam_h5  0  de_Amsterdam_h3  1  0.014463  cosinesims
12   en_Amsterdam_h7  0  de_Amsterdam_h8  0  0.221367  cosinesims
13   en_Amsterdam_h8  0  de_Amsterdam_h6  0  0.072147  cosinesims
14   en_Am

entity_list = ['Germany', 'Japan','France','Italy', 'United_Kingdom', 'Asia','Europe']


# Run the whole en-de pipeline once per entity.
# The term name passed to every step is the loop variable `entity`; the
# previous version passed `el`, a leftover variable from the headline loops,
# so run labels and output file names carried the wrong term.
for entity in entity_list:
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity+'.json') as json_data:
        source_de = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)

    ## input:
    # source_de: dict loaded from the German source JSON

    translation_de_to_en = getTranslationRun(source_de, 'de', 'en', entity)

    # rename to the language-neutral names

    source_a = source_en
    source_b = source_de

    translation_b_to_a = translation_de_to_en

    # extract headlines
    src_headline = list(source_a.keys())
    trs_headline = list(translation_b_to_a.keys())

    # extract content: one concatenated string per headline
    src_content = []
    trs_content = []
    for headline in src_headline:
        src_content.append(''.join(source_a[headline]))
    for headline in trs_headline:
        trs_content.append(''.join(translation_b_to_a[headline]))

    # preprocessing (rewrites each list in place)
    preprocessing(src_content)
    preprocessing(trs_content)

    preprocessing(src_headline)
    preprocessing(trs_headline)

    # predict headline links through content--content
    getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'de', entity)

    # predict headline links through headline--headline
    getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'de', entity)

    # predict headline links through content--headline
    getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'de', entity)

    # predict headline links through headline--content
    getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'de', entity)

    

In [271]:
import json
import nltk
import math
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")+['edit']
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [499]:
from operator import add

In [500]:
def traditionalToSimplified(file):
    """Replace each entry of ``file`` with ``HanziConv.toSimplified(entry)``, in place.

    Returns nothing.
    NOTE(review): ``HanziConv`` is not imported in the visible part of this
    notebook -- confirm the import exists before running on a fresh kernel.
    """
    for position, entry in enumerate(file):
        file[position] = ''.join(HanziConv.toSimplified(entry))
        
def chineseTokenize(file):
    """Segment each entry of ``file`` with ``jieba.cut`` and re-join with spaces, in place.

    Returns nothing.
    NOTE(review): ``jieba`` is not imported in the visible part of this
    notebook -- confirm the import exists before running on a fresh kernel.
    """
    for position, entry in enumerate(file):
        segments = jieba.cut(entry, cut_all=False, HMM=True)
        file[position] = ' '.join(segments)

        
from nltk.tokenize import MWETokenizer
def MWEtokenize(el):
    """Split ``el`` on whitespace and run the words through ``MWETokenizer``.

    The tokenizer is created without any registered multi-word expression.
    Returns the token list produced by ``MWETokenizer.tokenize``.
    """
    return MWETokenizer().tokenize(el.split())

# Extend the punctuation string imported from ``string`` with full-width
# Chinese punctuation, a pasted regex-style character list, the word '编辑',
# and the multi-character punctuation tokens seen in the crawled text.
# The backslashes are written as '\\' so the value is unchanged while the
# invalid escape sequences (\s, \., \!, \/) no longer trigger a warning.
# NOTE: this rebinds the module-level name, so running the statement again
# appends the same suffix a second time.
punctuation = punctuation + ''.join((
    '；',
    "：《》「 」“”[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——！，。？、~@#￥%……&*（）]+",
    '编辑',
    '%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—', '."', '.,',
))
def tokenize(file):
    """Tokenize each entry of ``file`` in place; every entry becomes a list of tokens.

    Each entry is lower-cased, split by ``WordPunctTokenizer``, stripped of
    punctuation tokens and finally passed through ``MWEtokenize``.

    ``punctuation`` is a single string, so ``word not in punctuation`` is a
    substring test: a token is dropped when it occurs anywhere inside that
    string. Returns nothing.
    """
    tokenizerOne = WordPunctTokenizer()  # one tokenizer reused for all entries
    for position, entry in enumerate(file):
        tokens = tokenizerOne.tokenize(str(entry.lower()))
        kept = ' '.join([word for word in tokens if word not in punctuation])
        file[position] = MWEtokenize(kept)
        




In [764]:
# extract headlines (the dictionary keys)
# zh_* holds the translated (b -> a) page, taken from translation_b_to_a.
en_headline = list(source_a.keys())
zh_headline = list(translation_b_to_a .keys())

# extract content: one string per headline, built by concatenating the
# pieces stored under that headline
# NOTE(review): ''.join inserts no separator, so words at piece boundaries
# can fuse -- confirm the pieces carry their own whitespace.
en_content = []
zh_content = []
for el in en_headline:
    en_content.append(''.join(source_a[el]))
for el in zh_headline:
    zh_content.append(''.join(translation_b_to_a[el]))

In [765]:
# preprocessing for the English page and the translated page:
# tokenize() rewrites each list in place, turning every entry into a list of tokens.
tokenize(en_content)
tokenize(zh_content)
tokenize(en_headline)
tokenize(zh_headline)

In [766]:
en_headline

[['amsterdam'],
 ['etymology', 'edit'],
 ['history', 'edit'],
 ['geography', 'edit'],
 ['demographics', 'edit'],
 ['cityscape', 'and', 'architecture', 'edit'],
 ['economy', 'edit'],
 ['culture', 'edit'],
 ['politics', 'edit'],
 ['transport', 'edit'],
 ['education', 'edit'],
 ['media', 'edit'],
 ['housing', 'edit']]

In [767]:
# Build the Word2Vec model for the document corpus.
# The model is trained on the tokenized English sections only (en_content)
# with 300 dimensions; getDocVectors reads this module-level `model` and
# assumes the same dimensionality.

model = Word2Vec(en_content, size=300)

In [768]:
en_content

[['amsterdam',
  'ˈæmstərdæm',
  '/,',
  'uk',
  'also',
  'ˌæmstərˈdæm',
  '/;[',
  '10',
  '11',
  'dutch',
  'ɑmstərˈdɑm',
  'listen',
  '))',
  'is',
  'the',
  'capital',
  'city',
  'and',
  'most',
  'populous',
  'municipality',
  'of',
  'the',
  'netherlands',
  'its',
  'status',
  'as',
  'the',
  'capital',
  'is',
  'mandated',
  'by',
  'the',
  'constitution',
  'of',
  'the',
  'netherlands',
  '12',
  'although',
  'it',
  'is',
  'not',
  'the',
  'seat',
  'of',
  'the',
  'government',
  'which',
  'is',
  'the',
  'hague',
  '13',
  'amsterdam',
  'has',
  'a',
  'population',
  'of',
  '854',
  '047',
  'within',
  'the',
  'city',
  'proper',
  '1',
  '357',
  '675',
  'in',
  'the',
  'urban',
  'area',
  '5',
  'and',
  '2',
  '410',
  '960',
  'in',
  'the',
  'metropolitan',
  'area',
  '9',
  'the',
  'city',
  'is',
  'located',
  'in',
  'the',
  'province',
  'of',
  'north',
  'holland',
  'in',
  'the',
  'west',
  'of',
  'the',
  'country',
  'but',


In [769]:
def getDocVectors(content):
    """Return one averaged 300-dimensional word vector per document.

    Parameters
    ----------
    content : list
        Tokenized documents; each document is iterated token by token.

    Returns
    -------
    list of list
        For each document, the component-wise mean of the vectors of all
        tokens known to the module-level Word2Vec ``model``. A document with
        no known token yields a vector of zeros.
    """
    # Direct membership on the vocabulary mapping; the previous version
    # rebuilt list(model.wv.vocab) for every single token.
    vocab = model.wv.vocab

    doc_vectors = []
    for tokens in content:
        vectorSum = [0]*300
        known = 0
        for token in tokens:
            if token in vocab:
                # add the word vectors together dimension by dimension
                vectorSum = list(map(add, list(model.wv[token]), vectorSum))
                known += 1

        # Average over the known tokens. Zero components are left untouched,
        # which also avoids dividing by zero when no token was known.
        doc_vectors.append(
            [float(total)/known if total != 0 else total for total in vectorSum]
        )

    return doc_vectors
    

In [770]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSumWVSM(vector):
    """Return the sum of the squared components of a 1-D ``vector`` (0 when empty)."""
    return sum(component * component for component in vector)

In [771]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def getCosineSimilarityWVSM(query_vector, doc_vector):
    """Return the cosine similarity of two equal-length 1-D vectors.

    When the product of the two vector lengths is not positive, the plain
    dot product is returned instead of dividing.
    """
    query_length = math.sqrt(getSquareSumWVSM(query_vector))
    doc_length = math.sqrt(getSquareSumWVSM(doc_vector))
    if not query_length * doc_length > 0:
        return np.dot(query_vector, doc_vector)
    return np.dot(query_vector, doc_vector) / (query_length * doc_length)


In [772]:
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# we link the headlines of the wikipedia item in different languages through translation text and source text
# further more, link of headlines is predicted through similarity between the text vectors below the headlines
# la1: language code used to label the query aspects (the rows of the first argument)
# la2: language code used to label the document aspects (the rows of the second argument)
# termname: term name of the text

# return: RUN file for Trec_eval

def getWordVectorSpaceModelRun(trs_content, src_content, la1, la2, termname):
    """Rank the entries of ``src_content`` for every entry of ``trs_content``.

    Both arguments are lists of tokenized documents. Each one is turned into
    averaged word vectors with getDocVectors; the first argument provides the
    queries and the second the documents. For each query, every document with
    a cosine similarity above zero is kept, ordered by descending score.

    Each output row is
    ``[<la1>_<termname>_h<query index>, 0, <la2>_<termname>_h<doc index>,
    rank, score, 'word2vector']``.

    Side effects: the rows are appended (``mode='a'``) to a space-separated
    file in the working directory whose suffix depends on whether the first
    query and first document have more or fewer than 10 tokens, and the
    DataFrame is printed. Returns nothing.
    """
    DV_src_content = getDocVectors(src_content)
    DV_trs_content = getDocVectors(trs_content)

    run_rows = []
    for query_idx, query_vector in enumerate(DV_trs_content):
        scores = [
            getCosineSimilarityWVSM(query_vector, doc_vector)
            for doc_vector in DV_src_content
        ]

        # Keep positive scores only; sort by score, then document index, descending.
        ranked = sorted(
            ((score, doc_idx) for doc_idx, score in enumerate(scores) if score > 0),
            reverse=True,
        )

        query_id = la1+'_'+termname+"_h"+str(query_idx)
        for rank, (score, doc_idx) in enumerate(ranked):
            doc_id = la2+'_'+termname+'_h'+str(doc_idx)
            run_rows.append([query_id, 0, doc_id, rank, score, 'word2vector'])

    df = pd.DataFrame(run_rows)

    # Entries with more than 10 tokens count as text, fewer than 10 as headline;
    # every remaining combination falls through to the last suffix.
    prefix = 'WVSM_'+termname+'_'+la1+'_'+la2
    if len(trs_content[0]) > 10 and len(src_content[0]) > 10:
        target = prefix+'_text.txt'
    elif len(trs_content[0]) < 10 and len(src_content[0]) < 10:
        target = prefix+'_headline.txt'
    elif len(trs_content[0]) < 10 and len(src_content[0]) > 10:
        target = prefix+'_headline_text.txt'
    else:
        target = prefix+'_text_headline.txt'
    df.to_csv(target, header=None, index=None, sep=' ', mode='a')

    print(df)

In [773]:
# English-Chinese aspect linking, headline vs. headline (WVSM).
# Uses zh_headline: `de_headline` is never defined in this notebook.

getWordVectorSpaceModelRun(en_headline, zh_headline, 'en', 'zh', entity_name)

                   0  1                2  3         4            5
0    en_Amsterdam_h0  0  de_Amsterdam_h0  0  1.000000  word2vector
1    en_Amsterdam_h0  0  de_Amsterdam_h7  1  0.999977  word2vector
2    en_Amsterdam_h0  0  de_Amsterdam_h1  2  0.999969  word2vector
3    en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.999337  word2vector
4    en_Amsterdam_h5  0  de_Amsterdam_h7  0  1.000000  word2vector
5    en_Amsterdam_h5  0  de_Amsterdam_h1  1  0.999979  word2vector
6    en_Amsterdam_h5  0  de_Amsterdam_h0  2  0.999977  word2vector
7    en_Amsterdam_h5  0  de_Amsterdam_h6  3  0.999324  word2vector
8    en_Amsterdam_h7  0  de_Amsterdam_h6  0  1.000000  word2vector
9    en_Amsterdam_h7  0  de_Amsterdam_h0  1  0.999337  word2vector
10   en_Amsterdam_h7  0  de_Amsterdam_h7  2  0.999324  word2vector
11   en_Amsterdam_h7  0  de_Amsterdam_h1  3  0.999312  word2vector
12   en_Amsterdam_h9  0  de_Amsterdam_h0  0  0.999373  word2vector
13   en_Amsterdam_h9  0  de_Amsterdam_h7  1  0.999362  word2ve

In [774]:
# English-Chinese aspect linking, content vs. content (WVSM).
# Uses zh_content: `de_content` is never defined in this notebook.

getWordVectorSpaceModelRun(en_content, zh_content, 'en', 'zh', entity_name)

                    0  1                2  3         4            5
0     en_Amsterdam_h0  0  de_Amsterdam_h7  0  1.000000  word2vector
1     en_Amsterdam_h0  0  de_Amsterdam_h4  1  1.000000  word2vector
2     en_Amsterdam_h0  0  de_Amsterdam_h3  2  1.000000  word2vector
3     en_Amsterdam_h0  0  de_Amsterdam_h6  3  1.000000  word2vector
4     en_Amsterdam_h0  0  de_Amsterdam_h0  4  1.000000  word2vector
5     en_Amsterdam_h0  0  de_Amsterdam_h2  5  1.000000  word2vector
6     en_Amsterdam_h0  0  de_Amsterdam_h5  6  1.000000  word2vector
7     en_Amsterdam_h0  0  de_Amsterdam_h1  7  0.999998  word2vector
8     en_Amsterdam_h0  0  de_Amsterdam_h8  8  0.999998  word2vector
9     en_Amsterdam_h1  0  de_Amsterdam_h3  0  1.000000  word2vector
10    en_Amsterdam_h1  0  de_Amsterdam_h4  1  1.000000  word2vector
11    en_Amsterdam_h1  0  de_Amsterdam_h7  2  1.000000  word2vector
12    en_Amsterdam_h1  0  de_Amsterdam_h2  3  1.000000  word2vector
13    en_Amsterdam_h1  0  de_Amsterdam_h5  4  1.

In [775]:
# English-Chinese aspect linking, content vs. headline (WVSM).
# Uses zh_headline: `de_headline` is never defined in this notebook.

getWordVectorSpaceModelRun(en_content, zh_headline, 'en', 'zh', entity_name)

                   0  1                2  3         4            5
0    en_Amsterdam_h0  0  de_Amsterdam_h7  0  0.999991  word2vector
1    en_Amsterdam_h0  0  de_Amsterdam_h0  1  0.999989  word2vector
2    en_Amsterdam_h0  0  de_Amsterdam_h1  2  0.999978  word2vector
3    en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.999328  word2vector
4    en_Amsterdam_h1  0  de_Amsterdam_h7  0  0.999991  word2vector
5    en_Amsterdam_h1  0  de_Amsterdam_h0  1  0.999987  word2vector
6    en_Amsterdam_h1  0  de_Amsterdam_h1  2  0.999977  word2vector
7    en_Amsterdam_h1  0  de_Amsterdam_h6  3  0.999326  word2vector
8    en_Amsterdam_h2  0  de_Amsterdam_h7  0  0.999991  word2vector
9    en_Amsterdam_h2  0  de_Amsterdam_h0  1  0.999988  word2vector
10   en_Amsterdam_h2  0  de_Amsterdam_h1  2  0.999978  word2vector
11   en_Amsterdam_h2  0  de_Amsterdam_h6  3  0.999328  word2vector
12   en_Amsterdam_h3  0  de_Amsterdam_h7  0  0.999990  word2vector
13   en_Amsterdam_h3  0  de_Amsterdam_h0  1  0.999989  word2ve

In [776]:
# English-Chinese aspect linking, headline vs. content (WVSM).
# Uses zh_content: `de_content` is never defined in this notebook.

getWordVectorSpaceModelRun(en_headline, zh_content, 'en', 'zh', entity_name)

                   0  1                2  3         4            5
0    en_Amsterdam_h0  0  de_Amsterdam_h1  0  0.999994  word2vector
1    en_Amsterdam_h0  0  de_Amsterdam_h8  1  0.999991  word2vector
2    en_Amsterdam_h0  0  de_Amsterdam_h0  2  0.999989  word2vector
3    en_Amsterdam_h0  0  de_Amsterdam_h7  3  0.999989  word2vector
4    en_Amsterdam_h0  0  de_Amsterdam_h6  4  0.999989  word2vector
5    en_Amsterdam_h0  0  de_Amsterdam_h5  5  0.999988  word2vector
6    en_Amsterdam_h0  0  de_Amsterdam_h4  6  0.999988  word2vector
7    en_Amsterdam_h0  0  de_Amsterdam_h3  7  0.999988  word2vector
8    en_Amsterdam_h0  0  de_Amsterdam_h2  8  0.999988  word2vector
9    en_Amsterdam_h5  0  de_Amsterdam_h3  0  0.999991  word2vector
10   en_Amsterdam_h5  0  de_Amsterdam_h7  1  0.999991  word2vector
11   en_Amsterdam_h5  0  de_Amsterdam_h5  2  0.999991  word2vector
12   en_Amsterdam_h5  0  de_Amsterdam_h6  3  0.999990  word2vector
13   en_Amsterdam_h5  0  de_Amsterdam_h4  4  0.999990  word2ve

In [450]:
def getDocVectorsWVSMFinal(content, language_model, dim=None):
    """Build one averaged word-embedding vector per document.

    For every document the vectors of all tokens found in the model's
    vocabulary are summed and divided by the number of such tokens.
    Tokens missing from the vocabulary are ignored; a document without any
    known token yields an all-zero vector.

    Parameters
    ----------
    content : sequence of token sequences
        The preprocessed documents, one iterable of tokens per document.
    language_model : object
        Word-vector model. It must expose an iterable ``vocab`` and return
        the vector of a token via ``language_model[token]``.
        (Assumes a gensim-style KeyedVectors object -- TODO confirm.)
    dim : int, optional
        Length of the produced vectors. Defaults to
        ``language_model.vector_size`` when the model has that attribute,
        otherwise to 300 (the value that used to be hard-coded).

    Returns
    -------
    list of list of float
        One vector (plain Python floats) per document, in input order.
    """
    if dim is None:
        dim = getattr(language_model, 'vector_size', 300)

    # A set gives constant-time membership tests; scanning a list of the whole
    # vocabulary for every token was the dominant cost of this function.
    known_words = set(language_model.vocab)

    doc_vectors = []
    for doc in content:
        vector_sum = np.zeros(dim, dtype=float)
        hits = 0
        for token in doc:
            if token in known_words:
                # Accumulate in float64 regardless of the model's dtype.
                vector_sum += np.asarray(language_model[token], dtype=float)
                hits += 1
        if hits:
            vector_sum /= hits  # average of the vectors that were found
        doc_vectors.append(vector_sum.tolist())

    return doc_vectors



In [451]:
def getSquareSumWE(vector):
    """Return the sum of the squared components of ``vector``.

    ``vector`` must support ``len()`` and integer indexing (for example a
    list of numbers). An empty vector yields 0.
    """
    length = len(vector)
    total = 0
    position = 0
    while position < length:
        component = vector[position]
        total += component * component
        position += 1
    return total

In [452]:
def getCosineSimilarityWE(query_vector, doc_vector):
    """Return the cosine similarity of two numeric vectors.

    The dot product is divided by the product of the two Euclidean norms.
    When that product is not greater than zero (e.g. one vector is all
    zeros) the raw dot product is returned instead, so no division by zero
    can occur.
    """
    query_square_sum = getSquareSumWE(query_vector)
    doc_square_sum = getSquareSumWE(doc_vector)
    norm_product = math.sqrt(query_square_sum) * math.sqrt(doc_square_sum)

    dot_product = np.dot(query_vector, doc_vector)
    if not norm_product > 0:
        return dot_product
    return dot_product / norm_product
        
 

In [453]:
def _firstDocIsText(docs, headline_max_tokens=10):
    """Return True when the first document of ``docs`` has more than ``headline_max_tokens`` tokens.

    An empty collection counts as "not text", so callers never index into it.
    """
    return len(docs) > 0 and len(docs[0]) > headline_max_tokens


def getWordVectorSpace2Run(en_content, de_content, DV_en, DV_de, la1, la2, termname):
    """Rank the ``DV_de`` vectors against every ``DV_en`` vector and append a run file.

    For each vector in ``DV_en`` (query, index ``j``) the cosine similarity
    to each vector in ``DV_de`` (document, index ``i``) is computed with
    ``getCosineSimilarityWE``. Only similarities greater than 0 are kept and
    sorted in descending order (ties: higher document index first). Every
    kept pair becomes one row::

        <la1>_<termname>_h<j> 0 <la2>_<termname>_h<i> <rank> <similarity> wordembedding

    The rows are appended, space separated and without header or index, to
    ``WVSMFinal_<termname>_<la1>_<la2>_<suffix>.txt`` in the current working
    directory, and the resulting DataFrame is printed.

    Parameters
    ----------
    en_content, de_content : sequence of token sequences
        The documents behind ``DV_en`` / ``DV_de``. Only the token count of
        the first document is used, to choose the file suffix: more than 10
        tokens means "text", otherwise "headline".
    DV_en, DV_de : sequence of vectors
        Document vectors of the query side and of the ranked side.
    la1, la2 : str
        Language codes used in the query IDs, document IDs and file name.
    termname : str
        Entry name used in the IDs and in the file name.

    Returns
    -------
    None

    Notes
    -----
    * A first document of exactly 10 tokens is treated as a headline.
      Previously it matched neither comparison and always ended up in the
      ``_text_headline`` file.
    * Empty ``en_content`` / ``de_content`` no longer raise ``IndexError``;
      they are treated as headlines.
    * NOTE(review): the file is opened in append mode, so running the same
      call twice writes the same rows twice -- confirm this is intended.
    """
    rows = []
    for j, query_vector in enumerate(DV_en):
        # (similarity, document index) pairs with a strictly positive score.
        scored = []
        for i, doc_vector in enumerate(DV_de):
            similarity = getCosineSimilarityWE(query_vector, doc_vector)
            if similarity > 0:
                scored.append((similarity, i))

        scored.sort(reverse=True)

        for rank, (similarity, doc_index) in enumerate(scored):
            rows.append([la1 + '_' + termname + "_h" + str(j), 0,
                         la2 + '_' + termname + '_h' + str(doc_index),
                         rank, similarity, 'wordembedding'])

    df = pd.DataFrame(rows)

    # The suffix names the kind of `de_content` first and of `en_content`
    # second, exactly as the original branches did.
    de_is_text = _firstDocIsText(de_content)
    en_is_text = _firstDocIsText(en_content)
    if de_is_text and en_is_text:
        suffix = '_text'
    elif not de_is_text and not en_is_text:
        suffix = '_headline'
    elif en_is_text:
        suffix = '_headline_text'
    else:
        suffix = '_text_headline'

    df.to_csv('WVSMFinal_' + termname + '_' + la1 + '_' + la2 + suffix + '.txt',
              header=False, index=False, sep=' ', mode='a')

    print(df)

In [778]:
# Build one averaged word-vector per document for each collection.
DV_en_headline = getDocVectorsWVSMFinal(en_headline, en_model)
DV_en_content = getDocVectorsWVSMFinal(en_content, en_model)

# NOTE(review): the zh documents are embedded with `en_model` as well --
# presumably they were translated into English beforehand; confirm.
DV_zh_headline = getDocVectorsWVSMFinal(zh_headline, en_model)
DV_zh_content = getDocVectorsWVSMFinal(zh_content, en_model)

In [779]:
# Rank the zh headline vectors against every en headline vector and append the run file.
# NOTE(review): the saved output of this cell shows `de_...` document IDs although
# 'zh' is passed as la2, so it appears to come from an earlier run -- re-run to refresh.
getWordVectorSpace2Run(en_headline, zh_headline, DV_en_headline, DV_zh_headline, 'en','zh', entity_name)

                    0  1                2  3         4              5
0     en_Amsterdam_h0  0  de_Amsterdam_h0  0  1.000000  wordembedding
1     en_Amsterdam_h0  0  de_Amsterdam_h8  1  0.328429  wordembedding
2     en_Amsterdam_h0  0  de_Amsterdam_h1  2  0.267095  wordembedding
3     en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.263959  wordembedding
4     en_Amsterdam_h0  0  de_Amsterdam_h7  4  0.261862  wordembedding
5     en_Amsterdam_h0  0  de_Amsterdam_h5  5  0.261492  wordembedding
6     en_Amsterdam_h0  0  de_Amsterdam_h2  6  0.242564  wordembedding
7     en_Amsterdam_h0  0  de_Amsterdam_h4  7  0.238223  wordembedding
8     en_Amsterdam_h0  0  de_Amsterdam_h3  8  0.224148  wordembedding
9     en_Amsterdam_h1  0  de_Amsterdam_h2  0  0.812261  wordembedding
10    en_Amsterdam_h1  0  de_Amsterdam_h3  1  0.805303  wordembedding
11    en_Amsterdam_h1  0  de_Amsterdam_h6  2  0.800517  wordembedding
12    en_Amsterdam_h1  0  de_Amsterdam_h4  3  0.786783  wordembedding
13    en_Amsterdam_h

In [780]:
# Rank the zh content vectors against every en headline vector and append the run file.
# la2 must be 'zh' like in the sibling runs: it is written into every document ID
# and into the output file name.
getWordVectorSpace2Run(en_headline, zh_content, DV_en_headline, DV_zh_content, 'en','zh', entity_name)

                    0  1                2  3         4              5
0     en_Amsterdam_h0  0  de_Amsterdam_h1  0  0.600057  wordembedding
1     en_Amsterdam_h0  0  de_Amsterdam_h8  1  0.536614  wordembedding
2     en_Amsterdam_h0  0  de_Amsterdam_h0  2  0.457167  wordembedding
3     en_Amsterdam_h0  0  de_Amsterdam_h7  3  0.402822  wordembedding
4     en_Amsterdam_h0  0  de_Amsterdam_h6  4  0.386379  wordembedding
5     en_Amsterdam_h0  0  de_Amsterdam_h3  5  0.326772  wordembedding
6     en_Amsterdam_h0  0  de_Amsterdam_h4  6  0.310785  wordembedding
7     en_Amsterdam_h0  0  de_Amsterdam_h2  7  0.302455  wordembedding
8     en_Amsterdam_h0  0  de_Amsterdam_h5  8  0.299950  wordembedding
9     en_Amsterdam_h1  0  de_Amsterdam_h4  0  0.498023  wordembedding
10    en_Amsterdam_h1  0  de_Amsterdam_h6  1  0.494421  wordembedding
11    en_Amsterdam_h1  0  de_Amsterdam_h3  2  0.492393  wordembedding
12    en_Amsterdam_h1  0  de_Amsterdam_h0  3  0.487803  wordembedding
13    en_Amsterdam_h

In [781]:
# Rank the zh content vectors against every en content vector and append the run file.
getWordVectorSpace2Run(en_content, zh_content, DV_en_content, DV_zh_content, 'en','zh', entity_name)

                    0  1                2  3         4              5
0     en_Amsterdam_h0  0  de_Amsterdam_h7  0  0.989807  wordembedding
1     en_Amsterdam_h0  0  de_Amsterdam_h0  1  0.984952  wordembedding
2     en_Amsterdam_h0  0  de_Amsterdam_h3  2  0.984568  wordembedding
3     en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.983110  wordembedding
4     en_Amsterdam_h0  0  de_Amsterdam_h4  4  0.981431  wordembedding
5     en_Amsterdam_h0  0  de_Amsterdam_h2  5  0.969671  wordembedding
6     en_Amsterdam_h0  0  de_Amsterdam_h5  6  0.969355  wordembedding
7     en_Amsterdam_h0  0  de_Amsterdam_h1  7  0.940698  wordembedding
8     en_Amsterdam_h0  0  de_Amsterdam_h8  8  0.766341  wordembedding
9     en_Amsterdam_h1  0  de_Amsterdam_h3  0  0.975771  wordembedding
10    en_Amsterdam_h1  0  de_Amsterdam_h4  1  0.969266  wordembedding
11    en_Amsterdam_h1  0  de_Amsterdam_h5  2  0.963544  wordembedding
12    en_Amsterdam_h1  0  de_Amsterdam_h7  3  0.961434  wordembedding
13    en_Amsterdam_h

In [782]:
# Rank the zh headline vectors against every en content vector and append the run file.
getWordVectorSpace2Run(en_content, zh_headline, DV_en_content, DV_zh_headline, 'en','zh', entity_name)

                    0  1                2  3         4              5
0     en_Amsterdam_h0  0  de_Amsterdam_h1  0  0.761034  wordembedding
1     en_Amsterdam_h0  0  de_Amsterdam_h7  1  0.702296  wordembedding
2     en_Amsterdam_h0  0  de_Amsterdam_h8  2  0.549765  wordembedding
3     en_Amsterdam_h0  0  de_Amsterdam_h6  3  0.526005  wordembedding
4     en_Amsterdam_h0  0  de_Amsterdam_h5  4  0.519510  wordembedding
5     en_Amsterdam_h0  0  de_Amsterdam_h2  5  0.516462  wordembedding
6     en_Amsterdam_h0  0  de_Amsterdam_h3  6  0.503821  wordembedding
7     en_Amsterdam_h0  0  de_Amsterdam_h4  7  0.497746  wordembedding
8     en_Amsterdam_h0  0  de_Amsterdam_h0  8  0.386591  wordembedding
9     en_Amsterdam_h1  0  de_Amsterdam_h1  0  0.743821  wordembedding
10    en_Amsterdam_h1  0  de_Amsterdam_h7  1  0.670341  wordembedding
11    en_Amsterdam_h1  0  de_Amsterdam_h8  2  0.529771  wordembedding
12    en_Amsterdam_h1  0  de_Amsterdam_h2  3  0.496139  wordembedding
13    en_Amsterdam_h