In [1]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")+['edit']
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from googletrans import Translator
from collections import defaultdict

In [3]:
import json
import math

# base knowledge:
1. for each wikipedia term, there is a term name
2. for each wikipedia term, its English page was crawled as source text in language a -"en": source_a
3. for each wikipedia term, its German page was crawled as source text in language b -"de": source_b
4. for each source page in language b -"de", it will be translated into language a, saved as translation_b_to_a
5. links between source_a and source_b are to be represented by the links between source-a and translation_b_to_a



# getTranslationRun Function

In [4]:
### translate the text in the format of dictionary from src language into dest language

##input:
# text: text in dictionary format {}
# la1: source language b of the text 
# la2: target language a the text to be translated into
# termname: term name of the text

def getTranslationRun(text, src_b, dest_a, termname):
    text = text
    dest = dest_a
    translator = Translator()
    dict_trans = defaultdict(list)
    headline = list(text.keys())
    for el in headline:
        h = translator.translate(text = el, dest = dest)
        t = translator.translate(text = text[el], dest = dest)
        for m in t:
            dict_trans[h.text].append(m.text)
            #print(m.text)

    print(dict_trans)
    return dict_trans

In [5]:
import requests


# data preprocessing Function

In [255]:
# data preprocessing function

def toLowerList(List):
    for i in range(len(List)):
        List[i] = List[i].lower()
    return List

def tokenizeDVSM(file):
    i = 0
    tokenizer = WordPunctTokenizer()
    for el in file:
        file[i] = ' '.join(tokenizer.tokenize(str(el)))
        i += 1       

def removeStopwords(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.split() if word not in esw])
        file[i] = el
        i += 1

punctuation = punctuation + str('%.[')+str('(/')+str(');[')+str('"),')+str(').')+str('.[')+str(',[')+str('][')+str('("')+str('."[')+str('—'+str('."'))+str('.,')
def removePunctuation(file):
    
    i = 0
    for el in file:
        el = ' '.join([word for word in el.split() if word not in punctuation])
        file[i] = el
        i += 1

def stemming(file):
    ps = PorterStemmer()
    i = 0
    for el in file:
        el = ' '.join([ps.stem(word) for word in el.lower().split() ])
        file[i] = el
        i += 1

def removeNumbers(file):
    i = 0
    for el in file:
        file[i] = ' '.join([word for word in el.split() if not word.isdigit()])
        i+=1

        
def lemmatize_n(file):
    lemmatizer = WordNetLemmatizer()
    i = 0
    for el in file:
        el = ' '.join([lemmatizer.lemmatize(word, pos = 'n') for word in el.lower().split() ])
        file[i] = el
        i += 1


def lemmatize_v(file):
    lemmatizer = WordNetLemmatizer()
    i = 0
    for el in file:
        el = ' '.join([lemmatizer.lemmatize(word, pos = 'v') for word in el.lower().split() ])
        file[i] = el
        i += 1
        

def lemmatize_a(file):
    lemmatizer = WordNetLemmatizer()
    i = 0
    for el in file:
        el = ' '.join([lemmatizer.lemmatize(word, pos = 'a') for word in el.lower().split() ])
        file[i] = el
        i += 1
        
def lemmatize_r(file):
    lemmatizer = WordNetLemmatizer()
    i = 0
    for el in file:
        el = ' '.join([lemmatizer.lemmatize(word, pos = 'r') for word in el.lower().split() ])
        file[i] = el
        i += 1

In [256]:
#preprocessing: tokenize, removing stopwords, punctuation, and stemming

def preprocessing(content):
    tokenizeDVSM(content)
    removePunctuation(content)
    removeStopwords(content)
    removeNumbers(content)
    ### lemmatization, NOUNs
    lemmatize_n(content)
    ### lemmatization, Verbs
    lemmatize_v(content)
    ### lemmatization, Adjactives
    lemmatize_a(content)
    ### lemmatization
    lemmatize_r(content)

# DTM, Query-vector building

In [175]:
##### build functions to generate document-term matrix

## get DTM, weighted by tfidf, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text

def get_DTM_tfidf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    #print(vocabulary)

    tfidf_transformer = TfidfTransformer()
    
    X_train_tfidf = CountVectorizer(vocabulary = vocabulary)
    X_train_tfidf = X_train_tfidf.fit_transform(file)
    
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_tfidf)
    return X_train_tfidf


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())

    X_train_tf = CountVectorizer(vocabulary = vocabulary)
    X_train_tf = X_train_tf.fit_transform(file)
    return X_train_tf


In [176]:
###Generate query vector for each query

# get_QueryVector_tfidf helps to get the tiidf weighted query vector, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text, train_query_file.text


def get_QueryVector_tfidf(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())/np.linalg.norm(list(dict(frequency).values()))
        
    else:
        tfidf_transformer = TfidfTransformer()
        
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
        query_vect = tfidf_transformer.fit_transform(query_vect)
    return query_vect



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())
        
    else:
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
    return query_vect

# cosine similarity Function

In [177]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    squaresum = 0
    for i in range(vector.shape[1]):
        squaresum += vector[0,i]* vector[0,i]
        
    return squaresum 

In [178]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def getCosineSimilarity(query_vector, doc_vector):
    squaresum_query = getSquareSum(query_vector)
    squaresum_doc= getSquareSum(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector.transpose())[0,0]/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector.transpose())[0,0]
 

# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    squaresum = 0
    for i in range(vector.shape[1]):
        squaresum += vector[i]* vector[i]
        
    return squaresum

# input: two 1* n sparse matrixs, or single vector from query_vect/ train_tfidf matrixs
# return the cosine sim of two vectors, type: float

def getCosineSimilarity(query_vector, doc_vector):
    squaresum_query = getSquareSum(query_vector)
    squaresum_doc= getSquareSum(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector)/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector)

# getVectorSpaceModelRun Function

link headlines in the source text with headline in the translation text through cosinesimilarity

In [179]:
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# we link the headlines of the wikipedia item in different languages through translation text and source text
# further more, link of headlines is predicted through similarity between the text vectors below the headlines
# b: source language b of the text 
# a: target language a the text to be translated into
# termname: term name of the text

# return: RUN file for Trec_eval

def getDocumentVectorSpaceModelRun(trs_content, src_content, la1, la2, termname):    
    ### get the similarity of query with each doc leader
    queryVector = get_QueryVector_tfidf(trs_content, src_content)
    src_DTM = get_DTM_tfidf(src_content)
    
    result = []
    for j in range(queryVector.shape[0]):
        
        sims = []
        results_ID = []
        results = []
        for i in range(src_DTM.shape[0]):
            s = getCosineSimilarity(queryVector[j], src_DTM[i,])
            #if type(s)is not None:
            sims.append(s)

        
        for x in range(len(sims)):
            if sims[x] >0:
                results_ID.append(x)
                results.append(sims[x])
        
        #print(results, results_ID)
        
        if len(results) >0 and len(results_ID)>0:
            results, results_ID= zip(*sorted(zip(results, results_ID), reverse=True))      
       
        
        for m in range(len(results)):
            result.append([la1+'_'+termname+"_h"+str(j), 0, la2+'_'+termname+'_h'+str(results_ID[m]), m, results[m], 'cosinesims'])

    df = pd.DataFrame(result)
    if len(trs_content[0].split()) > 10 and len(src_content[0].split())>10 :
        df.to_csv('DWSM_'+termname+'_'+la1+'_'+la2+'_text.txt', header=None, index=None, sep=' ', mode='a')
    elif len(trs_content[0].split()) < 10 and len(src_content[0].split())<10:
        df.to_csv('DWSM_'+termname+'_'+la1+'_'+la2+'_headline.txt', header=None, index=None, sep=' ', mode='a')
    elif len(trs_content[0].split()) < 10 and len(src_content[0].split())>10:
        df.to_csv('DWSM_'+termname+'_'+la1+'_'+la2+'_headline_text.txt', header=None, index=None, sep=' ', mode='a')
    else:
        df.to_csv('DWSM_'+termname+'_'+la1+'_'+la2+'_text_headline.txt', header=None, index=None, sep=' ', mode='a')     
    
    print(df)

# Make Functions Run!
1. data loading as source_a and source_b
2. translating source_b into transaltion_b_to_a
3. data preprocessing 
4. make baseline run(headlines)
5. make cross link run(contexts)



In [32]:
from gensim.models import KeyedVectors

# importing word vector for WVSM
en_model = KeyedVectors.load_word2vec_format('/Users/hailianhou/Desktop/MasterThesis/Wiki Data/wiki-news-300d-1M.vec')

In [257]:
entity_name = 'France'

In [258]:
# loading source data
with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity_name+'.json') as json_data:
    source_de = json.load(json_data)

with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity_name+'.json') as json_data:
    source_en = json.load(json_data)

In [259]:
## input: 
#source_de_Barack_Obama: dict

translation_de_to_en = getTranslationRun(source_de, 'de', 'en', entity_name)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [223]:
#rename

source_a = source_en
source_b = source_de

translation_b_to_a = translation_de_to_en


In [224]:
# exact headlies
src_headline = list(source_a.keys())
trs_headline = list(translation_b_to_a .keys())

#extract context
src_content = []
trs_content = []
for el in src_headline:
    src_content.append(''.join(source_a[el]))
for el in trs_headline:
    trs_content.append(''.join(translation_b_to_a[el]))

In [225]:
preprocessing(src_content)
preprocessing(trs_content)

preprocessing(src_headline)
preprocessing(trs_headline)

In [226]:
# English-German aspects links through context--context 

getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'de', entity_name)

                0  1              2   3         4           5
0    en_Canada_h0  0   de_Canada_h0   0  0.347434  cosinesims
1    en_Canada_h0  0  de_Canada_h11   1  0.344143  cosinesims
2    en_Canada_h0  0   de_Canada_h7   2  0.306810  cosinesims
3    en_Canada_h0  0   de_Canada_h2   3  0.290119  cosinesims
4    en_Canada_h0  0   de_Canada_h6   4  0.278781  cosinesims
5    en_Canada_h0  0   de_Canada_h3   5  0.273033  cosinesims
6    en_Canada_h0  0   de_Canada_h5   6  0.254985  cosinesims
7    en_Canada_h0  0  de_Canada_h13   7  0.243692  cosinesims
8    en_Canada_h0  0  de_Canada_h14   8  0.238585  cosinesims
9    en_Canada_h0  0  de_Canada_h10   9  0.238253  cosinesims
10   en_Canada_h0  0   de_Canada_h1  10  0.229420  cosinesims
11   en_Canada_h0  0   de_Canada_h4  11  0.183316  cosinesims
12   en_Canada_h0  0   de_Canada_h9  12  0.141053  cosinesims
13   en_Canada_h0  0   de_Canada_h8  13  0.140524  cosinesims
14   en_Canada_h0  0  de_Canada_h12  14  0.133339  cosinesims
15   en_

In [227]:
# English-German aspects linking through headline--headline

getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'de', entity_name)

              0  1              2  3    4           5
0  en_Canada_h0  0   de_Canada_h0  0  1.0  cosinesims
1  en_Canada_h2  0   de_Canada_h2  0  1.0  cosinesims
2  en_Canada_h3  0   de_Canada_h3  0  1.0  cosinesims
3  en_Canada_h5  0  de_Canada_h11  0  1.0  cosinesims
4  en_Canada_h7  0  de_Canada_h14  0  1.0  cosinesims


In [228]:
# English-German aspects linking through headline--content

getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'de', entity_name)

               0  1              2  3         4           5
0   en_Canada_h0  0   de_Canada_h0  0  0.919017  cosinesims
1   en_Canada_h0  0  de_Canada_h11  1  0.265310  cosinesims
2   en_Canada_h0  0  de_Canada_h12  2  0.175333  cosinesims
3   en_Canada_h0  0   de_Canada_h9  3  0.151298  cosinesims
4   en_Canada_h0  0   de_Canada_h5  4  0.132655  cosinesims
5   en_Canada_h0  0  de_Canada_h14  5  0.117422  cosinesims
6   en_Canada_h1  0   de_Canada_h0  0  0.873470  cosinesims
7   en_Canada_h1  0   de_Canada_h1  1  0.434907  cosinesims
8   en_Canada_h2  0   de_Canada_h0  0  0.950787  cosinesims
9   en_Canada_h2  0   de_Canada_h8  1  0.177363  cosinesims
10  en_Canada_h2  0   de_Canada_h5  2  0.134191  cosinesims
11  en_Canada_h2  0   de_Canada_h6  3  0.131944  cosinesims
12  en_Canada_h2  0  de_Canada_h14  4  0.089087  cosinesims
13  en_Canada_h2  0  de_Canada_h11  5  0.067095  cosinesims
14  en_Canada_h2  0   de_Canada_h2  6  0.052908  cosinesims
15  en_Canada_h2  0   de_Canada_h7  7  0

In [229]:
# English-German aspects linking through content--headline

getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'de', entity_name)

               0  1              2   3         4           5
0   en_Canada_h0  0   de_Canada_h1   0  0.398943  cosinesims
1   en_Canada_h0  0  de_Canada_h10   1  0.254362  cosinesims
2   en_Canada_h0  0  de_Canada_h13   2  0.211400  cosinesims
3   en_Canada_h0  0  de_Canada_h11   3  0.204494  cosinesims
4   en_Canada_h0  0   de_Canada_h7   4  0.178859  cosinesims
5   en_Canada_h0  0   de_Canada_h6   5  0.172236  cosinesims
6   en_Canada_h0  0  de_Canada_h14   6  0.163733  cosinesims
7   en_Canada_h0  0   de_Canada_h0   7  0.154022  cosinesims
8   en_Canada_h0  0   de_Canada_h3   8  0.141663  cosinesims
9   en_Canada_h0  0   de_Canada_h5   9  0.137266  cosinesims
10  en_Canada_h0  0  de_Canada_h12  10  0.114326  cosinesims
11  en_Canada_h0  0   de_Canada_h2  11  0.110888  cosinesims
12  en_Canada_h0  0   de_Canada_h8  12  0.108899  cosinesims
13  en_Canada_h0  0   de_Canada_h9  13  0.062386  cosinesims
14  en_Canada_h2  0  de_Canada_h14   0  0.038403  cosinesims
15  en_Canada_h3  0  de_

entity_list = ['Germany', 'Japan','France','Italy', 'United_Kingdom', 'Asia','Europe']


for entity in entity_list:
    # loading source data
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity+'.json') as json_data:
        source_de = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
        
    ## input: 
    #source_de_Barack_Obama: dict

    translation_de_to_en = getTranslationRun(source_de, 'de', 'en', el)
    
    #rename

    source_a = source_en
    source_b = source_de

    translation_b_to_a = translation_de_to_en
    
    # exact headlies
    src_headline = list(source_a.keys())
    trs_headline = list(translation_b_to_a .keys())

    #extract context
    src_content = []
    trs_content = []
    for el in src_headline:
        src_content.append(''.join(source_a[el]))
    for el in trs_headline:
        trs_content.append(''.join(translation_b_to_a[el]))
    
    #preprocessing
    preprocessing(src_content)
    preprocessing(trs_content)

    preprocessing(src_headline)
    preprocessing(trs_headline)
    
    # predict headline links through context--context
    getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'de', el)
    
    # predict headline links through headline--headline
    getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'de', el)
    
    # predict headline links through headline--content
    getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'de', el)
    
    # predict headline links through content--headline
    getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'de', el)

    

In [190]:
import json
import nltk
import math
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")+['edit']
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [191]:
from operator import add

In [230]:
def traditionalToSimplified(file):
    i = 0
    for el in file:
        file[i] = ''.join(HanziConv.toSimplified(el))
        i += 1
        
def chineseTokenize(file):
    i = 0
    for el in file:
        file[i] = ' '.join(jieba.cut(el, cut_all=False, HMM=True))
        i += 1

        
from nltk.tokenize import MWETokenizer
def MWEtokenize(el):
    i = 0
    tokenizer = MWETokenizer()
    
    el = tokenizer.tokenize(el.split())
    return el

punctuation = punctuation + str('；')+  str("：《》「 」“”[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")+str('编辑')+str('%.[')+str('(/')+str(');[')+str('"),')+str(').')+str('.[')+str(',[')+str('][')+str('("')+str('."[')+str('—'+str('."'))+str('.,')        
def tokenize(file):
    i = 0
    for el in file:
        tokenizerOne = WordPunctTokenizer()
        el = tokenizerOne.tokenize(str(el.lower()))
        el = ' '.join([word for word in el if word not in punctuation])
        file[i] = MWEtokenize(el)
        i += 1
        




In [231]:
# exact headlies
en_headline = list(source_a.keys())
de_headline = list(translation_b_to_a .keys())

#extract context
en_content = []
de_content = []
for el in en_headline:
    en_content.append(''.join(source_a[el]))
for el in de_headline:
    de_content.append(''.join(translation_b_to_a[el]))

In [232]:
# preprocessing for en and de
tokenize(en_content)
tokenize(de_content)
tokenize(en_headline)
tokenize(de_headline)

In [233]:
en_headline

[['canada'],
 ['etymology'],
 ['history'],
 ['geography', 'and', 'climate'],
 ['government', 'and', 'politics'],
 ['economy'],
 ['demographics'],
 ['culture']]

In [234]:
#Building the word2vect model for the documents corpus

model = Word2Vec(en_content, size=300)

In [235]:
en_content

[['coordinates',
  '60',
  '°',
  'n',
  '95',
  '°',
  'w',
  '\ufeff',
  '\ufeff',
  '60',
  '°',
  'n',
  '95',
  '°',
  'w',
  '\ufeff',
  '60',
  '95',
  'canada',
  'canadian',
  'french',
  'kanadɑ',
  '])',
  'is',
  'a',
  'country',
  'in',
  'the',
  'northern',
  'part',
  'of',
  'north',
  'america',
  'its',
  'ten',
  'provinces',
  'and',
  'three',
  'territories',
  'extend',
  'from',
  'the',
  'atlantic',
  'to',
  'the',
  'pacific',
  'and',
  'northward',
  'into',
  'the',
  'arctic',
  'ocean',
  'covering',
  '9',
  '98',
  'million',
  'square',
  'kilometres',
  '3',
  '85',
  'million',
  'square',
  'miles',
  'making',
  'it',
  'the',
  'world',
  'second',
  'largest',
  'country',
  'by',
  'total',
  'area',
  'canada',
  'southern',
  'border',
  'with',
  'the',
  'united',
  'states',
  'is',
  'the',
  'world',
  'longest',
  'bi',
  'national',
  'land',
  'border',
  'its',
  'capital',
  'is',
  'ottawa',
  'and',
  'its',
  'three',
  'large

In [236]:
def getDocVectors(content):
    
    doc_vectors = list()
    for i in range(len(content)):
        
        vectorSum = [0]*300
        l = 0
        for el in content[i]:
            if el in list(model.wv.vocab):
                # adding the word vectors togenther by their dimensions:
                vectorSum = list(map(add, list(model.wv[el]), vectorSum))
                l+=1
        
        for m in range(len(vectorSum)):
            if vectorSum[m] != 0:
                ###anormalize the vector sum:
                vectorSum[m] = float(vectorSum[m])/l 
            else:
                vectorSum[m] = vectorSum[m]
          
        doc_vectors.append(vectorSum)

    return doc_vectors 
    

In [237]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSumWVSM(vector):
    squaresum = 0
    for i in range(len(vector)):
        squaresum += vector[i]* vector[i]
        
    return squaresum 

In [238]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def getCosineSimilarityWVSM(query_vector, doc_vector):
    squaresum_query = getSquareSumWVSM(query_vector)
    squaresum_doc= getSquareSumWVSM(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector)/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector)


In [239]:
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# we link the headlines of the wikipedia item in different languages through translation text and source text
# further more, link of headlines is predicted through similarity between the text vectors below the headlines
# b: source language b of the text 
# a: target language a the text to be translated into
# termname: term name of the text

# return: RUN file for Trec_eval

def getWordVectorSpaceModelRun(trs_content, src_content, la1, la2, termname):    
    ### get the similarity of query with each doc leader
    DV_src_content = getDocVectors(src_content)
    DV_trs_content = getDocVectors(trs_content)
    
    
    result = []
    for j in range(len(DV_trs_content)):
        
        sims = []
        results_ID = []
        results = []
        for i in range(len(DV_src_content)):
            s = getCosineSimilarityWVSM(DV_trs_content[j], DV_src_content[i])
            #if type(s)is not None:
            sims.append(s)

        
        for x in range(len(sims)):
            if sims[x] >0:
                results_ID.append(x)
                results.append(sims[x])
        
        #print(results, results_ID)
        
        if len(results) >0 and len(results_ID)>0:
            results, results_ID= zip(*sorted(zip(results, results_ID), reverse=True))      
       
        
        for m in range(len(results)):
            result.append([la1+'_'+termname+"_h"+str(j), 0, la2+'_'+termname+'_h'+str(results_ID[m]), m, results[m], 'word2vector'])

    df = pd.DataFrame(result)
    if len(trs_content[0]) > 10 and len(src_content[0])>10 :
        df.to_csv('WVSM_'+termname+'_'+la1+'_'+la2+'_text.txt', header=None, index=None, sep=' ', mode='a')
    elif len(trs_content[0]) < 10 and len(src_content[0])<10:
        df.to_csv('WVSM_'+termname+'_'+la1+'_'+la2+'_headline.txt', header=None, index=None, sep=' ', mode='a')
    elif len(trs_content[0]) < 10 and len(src_content[0])>10:
        df.to_csv('WVSM_'+termname+'_'+la1+'_'+la2+'_headline_text.txt', header=None, index=None, sep=' ', mode='a')
    else:
        df.to_csv('WVSM_'+termname+'_'+la1+'_'+la2+'_text_headline.txt', header=None, index=None, sep=' ', mode='a')     
    
    print(df)

In [240]:
# Englis-German Aspects Linking Headline-Headline 

getWordVectorSpaceModelRun(en_headline, de_headline, 'en', 'de', entity_name)

               0  1              2  3         4            5
0   en_Canada_h0  0   de_Canada_h0  0  1.000000  word2vector
1   en_Canada_h0  0   de_Canada_h1  1  0.999981  word2vector
2   en_Canada_h0  0   de_Canada_h6  2  0.999973  word2vector
3   en_Canada_h0  0   de_Canada_h5  3  0.999948  word2vector
4   en_Canada_h0  0   de_Canada_h7  4  0.999894  word2vector
5   en_Canada_h0  0  de_Canada_h11  5  0.999819  word2vector
6   en_Canada_h0  0  de_Canada_h14  6  0.999818  word2vector
7   en_Canada_h0  0   de_Canada_h9  7  0.999792  word2vector
8   en_Canada_h0  0   de_Canada_h8  8  0.999776  word2vector
9   en_Canada_h3  0   de_Canada_h6  0  0.999989  word2vector
10  en_Canada_h3  0   de_Canada_h1  1  0.999982  word2vector
11  en_Canada_h3  0   de_Canada_h0  2  0.999978  word2vector
12  en_Canada_h3  0   de_Canada_h5  3  0.999950  word2vector
13  en_Canada_h3  0   de_Canada_h7  4  0.999895  word2vector
14  en_Canada_h3  0  de_Canada_h14  5  0.999828  word2vector
15  en_Canada_h3  0  de_

In [241]:
# Englis-German Aspects Linking Content-Content

getWordVectorSpaceModelRun(en_content, de_content, 'en', 'de', entity_name)

                0  1              2   3         4            5
0    en_Canada_h0  0  de_Canada_h11   0  1.000000  word2vector
1    en_Canada_h0  0   de_Canada_h5   1  1.000000  word2vector
2    en_Canada_h0  0  de_Canada_h14   2  1.000000  word2vector
3    en_Canada_h0  0   de_Canada_h7   3  1.000000  word2vector
4    en_Canada_h0  0   de_Canada_h6   4  1.000000  word2vector
5    en_Canada_h0  0  de_Canada_h13   5  1.000000  word2vector
6    en_Canada_h0  0   de_Canada_h9   6  1.000000  word2vector
7    en_Canada_h0  0  de_Canada_h10   7  1.000000  word2vector
8    en_Canada_h0  0   de_Canada_h3   8  1.000000  word2vector
9    en_Canada_h0  0   de_Canada_h2   9  1.000000  word2vector
10   en_Canada_h0  0   de_Canada_h0  10  1.000000  word2vector
11   en_Canada_h0  0  de_Canada_h12  11  1.000000  word2vector
12   en_Canada_h0  0   de_Canada_h8  12  1.000000  word2vector
13   en_Canada_h0  0   de_Canada_h1  13  0.999999  word2vector
14   en_Canada_h0  0   de_Canada_h4  14  0.999999  word

In [242]:
# Englis-German Aspects Linking Content-Headline

getWordVectorSpaceModelRun(en_content, de_headline, 'en', 'de', entity_name)

               0  1              2  3         4            5
0   en_Canada_h0  0   de_Canada_h1  0  0.999996  word2vector
1   en_Canada_h0  0   de_Canada_h0  1  0.999988  word2vector
2   en_Canada_h0  0   de_Canada_h6  2  0.999985  word2vector
3   en_Canada_h0  0   de_Canada_h5  3  0.999962  word2vector
4   en_Canada_h0  0   de_Canada_h7  4  0.999907  word2vector
5   en_Canada_h0  0  de_Canada_h14  5  0.999835  word2vector
6   en_Canada_h0  0  de_Canada_h11  6  0.999827  word2vector
7   en_Canada_h0  0   de_Canada_h9  7  0.999797  word2vector
8   en_Canada_h0  0   de_Canada_h8  8  0.999782  word2vector
9   en_Canada_h1  0   de_Canada_h1  0  0.999996  word2vector
10  en_Canada_h1  0   de_Canada_h0  1  0.999989  word2vector
11  en_Canada_h1  0   de_Canada_h6  2  0.999983  word2vector
12  en_Canada_h1  0   de_Canada_h5  3  0.999960  word2vector
13  en_Canada_h1  0   de_Canada_h7  4  0.999906  word2vector
14  en_Canada_h1  0  de_Canada_h14  5  0.999832  word2vector
15  en_Canada_h1  0  de_

In [243]:
# Englis-German Aspects Linking Headline-Content

getWordVectorSpaceModelRun(en_headline, de_content, 'en', 'de', entity_name)

               0  1              2   3         4            5
0   en_Canada_h0  0   de_Canada_h1   0  0.999989  word2vector
1   en_Canada_h0  0  de_Canada_h13   1  0.999988  word2vector
2   en_Canada_h0  0  de_Canada_h10   2  0.999988  word2vector
3   en_Canada_h0  0   de_Canada_h0   3  0.999988  word2vector
4   en_Canada_h0  0   de_Canada_h8   4  0.999988  word2vector
5   en_Canada_h0  0  de_Canada_h11   5  0.999988  word2vector
6   en_Canada_h0  0  de_Canada_h14   6  0.999988  word2vector
7   en_Canada_h0  0   de_Canada_h5   7  0.999988  word2vector
8   en_Canada_h0  0   de_Canada_h6   8  0.999988  word2vector
9   en_Canada_h0  0   de_Canada_h7   9  0.999988  word2vector
10  en_Canada_h0  0  de_Canada_h12  10  0.999987  word2vector
11  en_Canada_h0  0   de_Canada_h9  11  0.999987  word2vector
12  en_Canada_h0  0   de_Canada_h3  12  0.999987  word2vector
13  en_Canada_h0  0   de_Canada_h4  13  0.999987  word2vector
14  en_Canada_h0  0   de_Canada_h2  14  0.999987  word2vector
15  en_C

In [244]:
## input: 
    #1. list of preprocessing documents
    #2. language model
    #3. entity name
    #4. language: En/De
    #5. representation of aspects: headline/content
## output: for each documnet, a document vector will be produced

def getDocVectors(content, language_model):
    
    language_model = language_model

    words = []
    for word in language_model.vocab:
        words.append(word)

    doc_vectors = list()
    
    for i in range(len(content)):
        #print(content[i])
        vectorSum = [0.0000]*300
        l = 0
        for el in content[i]:
            if el in words:
                #print(list(vectors[el]))
                vectorSum = list(map(add, list(language_model[el]), vectorSum))
                l+=1
            #else:
                #print(el)
        #print(vectorSum)
        for m in range(len(vectorSum)):
            if vectorSum[m] != 0:
                vectorSum[m] = float(vectorSum[m])/l ###average the vector sum
            else:
                vectorSum[m] = vectorSum[m]

        doc_vectors.append(vectorSum)
    
    #doc_vectors_final = []
    #for vec in doc_vectors:
        #doc = []
        #for dim in vec:
            
            #doc.append(float(dim))
        
        #doc_vectors_final.append(doc)
            
        
    #with open(language+'_'+termname+'_'+aspect+".json", 'w') as f:
        #json.dump(doc_vectors_final, f)
  
    
    return doc_vectors 



In [245]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSumWE(vector):
    squaresum = 0
    for i in range(len(vector)):
        squaresum += vector[i]* vector[i]
        
    return squaresum

In [246]:
# input: two 1* n sparse matrixs, or single vector from query_vect/ train_tfidf matrixs
# return the cosine sim of two vectors, type: float

def getCosineSimilarityWE(query_vector, doc_vector):
    squaresum_query = getSquareSumWE(query_vector)
    squaresum_doc= getSquareSumWE(doc_vector)
    if math.sqrt(squaresum_query)*math.sqrt(squaresum_doc) > 0:
        sim = np.dot(query_vector, doc_vector)/(math.sqrt(squaresum_query)*math.sqrt(squaresum_doc))
        return sim
    else:
        return np.dot(query_vector, doc_vector)
        
 

In [247]:
## input:
# assume we want to link wiki-text in deutsch(arrow language) with the wiki-text in english(target):
# src_content: is the source page data of the wikipedia item in laguage a
# trs_content: is the translation data of the wikipedia item from language b to language a
# en_model: english language model
# de_model: deutsch language
# arrow_la: arrow language of the wiki-text
# target_la: target language of the wiki-text
# termname: entry name of the wiki-text

## return: RUN file for Trec_eval

def getWordVectorSpace2Run(en_content, de_content, DV_en, DV_de, la1, la2, termname):    
    
    ### get the similarity of query with each doc leader
    
    #get document vector for each aspect
    DV_en_content = DV_en
    DV_de_content = DV_de
    
    
    result = []
    for j in range(len(DV_en_content)):
        #print (DV_de_content[j][:10])
        
        sims = []
        results_ID = []
        results = []
        for i in range(len(DV_de_content)):
            s = getCosineSimilarityWE(DV_en_content[j], DV_de_content[i])
            #if type(s)is not None:
            #print(s)
            sims.append(s)
        
        for x in range(len(sims)):
            if sims[x] >0:
                results_ID.append(x)
                results.append(sims[x])
        
        #print(results, results_ID)
        
        if len(results) >0 and len(results_ID)>0:
            results, results_ID= zip(*sorted(zip(results, results_ID), reverse=True))      
       
        
        for m in range(len(results)):
            result.append([la1+'_'+termname+"_h"+str(j), 0, la2+'_'+termname+'_h'+str(results_ID[m]), m, results[m], 'wordembedding'])

    df = pd.DataFrame(result)
    if len(de_content[0]) > 10 and len(en_content[0])>10 :
        df.to_csv('WVSMFinal_'+termname+'_'+la1+'_'+la2+'_text.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0]) < 10 and len(en_content[0])<10:
        df.to_csv('WVSMFinal_'+termname+'_'+la1+'_'+la2+'_headline.txt', header=None, index=None, sep=' ', mode='a')
    elif len(de_content[0]) < 10 and len(en_content[0])>10:
        df.to_csv('WVSMFinal_'+termname+'_'+la1+'_'+la2+'_headline_text.txt', header=None, index=None, sep=' ', mode='a')
    else:
        df.to_csv('WVSMFinal_'+termname+'_'+la1+'_'+la2+'_text_headline.txt', header=None, index=None, sep=' ', mode='a')     
    
    print(df)

In [248]:
en_model = de_model

In [249]:
DV_en_headline = getDocVectors(en_headline, en_model)
DV_en_content = getDocVectors(en_content, en_model)

DV_de_headline = getDocVectors(de_headline, en_model)
DV_de_content = getDocVectors(de_content, en_model)

In [250]:
getWordVectorSpace2Run(en_headline, de_headline, DV_en_headline, DV_de_headline, 'en','de', entity_name)

                0  1              2   3         4              5
0    en_Canada_h0  0   de_Canada_h0   0  1.000000  wordembedding
1    en_Canada_h0  0   de_Canada_h6   1  0.386009  wordembedding
2    en_Canada_h0  0   de_Canada_h7   2  0.338255  wordembedding
3    en_Canada_h0  0  de_Canada_h11   3  0.319626  wordembedding
4    en_Canada_h0  0   de_Canada_h5   4  0.317648  wordembedding
5    en_Canada_h0  0   de_Canada_h8   5  0.314213  wordembedding
6    en_Canada_h0  0  de_Canada_h14   6  0.304370  wordembedding
7    en_Canada_h0  0   de_Canada_h3   7  0.302396  wordembedding
8    en_Canada_h0  0  de_Canada_h12   8  0.299774  wordembedding
9    en_Canada_h0  0  de_Canada_h10   9  0.287398  wordembedding
10   en_Canada_h0  0  de_Canada_h13  10  0.281832  wordembedding
11   en_Canada_h0  0   de_Canada_h1  11  0.269373  wordembedding
12   en_Canada_h0  0   de_Canada_h2  12  0.262909  wordembedding
13   en_Canada_h0  0   de_Canada_h9  13  0.255541  wordembedding
14   en_Canada_h0  0   de

In [251]:
getWordVectorSpace2Run(en_headline, de_content, DV_en_headline, DV_de_content, 'en','de', entity_name)

                0  1              2   3         4              5
0    en_Canada_h0  0   de_Canada_h1   0  0.504367  wordembedding
1    en_Canada_h0  0  de_Canada_h13   1  0.469583  wordembedding
2    en_Canada_h0  0   de_Canada_h0   2  0.468690  wordembedding
3    en_Canada_h0  0  de_Canada_h14   3  0.449778  wordembedding
4    en_Canada_h0  0   de_Canada_h8   4  0.447277  wordembedding
5    en_Canada_h0  0   de_Canada_h3   5  0.444058  wordembedding
6    en_Canada_h0  0   de_Canada_h5   6  0.443052  wordembedding
7    en_Canada_h0  0   de_Canada_h7   7  0.442166  wordembedding
8    en_Canada_h0  0   de_Canada_h4   8  0.442085  wordembedding
9    en_Canada_h0  0  de_Canada_h10   9  0.436385  wordembedding
10   en_Canada_h0  0   de_Canada_h2  10  0.435408  wordembedding
11   en_Canada_h0  0   de_Canada_h9  11  0.429380  wordembedding
12   en_Canada_h0  0  de_Canada_h11  12  0.424434  wordembedding
13   en_Canada_h0  0   de_Canada_h6  13  0.424254  wordembedding
14   en_Canada_h0  0  de_

In [252]:
getWordVectorSpace2Run(en_content, de_content, DV_en_content, DV_de_content, 'en','de', entity_name)

                0  1              2   3         4              5
0    en_Canada_h0  0   de_Canada_h0   0  0.991363  wordembedding
1    en_Canada_h0  0   de_Canada_h2   1  0.988022  wordembedding
2    en_Canada_h0  0  de_Canada_h11   2  0.987964  wordembedding
3    en_Canada_h0  0  de_Canada_h10   3  0.987465  wordembedding
4    en_Canada_h0  0   de_Canada_h7   4  0.986278  wordembedding
5    en_Canada_h0  0  de_Canada_h13   5  0.985742  wordembedding
6    en_Canada_h0  0   de_Canada_h5   6  0.985657  wordembedding
7    en_Canada_h0  0   de_Canada_h6   7  0.984104  wordembedding
8    en_Canada_h0  0   de_Canada_h3   8  0.983894  wordembedding
9    en_Canada_h0  0  de_Canada_h14   9  0.983029  wordembedding
10   en_Canada_h0  0   de_Canada_h9  10  0.979981  wordembedding
11   en_Canada_h0  0   de_Canada_h8  11  0.979915  wordembedding
12   en_Canada_h0  0  de_Canada_h12  12  0.978645  wordembedding
13   en_Canada_h0  0   de_Canada_h1  13  0.977994  wordembedding
14   en_Canada_h0  0   de

In [253]:
getWordVectorSpace2Run(en_content, de_headline, DV_en_content, DV_de_headline, 'en','de', entity_name)

                0  1              2   3         4              5
0    en_Canada_h0  0   de_Canada_h1   0  0.838983  wordembedding
1    en_Canada_h0  0   de_Canada_h6   1  0.756329  wordembedding
2    en_Canada_h0  0   de_Canada_h7   2  0.699145  wordembedding
3    en_Canada_h0  0   de_Canada_h2   3  0.635991  wordembedding
4    en_Canada_h0  0   de_Canada_h8   4  0.617280  wordembedding
5    en_Canada_h0  0  de_Canada_h14   5  0.612284  wordembedding
6    en_Canada_h0  0   de_Canada_h9   6  0.600598  wordembedding
7    en_Canada_h0  0   de_Canada_h5   7  0.587504  wordembedding
8    en_Canada_h0  0  de_Canada_h11   8  0.584766  wordembedding
9    en_Canada_h0  0   de_Canada_h3   9  0.575182  wordembedding
10   en_Canada_h0  0  de_Canada_h10  10  0.564185  wordembedding
11   en_Canada_h0  0  de_Canada_h12  11  0.541002  wordembedding
12   en_Canada_h0  0  de_Canada_h13  12  0.516156  wordembedding
13   en_Canada_h0  0   de_Canada_h0  13  0.457463  wordembedding
14   en_Canada_h0  0   de