In [1]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\D070678\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\D070678\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from googletrans import Translator
from collections import defaultdict

In [3]:
import json
import math

# base knowledge:
1. each Wikipedia term has a term name
2. for each Wikipedia term, its English page is crawled as the source text in language a ("en"): source_a
3. for each Wikipedia term, its German page is crawled as the source text in language b ("de"): source_b
4. each source page in language b ("de") is translated into language a and saved as translation_b_to_a
5. links between source_a and source_b are represented by the links between source_a and translation_b_to_a



# getTranslationRun Function

In [4]:
### translate the text in the format of dictionary from src language into dest language

##input:
# text: text in dictionary format {}
# src_b: source language b of the text 
# dest_a: target language a the text to be translated into
# termname: term name of the text

def getTranslationRun(text, src_b, dest_a, termname):
    """Translate a crawled page (headline -> section text) from ``src_b`` into ``dest_a``.

    Parameters
    ----------
    text : dict
        Maps each headline (str) to its section text: a list of paragraph
        strings, or a single string.
    src_b : str
        Language code of ``text`` (e.g. ``'de'``). It is passed to the
        translator so that short headlines are not left to language
        auto-detection.
    dest_a : str
        Language code to translate into (e.g. ``'en'``).
    termname : str
        Term name of the page. Not used by the translation itself; kept so
        existing callers keep working.

    Returns
    -------
    collections.defaultdict(list)
        Translated headline -> list of translated paragraphs. Sections whose
        headlines translate to the same string end up under the same key.
    """
    dict_trans = defaultdict(list)

    # One client for the whole page; only built when there is something to translate.
    translator = Translator() if text else None

    for headline, paragraphs in text.items():
        # A bare string would come back as a single result object, which cannot be iterated below.
        if isinstance(paragraphs, str):
            paragraphs = [paragraphs]

        translated_headline = translator.translate(text=headline, src=src_b, dest=dest_a)
        translated_paragraphs = translator.translate(text=list(paragraphs), src=src_b, dest=dest_a)
        for item in translated_paragraphs:
            dict_trans[translated_headline.text].append(item.text)

    print(dict_trans)
    return dict_trans

In [5]:
import requests


# data preprocessing Function

In [6]:
# data preprocessing function

def toLowerList(List):
    """Lower-case every string of ``List`` in place and return that same list object."""
    for position, entry in enumerate(List):
        List[position] = entry.lower()
    return List

def tokenize(file):
    """Tokenize every entry of ``file`` in place.

    Each entry is converted with ``str()``, split by ``WordPunctTokenizer`` and
    stored back as the tokens joined by single spaces. Returns nothing.
    """
    splitter = WordPunctTokenizer()
    for position, document in enumerate(file):
        tokens = splitter.tokenize(str(document))
        file[position] = ' '.join(tokens)

def removeStopwords(file):
    """Lower-case every document of ``file`` and drop the words listed in ``esw``, in place.

    Words are the whitespace-separated pieces of the lower-cased document; the
    remaining words are stored back joined by single spaces. Returns nothing.
    """
    for position, document in enumerate(file):
        kept = []
        for word in document.lower().split():
            if word in esw:
                continue
            kept.append(word)
        file[position] = ' '.join(kept)

# Extra symbols appended to string.punctuation (this adds the em dash to the known characters).
# The guard makes the cell idempotent: re-running it no longer keeps growing the string.
_EXTRA_PUNCTUATION = ''.join(('%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—', '."', '.,'))
if not punctuation.endswith(_EXTRA_PUNCTUATION):
    punctuation = punctuation + _EXTRA_PUNCTUATION


def removePunctuation(file):
    """Lower-case every document of ``file`` and drop pure-punctuation tokens, in place.

    A token (whitespace-separated piece) is dropped when every one of its
    characters occurs in the module-level ``punctuation`` string. The previous
    substring test (``word not in punctuation``) only removed character runs
    that happened to appear contiguously in that string, so tokens such as
    ``...`` or ``?!`` were kept. Returns nothing.
    """
    punctuation_chars = set(punctuation)
    for position, document in enumerate(file):
        kept = [
            word for word in document.lower().split()
            if not all(char in punctuation_chars for char in word)
        ]
        file[position] = ' '.join(kept)

def stemming(file):
    """Lower-case every document of ``file`` and Porter-stem each word, in place.

    Words are the whitespace-separated pieces of the lower-cased document; the
    stems are stored back joined by single spaces. Returns nothing.
    """
    stemmer = PorterStemmer()
    for position, document in enumerate(file):
        stems = map(stemmer.stem, document.lower().split())
        file[position] = ' '.join(stems)

def removeNumbers(file):
    """Drop every all-digit token from each document of ``file``, in place.

    Tokens are the whitespace-separated pieces of the document (case is left
    untouched); the remaining tokens are stored back joined by single spaces.
    Returns nothing.
    """
    for position, document in enumerate(file):
        kept = filter(lambda token: not token.isdigit(), document.split())
        file[position] = ' '.join(kept)

        
def _lemmatize_pos(file, pos):
    """Lower-case every document of ``file`` and lemmatize each word with WordNet tag ``pos``, in place."""
    lemmatizer = WordNetLemmatizer()
    for position, document in enumerate(file):
        file[position] = ' '.join(
            lemmatizer.lemmatize(word, pos=pos) for word in document.lower().split()
        )


def lemmatize_n(file):
    """Lemmatize every word of every document in ``file`` as a noun (``pos='n'``), in place."""
    _lemmatize_pos(file, 'n')


def lemmatize_v(file):
    """Lemmatize every word of every document in ``file`` as a verb (``pos='v'``), in place."""
    _lemmatize_pos(file, 'v')


def lemmatize_a(file):
    """Lemmatize every word of every document in ``file`` as an adjective (``pos='a'``), in place."""
    _lemmatize_pos(file, 'a')


def lemmatize_r(file):
    """Lemmatize every word of every document in ``file`` as an adverb (``pos='r'``), in place."""
    _lemmatize_pos(file, 'r')

In [7]:
#preprocessing: tokenize, remove stopwords, punctuation and numbers, then lemmatize (no stemming is applied)

def preprocessing(content):
    """Normalise the documents of ``content`` in place.

    Runs, in this order: tokenization, stop-word removal, punctuation removal,
    number removal, then lemmatization as noun, verb, adjective and adverb.
    Nothing is returned; ``content`` itself is modified.
    """
    pipeline = (
        tokenize,
        removeStopwords,
        removePunctuation,
        removeNumbers,
        lemmatize_n,   # nouns
        lemmatize_v,   # verbs
        lemmatize_a,   # adjectives
        lemmatize_r,   # adverbs
    )
    for step in pipeline:
        step(content)

# DTM, Query-vector building

In [8]:
##### build functions to generate document-term matrix

## get DTM, weighted by tf-idf; every non-empty row vector is L2-normalised (its length is 1)
# therefore, in the retrieval phase,
# the dot product of a doc vector and a query vector equals their cosine similarity

# input format: train_doc_file.text

def get_DTM_tfidf(file):
    """Build the tf-idf weighted document-term matrix of ``file``.

    ``file`` is an iterable of document strings. The columns follow the key
    order of the vocabulary learned in the first pass, which is the same
    ordering the query-vector builders in this notebook derive from the
    documents. Returns the matrix produced by ``TfidfTransformer``.
    """
    # First pass only learns the vocabulary (no stop-word filtering is configured here).
    vocab_learner = CountVectorizer()
    vocab_learner.fit_transform(file)
    terms = list(vocab_learner.vocabulary_.keys())

    # Second pass counts the terms with the column order fixed to ``terms``.
    counts = CountVectorizer(vocabulary=terms).fit_transform(file)

    return TfidfTransformer().fit_transform(counts)


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    """Build the raw term-count document-term matrix of ``file``.

    ``file`` is an iterable of document strings. The columns follow the key
    order of the vocabulary learned in the first pass. Returns the count
    matrix produced by ``CountVectorizer``.
    """
    # First pass only learns the vocabulary (no stop-word filtering is configured here).
    vocab_learner = CountVectorizer()
    vocab_learner.fit_transform(file)
    terms = list(vocab_learner.vocabulary_.keys())

    # Second pass counts the terms with the column order fixed to ``terms``.
    return CountVectorizer(vocabulary=terms).fit_transform(file)


In [9]:
###Generate query vector for each query

# get_QueryVector_tfidf returns the tf-idf weighted query vectors; every non-empty query vector is L2-normalised (its length is 1)
# therefore, in the retrieval phase,
# the dot product of a doc vector and a query vector equals their cosine similarity

# input format: train_doc_file.text, train_query_file.text


def get_QueryVector_tfidf(queryFile, docFile):
    """Build query vectors over the vocabulary of ``docFile``.

    Parameters
    ----------
    queryFile : str or iterable of str
        A single query string, or a collection of query strings.
    docFile : iterable of str
        The documents; only used to learn the vocabulary (and thereby the
        column order, which matches ``get_DTM_tfidf`` for the same documents).

    Returns
    -------
    numpy.ndarray or sparse matrix
        * ``str`` query: a 1-D array of term counts scaled to unit length
          (no idf weighting is applied in this branch). If no query term is in
          the vocabulary, the all-zero vector is returned.
        * collection of queries: the tf-idf matrix, one row per query.
    """
    vocab_learner = CountVectorizer()  # default settings: no stop-word filtering
    vocab_learner.fit_transform(docFile)
    vocabulary = list(vocab_learner.vocabulary_.keys())

    if isinstance(queryFile, str):
        # Count every occurrence of a term; the previous loop could only ever record 0 or 1.
        occurrences = defaultdict(int)
        for token in queryFile.split():
            occurrences[token] += 1
        counts = np.array([occurrences.get(term, 0) for term in vocabulary], dtype=float)

        norm = np.linalg.norm(counts)
        # Dividing by a zero norm would turn the whole vector into NaN.
        return counts / norm if norm > 0 else counts

    query_counts = CountVectorizer(analyzer="word", vocabulary=vocabulary).fit_transform(queryFile)
    # NOTE(review): the idf weights are fitted on the queries themselves, not on docFile — confirm this is intended.
    return TfidfTransformer().fit_transform(query_counts)



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    """Build term-frequency query vectors over the vocabulary of ``docFile``.

    Parameters
    ----------
    queryFile : str or iterable of str
        A single query string, or a collection of query strings.
    docFile : iterable of str
        The documents; only used to learn the vocabulary (and thereby the
        column order, which matches ``get_DTM_tf`` for the same documents).

    Returns
    -------
    list of int or sparse matrix
        * ``str`` query: one count per vocabulary term, in vocabulary order.
        * collection of queries: the count matrix, one row per query.
    """
    vocab_learner = CountVectorizer()  # default settings: no stop-word filtering
    vocab_learner.fit_transform(docFile)
    vocabulary = list(vocab_learner.vocabulary_.keys())

    if isinstance(queryFile, str):
        # Count every occurrence of a term; the previous loop could only ever record 0 or 1.
        occurrences = defaultdict(int)
        for token in queryFile.split():
            occurrences[token] += 1
        return [occurrences.get(term, 0) for term in vocabulary]

    return CountVectorizer(analyzer="word", vocabulary=vocabulary).fit_transform(queryFile)

# cosine similarity Function

In [10]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    """Return the sum of the squared entries in row 0 of ``vector``.

    ``vector`` must expose ``shape[1]`` and support ``vector[0, i]`` indexing
    (a 1 x n matrix); only the first row is read.
    """
    first_row = (vector[0, column] for column in range(vector.shape[1]))
    return sum(value * value for value in first_row)

In [11]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def getCosineSimilarity(query_vector, doc_vector):
    """Return the cosine similarity of two 1 x n row vectors.

    Both arguments must support ``.dot``, ``.transpose()`` and the indexing
    used by ``getSquareSum`` (a 1 x n sparse matrix or 2-D array). When either
    vector has zero length, the raw dot product is returned unnormalised.
    """
    # The vector's own ``.dot`` is used because NumPy's ``np.dot`` is not aware of SciPy sparse matrices.
    dot_product = query_vector.dot(doc_vector.transpose())[0, 0]

    norm_product = math.sqrt(getSquareSum(query_vector)) * math.sqrt(getSquareSum(doc_vector))
    if norm_product > 0:
        return dot_product / norm_product
    return dot_product
        
 

# getVectorSpaceModelRun Function

Link headlines in the source text with headlines in the translated text through cosine similarity.

In [12]:
# src_content: the source page data of the Wikipedia item in language a
# trs_content: the translation of the Wikipedia item from language b into language a
# we link the headlines of the Wikipedia item in different languages through the translated text and the source text
# furthermore, the link between headlines is predicted through the similarity between the text vectors below the headlines
# src_b: source language b of the text
# dest_a: target language a the text is translated into
# termname: term name of the text

# return: RUN file for Trec_eval

def _vsm_run_filename(termname, query_first, doc_first, text_min_words=10):
    """Pick the run-file name from the word counts of the first query and the first document."""
    # More than ``text_min_words`` words counts as "text", anything else as "headline".
    # An entry of exactly 10 words used to match neither "> 10" nor "< 10" and
    # always fell through to the last branch.
    query_is_text = len(query_first.split()) > text_min_words
    doc_is_text = len(doc_first.split()) > text_min_words

    if query_is_text and doc_is_text:
        kind = 'text'
    elif not query_is_text and not doc_is_text:
        kind = 'headline'
    elif doc_is_text:
        kind = 'headline_text'
    else:
        kind = 'text_headline'
    return termname + '_de_en_' + kind + '_VSM.txt'


def getDocumentVectorSpaceModelRun(trs_content, src_content, src_b, dest_a, termname):
    """Rank documents against queries by cosine similarity and append the run to a file.

    Parameters
    ----------
    trs_content : list of str
        The query side: one string per headline.
    src_content : list of str
        The document side: one string per headline. The vocabulary is learned
        from these strings.
    src_b : str
        Prefix of the query IDs (``<src_b>_<termname>_h<j>``).
    dest_a : str
        Prefix of the document IDs (``<dest_a>_<termname>_h<i>``).
    termname : str
        Term name; used in the IDs and in the output file name.

    Side effects
    ------------
    Appends one line per (query, document) pair with similarity > 0 to
    ``<termname>_de_en_<kind>_VSM.txt`` in the working directory, in the column
    order: query ID, 0, document ID, rank (starting at 0), similarity,
    ``cosinesims``. The resulting frame is printed. Returns nothing.

    NOTE(review): the file name always contains ``_de_en_`` regardless of
    ``src_b``/``dest_a``, and ``mode='a'`` means re-running the call appends the
    same rows again — confirm both are intended.
    """
    ### get the similarity of every query with every document
    queryVector = get_QueryVector_tfidf(trs_content, src_content)
    src_DTM = get_DTM_tfidf(src_content)

    rows = []
    for j in range(queryVector.shape[0]):
        scored = []
        for i in range(src_DTM.shape[0]):
            sim = getCosineSimilarity(queryVector[j], src_DTM[i,])
            if sim > 0:
                scored.append((sim, i))

        # Highest similarity first; ties are broken by the higher document index.
        scored.sort(reverse=True)

        for rank, (sim, doc_id) in enumerate(scored):
            rows.append([src_b+'_'+termname+"_h"+str(j), 0, dest_a+'_'+termname+'_h'+str(doc_id), rank, sim, 'cosinesims'])

    df = pd.DataFrame(rows)
    filename = _vsm_run_filename(termname, trs_content[0], src_content[0])
    df.to_csv(filename, header=None, index=None, sep=' ', mode='a')

    print(df)

# Make Functions Run!
1. load the data as source_a and source_b
2. translate source_b into translation_b_to_a
3. preprocess the data
4. make the baseline run (headlines)
5. make the cross-link run (contexts)



In [14]:
# loading source data
# Each file is parsed with json.load; later cells call .keys() on the results, i.e. they are used as dicts
# (presumably headline -> section text; verify against the crawler output).
# NOTE(review): both paths are absolute and machine-specific, and the files are opened with the
# platform default encoding — confirm both before running this on another machine.
with open('/Users/D070678/Documents/Others/MasterThesis/DataCrawling/source_de_Italy.json') as json_data:
    source_de = json.load(json_data)

with open('/Users/D070678/Documents/Others/MasterThesis/DataCrawling/source_en_Italy.json') as json_data:
    source_en = json.load(json_data)

In [15]:
## input:
# source_de: the object loaded from source_de_Italy.json in the cell above
# Translates the German page into English and keeps the result in translation_de_to_en.
# NOTE(review): the output saved with this cell is a JSONDecodeError, so translation_de_to_en
# may not have been produced by this run — confirm before relying on the cells below.

translation_de_to_en = getTranslationRun(source_de, 'de', 'en', 'Italy')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [31]:
# rename: alias the language-specific objects to the generic names
# (a = English source, b = German source) used by the cells below

source_a = source_en
source_b = source_de
# Disabled alternative: load a previously saved translation from disk instead of
# using the translation computed in this session.
#with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/translation_de_to_en_Barack_Obama.json') as json_data:
    #translation_b_to_a = json.load(json_data)
translation_b_to_a = translation_de_to_en


In [32]:
# extract headlines
src_headline = list(source_a.keys())
trs_headline = list(translation_b_to_a.keys())


def _join_paragraphs(paragraphs):
    """Return a section's text as one string, with paragraphs separated by a single space."""
    # A section stored as a plain string is returned as is (joining it would space out its characters).
    if isinstance(paragraphs, str):
        return paragraphs
    # Joining with '' glued the last word of one paragraph to the first word of the next.
    return ' '.join(paragraphs)


# extract content: one string per headline, in the same order as the headline lists
src_content = [_join_paragraphs(source_a[headline]) for headline in src_headline]
trs_content = [_join_paragraphs(translation_b_to_a[headline]) for headline in trs_headline]

In [33]:
# Normalise all four lists in place: both content lists first, then both headline lists.
for corpus in (src_content, trs_content, src_headline, trs_headline):
    preprocessing(corpus)

In [35]:
# predict headline links through context--context
# Keyword names are the function's own parameter names: its first parameter (trs_content)
# is the query side, here the English contents; the translated German contents are the documents.
# NOTE(review): termname is 'China' while the data loaded above is for 'Italy' — confirm.
getDocumentVectorSpaceModelRun(
    trs_content=src_content,
    src_content=trs_content,
    src_b='en',
    dest_a='de',
    termname='China',
)

               0  1            2  3         4           5
0    en_China_h0  0  de_China_h6  0  0.434614  cosinesims
1    en_China_h0  0  de_China_h4  1  0.420554  cosinesims
2    en_China_h0  0  de_China_h0  2  0.390972  cosinesims
3    en_China_h0  0  de_China_h3  3  0.315599  cosinesims
4    en_China_h0  0  de_China_h1  4  0.304167  cosinesims
5    en_China_h0  0  de_China_h2  5  0.285066  cosinesims
6    en_China_h0  0  de_China_h8  6  0.249078  cosinesims
7    en_China_h0  0  de_China_h7  7  0.240203  cosinesims
8    en_China_h0  0  de_China_h5  8  0.232270  cosinesims
9    en_China_h1  0  de_China_h0  0  0.244568  cosinesims
10   en_China_h1  0  de_China_h4  1  0.240183  cosinesims
11   en_China_h1  0  de_China_h6  2  0.212020  cosinesims
12   en_China_h1  0  de_China_h8  3  0.211630  cosinesims
13   en_China_h1  0  de_China_h3  4  0.189849  cosinesims
14   en_China_h1  0  de_China_h2  5  0.151710  cosinesims
15   en_China_h1  0  de_China_h5  6  0.149911  cosinesims
16   en_China_

In [41]:
# predict headline links through headline--headline
# Keyword names are the function's own parameter names: its first parameter (trs_content)
# is the query side, here the English headlines; the translated German headlines are the documents.
# NOTE(review): termname is 'China' while the data loaded above is for 'Italy' — confirm.
getDocumentVectorSpaceModelRun(
    trs_content=src_headline,
    src_content=trs_headline,
    src_b='en',
    dest_a='de',
    termname='China',
)

              0  1            2  3        4           5
0   en_China_h0  0  de_China_h0  0  0.57735  cosinesims
1   en_China_h2  0  de_China_h1  0  1.00000  cosinesims
2   en_China_h3  0  de_China_h2  0  1.00000  cosinesims
3   en_China_h6  0  de_China_h6  0  1.00000  cosinesims
4   en_China_h8  0  de_China_h7  0  1.00000  cosinesims
5  en_China_h10  0  de_China_h8  0  1.00000  cosinesims


In [42]:
# predict headline links through headline--content
# Keyword names are the function's own parameter names: its first parameter (trs_content)
# is the query side, here the English contents; the translated German headlines are the documents.
# NOTE(review): termname is 'China' while the data loaded above is for 'Italy' — confirm.
getDocumentVectorSpaceModelRun(
    trs_content=src_content,
    src_content=trs_headline,
    src_b='en',
    dest_a='de',
    termname='China',
)

               0  1            2  3         4           5
0    en_China_h0  0  de_China_h0  0  0.705477  cosinesims
1    en_China_h0  0  de_China_h6  1  0.283767  cosinesims
2    en_China_h0  0  de_China_h4  2  0.170788  cosinesims
3    en_China_h0  0  de_China_h8  3  0.094589  cosinesims
4    en_China_h0  0  de_China_h3  4  0.070885  cosinesims
5    en_China_h1  0  de_China_h0  0  0.820019  cosinesims
6    en_China_h2  0  de_China_h0  0  0.677936  cosinesims
7    en_China_h2  0  de_China_h6  1  0.310062  cosinesims
8    en_China_h2  0  de_China_h4  2  0.244929  cosinesims
9    en_China_h2  0  de_China_h8  3  0.155031  cosinesims
10   en_China_h2  0  de_China_h3  4  0.116179  cosinesims
11   en_China_h2  0  de_China_h1  5  0.077515  cosinesims
12   en_China_h3  0  de_China_h0  0  0.618631  cosinesims
13   en_China_h3  0  de_China_h7  1  0.042708  cosinesims
14   en_China_h3  0  de_China_h3  2  0.028602  cosinesims
15   en_China_h4  0  de_China_h0  0  0.746834  cosinesims
16   en_China_

In [43]:
# predict headline links through content--headline
# Keyword names are the function's own parameter names: its first parameter (trs_content)
# is the query side, here the English headlines; the translated German contents are the documents.
# NOTE(review): termname is 'China' while the data loaded above is for 'Italy' — confirm.
getDocumentVectorSpaceModelRun(
    trs_content=src_headline,
    src_content=trs_content,
    src_b='en',
    dest_a='de',
    termname='China',
)

               0  1            2  3         4           5
0    en_China_h0  0  de_China_h6  0  0.396956  cosinesims
1    en_China_h0  0  de_China_h4  1  0.309110  cosinesims
2    en_China_h0  0  de_China_h2  2  0.295799  cosinesims
3    en_China_h0  0  de_China_h0  3  0.281084  cosinesims
4    en_China_h0  0  de_China_h3  4  0.249949  cosinesims
5    en_China_h0  0  de_China_h7  5  0.236132  cosinesims
6    en_China_h0  0  de_China_h8  6  0.231211  cosinesims
7    en_China_h0  0  de_China_h5  7  0.210044  cosinesims
8    en_China_h0  0  de_China_h1  8  0.150412  cosinesims
9    en_China_h1  0  de_China_h0  0  0.134659  cosinesims
10   en_China_h1  0  de_China_h1  1  0.016013  cosinesims
11   en_China_h1  0  de_China_h3  2  0.014514  cosinesims
12   en_China_h2  0  de_China_h8  0  0.034853  cosinesims
13   en_China_h2  0  de_China_h1  1  0.022673  cosinesims
14   en_China_h2  0  de_China_h5  2  0.014248  cosinesims
15   en_China_h2  0  de_China_h3  3  0.010276  cosinesims
16   en_China_