In [1]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hailianhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
from googletrans import Translator
from collections import defaultdict

In [3]:
import json
import math

# base knowledge:
1. Each Wikipedia term has a term name.
2. For each Wikipedia term, its English page is crawled as the source text in language a ("en"): source_a.
3. For each Wikipedia term, its German page is crawled as the source text in language b ("de"): source_b.
4. Each source page in language b ("de") is translated into language a and saved as translation_b_to_a.
5. Links between source_a and source_b are represented by the links between source_a and translation_b_to_a.



# getTranslationRun Function

In [24]:
### translate the text in the format of dictionary from src language into dest language

##input:
# text: text in dictionary format {}
# src_b: source language b of the text 
# dest_a: target language a the text to be translated into
# termname: term name of the text

def getTranslationRun(text, src_b, dest_a, termname):
    """Translate every headline of ``text`` and the paragraphs below it.

    Parameters
    ----------
    text : dict
        Maps a headline to its paragraphs (a list of strings, or a single string).
    src_b : str
        Language code of ``text``; passed to the translator as the source language.
    dest_a : str
        Language code to translate into.
    termname : str
        Term name of the text. Not used by the translation; kept for callers.

    Returns
    -------
    collections.defaultdict(list)
        Translated headline -> list of translated paragraphs. A headline whose
        paragraph list is empty produces no entry.
    """
    translator = Translator()
    dict_trans = defaultdict(list)
    for headline, paragraphs in text.items():
        # A bare string would come back as one result object, which cannot be iterated below
        if isinstance(paragraphs, str):
            paragraphs = [paragraphs]
        translated_headline = translator.translate(text=headline, src=src_b, dest=dest_a)
        translated_paragraphs = translator.translate(text=paragraphs, src=src_b, dest=dest_a)
        for paragraph in translated_paragraphs:
            dict_trans[translated_headline.text].append(paragraph.text)

    print(dict_trans)
    return dict_trans

In [5]:
import requests


# data preprocessing Function

In [6]:
# data preprocessing function

def toLowerList(List):
    """Lower-case every string of ``List`` in place and return that same list."""
    for position, item in enumerate(List):
        List[position] = item.lower()
    return List

def tokenize(file):
    """Replace each entry of ``file`` (in place) by its space-joined WordPunct tokens.

    Every entry is converted with ``str`` first; nothing is returned.
    """
    splitter = WordPunctTokenizer()
    for position, entry in enumerate(file):
        pieces = splitter.tokenize(str(entry))
        file[position] = ' '.join(pieces)

def removeStopwords(file):
    """Lower-case each entry of ``file`` and drop the tokens listed in ``esw``, in place.

    Entries are split on whitespace and re-joined with single spaces; nothing is returned.
    """
    # esw is a list; a set makes the per-token membership test constant time
    stop_words = set(esw)
    for position, entry in enumerate(file):
        kept = [word for word in entry.lower().split() if word not in stop_words]
        file[position] = ' '.join(kept)

# string.punctuation extended with punctuation sequences met in the tokenized text
# (same resulting value as the original chained concatenation).
punctuation = punctuation + ''.join([
    '%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—."', '.,',
])


def removePunctuation(file):
    """Lower-case each entry of ``file`` and drop punctuation-only tokens, in place.

    A whitespace-separated token is dropped when every one of its characters
    occurs in the module-level ``punctuation`` string. This covers everything the
    former substring test removed and also catches sequences such as ``...`` or
    ``--`` that are not contiguous substrings of ``punctuation``.
    Nothing is returned.
    """
    punct_chars = set(punctuation)
    for position, entry in enumerate(file):
        kept = [
            word for word in entry.lower().split()
            if not all(ch in punct_chars for ch in word)
        ]
        file[position] = ' '.join(kept)

def stemming(file):
    """Lower-case each entry of ``file`` and Porter-stem its whitespace tokens, in place."""
    stemmer = PorterStemmer()
    for position, entry in enumerate(file):
        stems = [stemmer.stem(token) for token in entry.lower().split()]
        file[position] = ' '.join(stems)

def removeNumbers(file):
    """Drop digit-only tokens from each entry of ``file``, in place.

    Entries are split on whitespace and re-joined with single spaces; case is
    left untouched and nothing is returned.
    """
    for position, entry in enumerate(file):
        kept = [token for token in entry.split() if not token.isdigit()]
        file[position] = ' '.join(kept)

        
def _lemmatize_pos(file, pos):
    """Lower-case each entry of ``file`` and lemmatize its whitespace tokens with POS tag ``pos``, in place."""
    lemmatizer = WordNetLemmatizer()
    for position, entry in enumerate(file):
        lemmas = [lemmatizer.lemmatize(word, pos=pos) for word in entry.lower().split()]
        file[position] = ' '.join(lemmas)


def lemmatize_n(file):
    """Lemmatize every token of every entry as a noun (POS tag 'n'), in place."""
    _lemmatize_pos(file, 'n')


def lemmatize_v(file):
    """Lemmatize every token of every entry as a verb (POS tag 'v'), in place."""
    _lemmatize_pos(file, 'v')


def lemmatize_a(file):
    """Lemmatize every token of every entry as an adjective (POS tag 'a'), in place."""
    _lemmatize_pos(file, 'a')


def lemmatize_r(file):
    """Lemmatize every token of every entry as an adverb (POS tag 'r'), in place."""
    _lemmatize_pos(file, 'r')

In [7]:
#preprocessing: tokenize, remove stopwords, punctuation and numbers, then lemmatize (no stemming is applied)

def preprocessing(content):
    """Clean the list of strings ``content`` in place.

    Steps, in order: tokenize, remove stop words, remove punctuation tokens,
    remove digit-only tokens, then lemmatize with the POS tags 'n' (nouns),
    'v' (verbs), 'a' (adjectives) and 'r' (adverbs). Nothing is returned.
    """
    pipeline = (
        tokenize,
        removeStopwords,
        removePunctuation,
        removeNumbers,
        lemmatize_n,
        lemmatize_v,
        lemmatize_a,
        lemmatize_r,
    )
    for step in pipeline:
        step(content)

# DTM, Query-vector building

In [8]:
##### build functions to generate document-term matrix

## get DTM, weighted by tf-idf; each row (document vector) is L2-normalised to length 1
# therefore, in the retrieval phase,
# the dot product of a doc vector and a query vector can be used to represent the cosine similarity

# input format: train_doc_file.text

def get_DTM_tfidf(file):
    """Return the TF-IDF weighted document-term matrix of the documents in ``file``.

    The vocabulary is learnt in a first pass with a default ``CountVectorizer``
    (no stop-word list is passed). Its keys, in dictionary order, fix the column
    order of the matrix, the same way ``get_QueryVector_tfidf`` builds its columns.
    """
    # pass 1: learn the vocabulary only
    vocabulary = list(CountVectorizer().fit(file).vocabulary_.keys())

    # pass 2: count with that fixed column order, then weight with a default TfidfTransformer
    term_counts = CountVectorizer(vocabulary=vocabulary).fit_transform(file)
    return TfidfTransformer().fit_transform(term_counts)


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    """Return the raw term-count document-term matrix of the documents in ``file``.

    Columns follow the key order of the vocabulary learnt by a default
    ``CountVectorizer`` (no stop-word list is passed).
    """
    vocabulary = list(CountVectorizer().fit(file).vocabulary_.keys())
    return CountVectorizer(vocabulary=vocabulary).fit_transform(file)


In [9]:
###Generate query vector for each query

# get_QueryVector_tfidf returns the tf-idf weighted query vector, L2-normalised to length 1
# therefore, in the retrieval phase,
# the dot product of a doc vector and a query vector can be used to represent the cosine similarity

# input format: train_doc_file.text, train_query_file.text


def get_QueryVector_tfidf(queryFile, docFile):
    """Build normalised query vectors over the vocabulary of ``docFile``.

    Parameters
    ----------
    queryFile : str or iterable of str
        A single whitespace-separated query, or a collection of query texts.
    docFile : iterable of str
        Documents whose vocabulary (default ``CountVectorizer``, no stop-word
        list) defines the vector columns, in the vocabulary's key order.

    Returns
    -------
    numpy.ndarray or sparse matrix
        For a single string: a 1-D array of term frequencies scaled to unit L2
        length (all zeros when no query token is in the vocabulary).
        Otherwise: the matrix produced by ``TfidfTransformer`` with one row per query.
    """
    vocabulary = list(CountVectorizer().fit(docFile).vocabulary_.keys())

    if isinstance(queryFile, str):
        # Count every occurrence so a repeated query term weighs more than a single one
        occurrences = defaultdict(int)
        for token in queryFile.split():
            occurrences[token] += 1
        raw = np.array([occurrences.get(term, 0) for term in vocabulary], dtype=float)
        norm = np.linalg.norm(raw)
        # An all-zero vector stays all-zero instead of becoming NaN through 0/0
        return raw / norm if norm > 0 else raw

    # NOTE(review): the IDF weights are fitted on the queries themselves, not on docFile — confirm this is intended
    query_counts = CountVectorizer(analyzer="word", vocabulary=vocabulary).fit_transform(queryFile)
    return TfidfTransformer().fit_transform(query_counts)



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    """Build term-frequency query vectors over the vocabulary of ``docFile``.

    Parameters
    ----------
    queryFile : str or iterable of str
        A single whitespace-separated query, or a collection of query texts.
    docFile : iterable of str
        Documents whose vocabulary (default ``CountVectorizer``, no stop-word
        list) defines the vector columns, in the vocabulary's key order.

    Returns
    -------
    list of int or sparse matrix
        For a single string: one count per vocabulary term.
        Otherwise: the count matrix with one row per query.
    """
    vocabulary = list(CountVectorizer().fit(docFile).vocabulary_.keys())

    if isinstance(queryFile, str):
        # Count every occurrence so the vector holds frequencies, not mere presence
        occurrences = defaultdict(int)
        for token in queryFile.split():
            occurrences[token] += 1
        return [occurrences.get(term, 0) for term in vocabulary]

    return CountVectorizer(analyzer="word", vocabulary=vocabulary).fit_transform(queryFile)

# cosine similarity Function

In [10]:
# input: 1* n dims sparse matrix, or single vector from query_vect/ train_tfidf matrix
def getSquareSum(vector):
    """Return the sum of squared entries of row 0 of the 2-D ``vector``.

    ``vector`` must support ``.shape[1]`` and ``vector[0, i]`` indexing.
    """
    first_row = (vector[0, column] for column in range(vector.shape[1]))
    return sum(value * value for value in first_row)

In [11]:
# input: 1* n sparse matrix, or single vector from query_vect/ train_tfidf matrix
# return the cosine sim of two vectors

def getCosineSimilarity(query_vector, doc_vector):
    """Return the cosine similarity of two single-row vectors.

    When the product of the two norms is not positive, the plain dot product
    is returned instead of dividing.
    """
    norm_product = math.sqrt(getSquareSum(query_vector)) * math.sqrt(getSquareSum(doc_vector))
    dot_product = np.dot(query_vector, doc_vector.transpose())[0, 0]
    if norm_product > 0:
        return dot_product / norm_product
    return dot_product
        
 

# getVectorSpaceModelRun Function

Link headlines in the source text with headlines in the translated text through cosine similarity.

In [12]:
# src_content: the source page data of the Wikipedia item in language a
# trs_content: the translation of the Wikipedia item from language b into language a
# we link the headlines of the Wikipedia item in different languages through the translated text and the source text
# furthermore, the link between headlines is predicted from the similarity of the text vectors below the headlines
# src_b: source language b of the text
# dest_a: target language a the text is translated into
# termname: term name of the text

# return: RUN file for Trec_eval

def getDocumentVectorSpaceModelRun(trs_content, src_content, src_b, dest_a, termname):
    """Rank the entries of ``src_content`` for every entry of ``trs_content`` and append the run to a file.

    Parameters
    ----------
    trs_content : list of str
        Query texts; query ``j`` is labelled ``<src_b>_<termname>_h<j>``.
    src_content : list of str
        Document texts; document ``i`` is labelled ``<dest_a>_<termname>_h<i>``.
    src_b, dest_a : str
        Language prefixes used in the query and document labels.
    termname : str
        Term name used in the labels and as the output file name prefix.

    Side effects
    ------------
    Appends (``mode='a'``) one space-separated row per query/document pair with a
    positive cosine similarity to ``<termname>_de_en_<kind>_VSM.txt`` and prints
    the rows. ``<kind>`` depends on whether the first query and the first
    document have more than 10 whitespace-separated words ("text") or not
    ("headline").
    NOTE(review): the ``_de_en_`` part of the file name is fixed and ignores
    ``src_b`` / ``dest_a``.
    """
    queryVector = get_QueryVector_tfidf(trs_content, src_content)
    src_DTM = get_DTM_tfidf(src_content)

    result = []
    for j in range(queryVector.shape[0]):
        sims = [getCosineSimilarity(queryVector[j], src_DTM[i,]) for i in range(src_DTM.shape[0])]

        # keep positive similarities only; best first, ties broken by the higher document index
        ranked = sorted(
            ((score, doc_id) for doc_id, score in enumerate(sims) if score > 0),
            reverse=True,
        )
        for rank, (score, doc_id) in enumerate(ranked):
            result.append([
                src_b + '_' + termname + "_h" + str(j),
                0,
                dest_a + '_' + termname + '_h' + str(doc_id),
                rank,
                score,
                'cosinesims',
            ])

    df = pd.DataFrame(result)

    # Classify each side once. Inputs of exactly 10 words count as headlines; the
    # former ">10 / <10" chain let them fall through to the "text_headline" file.
    query_is_text = len(trs_content[0].split()) > 10
    doc_is_text = len(src_content[0].split()) > 10
    kind = {
        (True, True): 'text',
        (False, False): 'headline',
        (False, True): 'headline_text',
        (True, False): 'text_headline',
    }[(query_is_text, doc_is_text)]
    df.to_csv(termname + '_de_en_' + kind + '_VSM.txt', header=None, index=None, sep=' ', mode='a')

    print(df)

# Make Functions Run!
1. data loading as source_a and source_b
2. translating source_b into translation_b_to_a
3. data preprocessing 
4. make baseline run(headlines)
5. make cross link run(contexts)



In [20]:
# Load the crawled German and English source pages of one term.
# The encoding is given explicitly so reading does not depend on the platform default.
# NOTE(review): absolute, machine-specific paths — consider a configurable data directory.
with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_Germany.json', encoding='utf-8') as json_data:
    source_de = json.load(json_data)

with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_Germany.json', encoding='utf-8') as json_data:
    source_en = json.load(json_data)

In [25]:
## input:
# source_de: dict loaded from the German source JSON file
# NOTE(review): the term name passed here ('Canada') differs from the 'Germany' file loaded above — confirm which term is intended

translation_de_to_en = getTranslationRun(source_de, 'de', 'en', 'Canada')

defaultdict(<class 'list'>, {'Canada': ['Template: Infobox State / Maintenance / NAME-AMTSSPRACHETemplate: Infobox State / Maintenance / NAME-DEUTSCHTemplate: Infobox State / Maintenance / IMAGE-COAT-WIDTH', 'Canada (English and French Canada) is a state in North America that lies between the Atlantic in the east and the Pacific in the west and extends northward to the Arctic Ocean. Federal capital is Ottawa, the most populous city is Toronto. The only land border is the one to the USA in the south and in the northwest. Canada is the second largest country in the world after Russia, but the population is only about 36 million.', 'The colonization by First Nations began no later than 12,000 years ago, the Inuit followed around 5000 years ago. From the late 15th century, Europeans landed on the east coast and began colonization around 1600. At first Frenchmen and Englishmen settled down. During this time, the name "Canada" spread, which was originally a name for an Iroquois village. Fran

In [26]:
# Switch to language-neutral names: a = English ("en"), b = German ("de")
source_a, source_b = source_en, source_de
translation_b_to_a = translation_de_to_en


In [34]:
# Display the translated headline -> paragraphs mapping
translation_de_to_en

defaultdict(list,
            {'Administrative Division [edit | Edit]': ['Japan is a centralized state that merely passes on clearly defined tasks to subordinate local authorities. Japan is divided into three administrative levels, the central government in Tokyo, the 47 prefectures (todōfuken) and the municipal level (shikuchōson): Circular cities (shi), small towns (chō or machi), villages (mura or son) and in the prefecture Tokyo the 23 "[special] districts" ([tokubetsu-] ku).',
              'A coarse subdivision of Japan consists of the eight regions, which consist of one or more prefectures. They are not local authorities but are used by the administration for specific areas of responsibility (branches of the central government, regional governor conferences, judicial districts). Various reform plans for a Dōshūsei provide a stronger role for the regions - in existing or slightly differentiated distribution - to increase the capacity of local governments to act.',
              '

In [35]:
# extract headlines
src_headline = list(source_a.keys())
trs_headline = list(translation_b_to_a.keys())

# extract content: one string per headline.
# Paragraphs are joined with a space so the last word of one paragraph does not
# fuse with the first word of the next; a value that is already a string is kept as is.
src_content = []
trs_content = []
for el in src_headline:
    paragraphs = source_a[el]
    src_content.append(paragraphs if isinstance(paragraphs, str) else ' '.join(paragraphs))
for el in trs_headline:
    paragraphs = translation_b_to_a[el]
    trs_content.append(paragraphs if isinstance(paragraphs, str) else ' '.join(paragraphs))

In [36]:
# Clean contents first, then headlines; every list is modified in place
for texts in (src_content, trs_content, src_headline, trs_headline):
    preprocessing(texts)

In [37]:
# predict headline links through content--content:
# source contents are the queries (labelled 'en'), translated contents are the ranked documents (labelled 'de')

getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'de', 'Japan')

                0  1            2  3         4           5
0     en_Japan_h0  0  de_Japan_h0  0  0.398414  cosinesims
1     en_Japan_h0  0  de_Japan_h7  1  0.299273  cosinesims
2     en_Japan_h0  0  de_Japan_h2  2  0.294098  cosinesims
3     en_Japan_h0  0  de_Japan_h3  3  0.272842  cosinesims
4     en_Japan_h0  0  de_Japan_h9  4  0.268294  cosinesims
5     en_Japan_h0  0  de_Japan_h8  5  0.228979  cosinesims
6     en_Japan_h0  0  de_Japan_h1  6  0.223092  cosinesims
7     en_Japan_h0  0  de_Japan_h6  7  0.155474  cosinesims
8     en_Japan_h0  0  de_Japan_h5  8  0.143587  cosinesims
9     en_Japan_h0  0  de_Japan_h4  9  0.137428  cosinesims
10    en_Japan_h1  0  de_Japan_h1  0  0.467835  cosinesims
11    en_Japan_h1  0  de_Japan_h0  1  0.193397  cosinesims
12    en_Japan_h1  0  de_Japan_h3  2  0.155415  cosinesims
13    en_Japan_h1  0  de_Japan_h9  3  0.131259  cosinesims
14    en_Japan_h1  0  de_Japan_h7  4  0.113322  cosinesims
15    en_Japan_h1  0  de_Japan_h2  5  0.101052  cosinesi

In [38]:
# predict headline links through headline--headline:
# source headlines are the queries (labelled 'en'), translated headlines are the ranked documents (labelled 'de')

getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'de', 'Japan')

              0  1            2  3         4           5
0   en_Japan_h0  0  de_Japan_h0  0  1.000000  cosinesims
1   en_Japan_h3  0  de_Japan_h2  0  0.777096  cosinesims
2   en_Japan_h4  0  de_Japan_h5  0  0.777096  cosinesims
3   en_Japan_h6  0  de_Japan_h7  0  0.777096  cosinesims
4   en_Japan_h8  0  de_Japan_h8  0  0.926883  cosinesims
5  en_Japan_h12  0  de_Japan_h9  0  0.777096  cosinesims


In [39]:
# predict headline links through source content vs. translated headlines:
# source contents are the queries (labelled 'en'), translated headlines are the ranked documents (labelled 'de')

getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'de', 'Japan')

               0  1            2  3         4           5
0    en_Japan_h0  0  de_Japan_h0  0  0.933491  cosinesims
1    en_Japan_h0  0  de_Japan_h1  1  0.198099  cosinesims
2    en_Japan_h0  0  de_Japan_h3  2  0.184234  cosinesims
3    en_Japan_h0  0  de_Japan_h4  3  0.081709  cosinesims
4    en_Japan_h0  0  de_Japan_h7  4  0.073615  cosinesims
5    en_Japan_h1  0  de_Japan_h0  0  0.697848  cosinesims
6    en_Japan_h1  0  de_Japan_h1  1  0.587149  cosinesims
7    en_Japan_h2  0  de_Japan_h0  0  0.886857  cosinesims
8    en_Japan_h2  0  de_Japan_h9  1  0.321216  cosinesims
9    en_Japan_h2  0  de_Japan_h1  2  0.080622  cosinesims
10   en_Japan_h2  0  de_Japan_h3  3  0.080474  cosinesims
11   en_Japan_h2  0  de_Japan_h8  4  0.063855  cosinesims
12   en_Japan_h2  0  de_Japan_h7  5  0.048233  cosinesims
13   en_Japan_h2  0  de_Japan_h6  6  0.047671  cosinesims
14   en_Japan_h3  0  de_Japan_h0  0  0.989462  cosinesims
15   en_Japan_h3  0  de_Japan_h2  1  0.070785  cosinesims
16   en_Japan_

In [40]:
# predict headline links through source headlines vs. translated content:
# source headlines are the queries (labelled 'en'), translated contents are the ranked documents (labelled 'de')

getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'de', 'Japan')

               0  1            2  3         4           5
0    en_Japan_h0  0  de_Japan_h7  0  0.291532  cosinesims
1    en_Japan_h0  0  de_Japan_h2  1  0.248865  cosinesims
2    en_Japan_h0  0  de_Japan_h3  2  0.233580  cosinesims
3    en_Japan_h0  0  de_Japan_h9  3  0.224254  cosinesims
4    en_Japan_h0  0  de_Japan_h8  4  0.220102  cosinesims
5    en_Japan_h0  0  de_Japan_h0  5  0.156441  cosinesims
6    en_Japan_h0  0  de_Japan_h4  6  0.119017  cosinesims
7    en_Japan_h0  0  de_Japan_h1  7  0.113613  cosinesims
8    en_Japan_h0  0  de_Japan_h5  8  0.077078  cosinesims
9    en_Japan_h0  0  de_Japan_h6  9  0.062163  cosinesims
10   en_Japan_h2  0  de_Japan_h4  0  0.038231  cosinesims
11   en_Japan_h2  0  de_Japan_h2  1  0.022206  cosinesims
12   en_Japan_h2  0  de_Japan_h3  2  0.018758  cosinesims
13   en_Japan_h2  0  de_Japan_h9  3  0.012006  cosinesims
14   en_Japan_h2  0  de_Japan_h5  4  0.008843  cosinesims
15   en_Japan_h3  0  de_Japan_h2  0  0.037395  cosinesims
16   en_Japan_

In [33]:
entity_list = ['Germany', 'Japan','France','Italy', 'United_Kingdom', 'Asia','Europe']


for entity in entity_list:
    # loading source data (explicit encoding so reading does not depend on the platform default)
    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_de_'+entity+'.json', encoding='utf-8') as json_data:
        source_de = json.load(json_data)

    with open('/Users/hailianhou/Desktop/MasterThesis/FinalCode/DataCrawling/source_en_'+entity+'.json', encoding='utf-8') as json_data:
        source_en = json.load(json_data)

    # Translate the German page. The term name is the current entity; the former
    # `el` was a leftover headline variable from the inner loops below.
    translation_de_to_en = getTranslationRun(source_de, 'de', 'en', entity)

    # rename to language-neutral names: a = English, b = German
    source_a = source_en
    source_b = source_de

    translation_b_to_a = translation_de_to_en

    # extract headlines
    src_headline = list(source_a.keys())
    trs_headline = list(translation_b_to_a.keys())

    # extract content: one string per headline; paragraphs are joined with a space
    # so words at paragraph boundaries do not fuse
    src_content = []
    trs_content = []
    for headline in src_headline:
        paragraphs = source_a[headline]
        src_content.append(paragraphs if isinstance(paragraphs, str) else ' '.join(paragraphs))
    for headline in trs_headline:
        paragraphs = translation_b_to_a[headline]
        trs_content.append(paragraphs if isinstance(paragraphs, str) else ' '.join(paragraphs))

    # preprocessing (in place)
    preprocessing(src_content)
    preprocessing(trs_content)

    preprocessing(src_headline)
    preprocessing(trs_headline)

    # In every run the first list holds the queries (labelled 'en') and the
    # second list the ranked documents (labelled 'de').

    # content--content
    getDocumentVectorSpaceModelRun(src_content, trs_content, 'en', 'de', entity)

    # headline--headline
    getDocumentVectorSpaceModelRun(src_headline, trs_headline, 'en', 'de', entity)

    # source content vs. translated headlines
    getDocumentVectorSpaceModelRun(src_content, trs_headline,  'en', 'de', entity)

    # source headlines vs. translated content
    getDocumentVectorSpaceModelRun(src_headline, trs_content,  'en', 'de', entity)

    

defaultdict(<class 'list'>, {'Germany': ['Template: Infobox State / Maintenance / NAME-AMTSSPRACHETemplate: Infobox State / Maintenance / IMAGE-COAT-WIDTH', 'Germany ([.mw-parser-output .IPA a {text-decoration: none} dɔʏtʃlant]; full form: Federal Republic of Germany) is a federal state in Central Europe. It consists of 16 countries and is written as a liberal-democratic and social state of law. The Federal Republic of Germany is the most recent manifestation of the German nation state. Germany has over 82.5 million inhabitants and, with 232 inhabitants per km², is one of the densely populated territorial states.', "Germany borders nine states, the North and Baltic Seas in the north and the Alps in the south. It is located in the temperate zone and has sixteen national parks and over one hundred nature parks. The federal capital as well as the most densely populated German city is Berlin. Other cities with more than one million inhabitants are Hamburg, Munich and Cologne, the largest m

JSONDecodeError: Expecting value: line 1 column 1 (char 0)