In [1]:
import json
import numpy as np
import pandas as pd
import math
from operator import add
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import MWETokenizer
from string import punctuation
import jieba
import hanziconv
from hanziconv import HanziConv

# Loading word embeddings

# Data Preprocessing functions

In [9]:
def traditionalToSimplified(file):
    """Convert every string in ``file`` to Simplified Chinese, in place.

    Each entry is passed through ``HanziConv.toSimplified`` and the result is
    written back to the same position of the list. Nothing is returned.
    """
    for position, text in enumerate(file):
        converted = HanziConv.toSimplified(text)
        file[position] = ''.join(converted)
        
# Stop words removed from tokenized Chinese text.
# The driver loop converts text to Simplified Chinese (traditionalToSimplified)
# before stop words are removed, so the Traditional-only spellings below could
# never match. The Simplified counterparts are therefore listed as well; the
# original Traditional entries are kept so existing behaviour is a subset.
# NOTE(review): Simplified forms are assumed to match HanziConv's output
# (e.g. '著' -> '着', '妳們' -> '妳们' or '你们') - confirm against the converter.
jieba_stop_words = [
    '的', '了', '和', '是', '就', '都', '而', '及', '與',
    '著', '或', '一個', '沒有', '我們', '你們', '妳們',
    '他們', '她們', '是否',
    # Simplified counterparts of the Traditional entries above
    '与', '着', '一个', '没有', '我们', '你们', '妳们',
    '他们', '她们',
]

def chineseTokenize(file):
    """Segment every string in ``file`` with jieba, in place.

    Each entry is cut with ``jieba.cut(..., cut_all=False, HMM=True)`` and the
    resulting segments are stored back as one space-separated string.
    Nothing is returned.
    """
    for position, sentence in enumerate(file):
        segments = jieba.cut(sentence, cut_all=False, HMM=True)
        file[position] = ' '.join(segments)
def chineseStopwordsRemoval(file, stop_words=None):
    """Drop stop words from every space-separated document in ``file``, in place.

    Parameters
    ----------
    file : list of str
        Documents whose tokens are separated by whitespace. Each entry is
        replaced by the same document with stop-word tokens removed.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the module-level ``jieba_stop_words``.

    Returns
    -------
    None
        The list is modified in place.
    """
    if stop_words is None:
        stop_words = jieba_stop_words
    blocked = frozenset(stop_words)

    # Two fixes compared with the previous version:
    #  * the write index now advances, so every document is updated instead of
    #    slot 0 being overwritten on each iteration;
    #  * documents are split into tokens; iterating the string directly walked
    #    it character by character, so multi-character stop words never matched.
    for position, document in enumerate(file):
        kept = [token for token in document.split() if token not in blocked]
        file[position] = ' '.join(kept)


# Extra symbols appended to ``string.punctuation``: full-width CJK punctuation,
# a few regex-looking fragments, the wiki marker '编辑' and several multi-character
# punctuation runs. ``punctuation`` stays a plain str, so the ``in punctuation``
# checks elsewhere in this cell are substring tests.
# Backslashes are written explicitly ('\\s' etc.): the former '\s', '\.', '\!'
# and '\/' were invalid escape sequences that only worked by accident and emit
# a warning on current Python versions. The resulting text is unchanged.
_EXTRA_PUNCTUATION = ''.join([
    '；',
    "：《》「 」“”[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——！，。？、~@#￥%……&*（）]+",
    '编辑',
    '%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—."', '.,',
])
# Guarded so that re-running the cell does not keep growing the string.
if not punctuation.endswith(_EXTRA_PUNCTUATION):
    punctuation = punctuation + _EXTRA_PUNCTUATION
def removePunctuation(file):
    """Lower-case each document and drop its punctuation tokens, in place.

    Every entry of ``file`` is lower-cased and split on whitespace; a token is
    discarded when it occurs inside the module-level ``punctuation`` string
    (a substring test, since ``punctuation`` is a str). The surviving tokens
    are re-joined with single spaces and written back. Nothing is returned.
    """
    for position, text in enumerate(file):
        kept = []
        for token in text.lower().split():
            if token in punctuation:
                continue
            kept.append(token)
        file[position] = ' '.join(kept)


from nltk.tokenize import MWETokenizer
def MWEtokenize(el):
    """Split ``el`` on whitespace and merge known multi-word expressions.

    Parameters
    ----------
    el : str
        Space-separated tokens (already lower-cased by the caller).

    Returns
    -------
    list of str
        The tokens of ``el``; adjacent tokens forming one of the registered
        expressions are merged into a single token by ``MWETokenizer``
        (joined with its default separator, e.g. ``'new_york'``).

    NOTE(review): merged tokens only contribute to a document vector if the
    embedding vocabulary contains the joined form - confirm for the models used.
    """
    # MWETokenizer takes a *list* of expressions, each a tuple of tokens.
    # The previous code passed a bare tuple to the constructor (registering the
    # individual characters of 'barack' and 'obama') and a whole list to
    # add_mwe() (registering one expression made of tuples), so no expression
    # was ever merged.
    tokenizer = MWETokenizer([
        ('barack', 'obama'),
        ('new', 'york'),
        ('hong', 'kong'),
        ('los', 'angeles'),
        ('san', 'francisco'),
        ('united', 'kingdom'),
    ])
    return tokenizer.tokenize(el.split())
        
def tokenize(file):
    """Tokenize every document in ``file`` in place, producing token lists.

    Each entry is lower-cased and split with NLTK's ``WordPunctTokenizer``.
    Tokens that occur inside the module-level ``punctuation`` string are
    dropped (a substring test, since ``punctuation`` is a str). The remaining
    tokens are re-joined with spaces and handed to ``MWEtokenize``, whose
    return value replaces the original entry. Nothing is returned.
    """
    splitter = WordPunctTokenizer()
    for position, text in enumerate(file):
        pieces = splitter.tokenize(str(text.lower()))
        cleaned = ' '.join(filter(lambda piece: piece not in punctuation, pieces))
        file[position] = MWEtokenize(cleaned)
        

# This statement repeats the extension of ``punctuation`` performed earlier in
# this cell. Appending unconditionally added the same characters a second time
# (and again on every re-run of the cell), so the append is now skipped when
# the extra symbols are already at the end of the string.
# Backslashes are written explicitly ('\\s' etc.) because '\s', '\.', '\!' and
# '\/' are invalid escape sequences; the resulting text is unchanged.
_EXTRA_PUNCTUATION = ''.join([
    '；',
    "：《》「 」“”[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——！，。？、~@#￥%……&*（）]+",
    '编辑',
    '%.[', '(/', ');[', '"),', ').', '.[', ',[', '][', '("', '."[', '—."', '.,',
])
if not punctuation.endswith(_EXTRA_PUNCTUATION):
    punctuation = punctuation + _EXTRA_PUNCTUATION



# Document-vector building function

In [3]:
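## input:
    #1. content: list of preprocessed documents
    #2. language_model: word-embedding model used to look up word vectors
    #3. termname: entity name
    #4. language: language tag used in the output file name (e.g. En/De/zh)
    #5. aspect: which part of a section is represented: Headline/Content
    #6. WE_source: word-embedding source tag, used as the output file name prefix
## output: for each document, a document vector is produced; all vectors are saved to a JSON file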

def getDocVectors(content, language_model, termname, language, aspect, WE_source):
    """Average the word vectors of each document and save the result as JSON.

    Parameters
    ----------
    content : sequence
        One entry per document. An entry is either an iterable of tokens or a
        single string of whitespace-separated tokens; strings are split into
        tokens first (previously a string was walked character by character).
    language_model : object
        Embedding model exposing ``vocab`` (container of known words) and
        ``language_model[word]`` (the word's vector). If it has a
        ``vector_size`` attribute that dimension is used, otherwise 300.
    termname, language, aspect, WE_source : str
        Parts of the output file name
        ``<WE_source>_<language>_<termname>_<aspect>.json``, written to the
        current working directory.

    Returns
    -------
    None
        One vector (list of floats) per document is written to the JSON file.
        Documents without any in-vocabulary token get an all-zero vector.
    """
    # Membership is tested on the model's own vocabulary container instead of
    # a copied list, which avoids a linear scan of the vocabulary per token.
    vocab = language_model.vocab
    dimension = getattr(language_model, 'vector_size', 300)

    doc_vectors = []
    for document in content:
        tokens = document.split() if isinstance(document, str) else document

        total = [0.0] * dimension
        hits = 0
        for token in tokens:
            if token in vocab:
                # float() turns numpy scalars into plain floats so the sums
                # are JSON-serialisable.
                total = [acc + float(value)
                         for acc, value in zip(total, language_model[token])]
                hits += 1

        if hits:
            # Average over the tokens that were found in the vocabulary.
            total = [acc / hits for acc in total]
        doc_vectors.append(total)

    with open(WE_source+'_'+language+'_'+termname+'_'+aspect+".json", 'w') as f:
        json.dump(doc_vectors, f)


# run the function! get and save the document vectors!

# Word Embedding _BP_

In [4]:
from gensim.models import KeyedVectors

# importing wordembedding and building the language model

# --- load the pre-trained embeddings (word2vec text format) ---
print('loading English word embeddings...')
en_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.en.transformed.vec')

'''print('loading German word embeddings...')
de_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.de.transformed.vec')'''

print('loading Chinese word embeddings...')
zh_model = KeyedVectors.load_word2vec_format('/home/hahou/WordEmbedding/wiki.zh.transformed.vec')

# --- collect each model's vocabulary and report its size ---
en_words = list(en_model.vocab)
print("Number of English Tokens: {}".format(len(en_words)))

'''de_words = []
for word in de_model.vocab:
    de_words.append(word)

# Printing out number of tokens available
print("Number of German Tokens: {}".format(len(de_words)))'''

zh_words = list(zh_model.vocab)
print("Number of Chinese Tokens: {}".format(len(zh_words)))


loading English word embeddings...
loading Chinese word embeddings...
Number of English Tokens: 2519370
Number of Chinese Tokens: 332647


In [10]:
# define the entity list
# (alternative entity sets are kept below, commented out)
#entity_list = ['United_Kingdom', 'Italy', 'Asia','Europe','Canada', 'China', 'France', 'Germany', 'Japan']

#entity_list =  ['Russia','singapore','India', 'Israel','Brazil','Philippines'] 
#entity_list = ['Barack_Obama', 'Donald_Trump','New_York_City',
#'London','Singapore','Hong_Kong','Dubai','Los_Angeles','Paris','Chicago','Washington,_D.C.','San_Francisco',
#'Mumbai','Rome','Toronto','Philadelphia','Monaco','Tokyo','Amsterdam','Boston','Barcelona','Peking']

# Every assignment used to be commented out, so the loop below failed with a
# NameError on a fresh kernel. The recorded output of this cell shows
# 'United_States', so that list is the active one.
entity_list = ['United_States']

# tag used as the prefix of the output file names
vector_source = 'BP'

for entity in entity_list:
    # loading source data
    print('entity: '+entity)
    with open('/home/hahou/WikiDataCrawling/English Corpus/source_en_'+entity+'.json') as json_data:
        source_en = json.load(json_data)
    #with open('/home/hahou/WikiDataCrawling/German Corpus/source_de_'+entity+'.json') as json_data:
    #    source_de = json.load(json_data)
    with open('/home/hahou/WikiDataCrawling/Chinese Corpus/source_zh_'+entity+'.json') as json_data:
        source_zh = json.load(json_data)

    # extract headlines (the keys of each source dictionary)
    en_headline = list(source_en.keys())
    #de_headline = list(source_de.keys())
    zh_headline = list(source_zh.keys())

    # extract content: the text stored under each headline, joined into one string
    en_content = []
    #de_content = []
    zh_content = []
    for el in en_headline:
        en_content.append(''.join(source_en[el]))
    #for el in de_headline:
    #    de_content.append(''.join(source_de[el]))
    for el in zh_headline:
        zh_content.append(''.join(source_zh[el]))

    # preprocessing for en and de (in place)
    tokenize(en_content)
    #tokenize(de_content)
    tokenize(en_headline)
    #tokenize(de_headline)

    # preprocessing for zh (in place; the order of the steps matters)
    traditionalToSimplified(zh_content)
    chineseTokenize(zh_content)
    removePunctuation(zh_content)
    chineseStopwordsRemoval(zh_content)

    traditionalToSimplified(zh_headline)
    chineseTokenize(zh_headline)
    removePunctuation(zh_headline)
    chineseStopwordsRemoval(zh_headline)

    # get the document vectors for headlines of entry page section
    getDocVectors(en_headline, en_model, entity,'En', 'Headline', vector_source)
    #getDocVectors(de_headline, de_model, entity,'De', 'Headline', vector_source)
    getDocVectors(zh_headline, zh_model, entity,'zh', 'Headline', vector_source)

    # get the document vectors for Contents of entry page section
    # (this comment line was missing its leading '#', which made the whole cell a SyntaxError)
    # NOTE(review): the English content call is left disabled as found, although
    # en_content is still tokenized above - confirm whether it should run.
    #getDocVectors(en_content, en_model, entity,'En', 'Content', vector_source)
    #getDocVectors(de_content, de_model, entity,'De', 'Content', vector_source)
    getDocVectors(zh_content, zh_model, entity,'zh', 'Content', vector_source)
   

entity: United_States
