In [None]:
import gensim
import numpy as np
import scipy
import scipy.spatial
import MeCab
import nltk
import xlrd
import string
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="ticks")

# Shared MeCab tagger used by the cells below to tokenise Japanese text.
# NOTE(review): "-Owakati" presumably selects whitespace-separated token
# output (later cells call .split() on the parse result) -- confirm against
# the MeCab documentation.
mecab = MeCab.Tagger("-Owakati")

In [None]:
# Collect every token that occurs in the data files into a vocabulary, then
# copy only the cross-lingual embeddings for those tokens into a smaller file.
# The smaller file is faster to load and needs less memory.
#
# NOTE(review): xlrd >= 2.0 no longer reads .xlsx workbooks; this cell needs
# xlrd < 2.0 unless the reader is swapped -- confirm the pinned version.

data_files = ["../data/olddata.xlsx", "../data/newdata.xlsx"]
vocab = set()
for fname in data_files:
    trans_data = xlrd.open_workbook(fname)
    sheet = trans_data.sheet_by_index(0)
    # Row 0 is skipped (presumably a header row -- confirm against the sheets).
    for l in range(1, sheet.nrows):
        rows = sheet.row_values(l, 0, sheet.ncols)
        # Column 0: Japanese text, tokenised with MeCab.
        token_ja = mecab.parse(rows[0].lower())
        # update() grows the set in place instead of copying it for every row.
        vocab.update(token_ja.strip().split())
        # Column 1: English text, tokenised with NLTK.
        vocab.update(nltk.word_tokenize(rows[1].lower()))

stop_words = ['(', ')', '[', ']', '@', '•', '`', '-', '❚❚', '●', '（√',  '×', '。', '＠']
#add_words = ['I', 'like', 'hate', 'cat', 'cats', 'dog', 'dogs', 'banana', '好き', '嫌い', '猫', '犬', '私']
vocab.difference_update(stop_words)
#vocab = vocab.union(set(add_words))
print("No of unique words in the vocabulary = %d" % len(vocab))

# Write the vocabulary to a file for debugging purposes. The words are sorted
# so the file is identical from run to run, and the encoding is explicit
# because the vocabulary contains Japanese text.
with open("../data/vocab.txt", 'w', encoding="utf-8") as vocab_file:
    for word in sorted(vocab):
        vocab_file.write("%s\n" % word)

# Select the cross-lingual word embeddings for the words in the vocabulary.
cross_in_embeds_fname = "../data/ja2en.txt"
cross_out_embeds_fname = "../data/ja2en.sel"

# Gather the matching lines first: the header of the output file must state
# how many vectors actually follow, and that is only known after the scan.
# Not every vocabulary word has an embedding, and case-folded matching can
# select several lines for one vocabulary entry, so len(vocab) is not the
# right count.
selected_lines = []
with open(cross_in_embeds_fname, encoding="utf-8") as cross_in:
    header = cross_in.readline().split()
    if len(header) < 2:
        raise ValueError("%s does not start with a '<count> <dim>' header line"
                         % cross_in_embeds_fname)
    dim = int(header[1])
    for line in cross_in:
        fields = line.split(None, 1)
        # Blank lines have no word field and are skipped.
        if fields and fields[0].lower() in vocab:
            selected_lines.append(line)

with open(cross_out_embeds_fname, 'w', encoding="utf-8") as cross_out:
    cross_out.write("%d %d\n" % (len(selected_lines), dim))
    cross_out.writelines(selected_lines)

In [None]:
# Read both cross-lingual embedding tables into memory: the complete table and
# the vocabulary-restricted selection written by the previous cell.
_load_vectors = gensim.models.KeyedVectors.load_word2vec_format
large_embeddings = _load_vectors('../data/ja2en.txt')
small_embeddings = _load_vectors('../data/ja2en.sel')

In [None]:
# Select the embedding table that the similarity code looks words up in.
# The full table is used here; small_embeddings (loaded above) is the
# vocabulary-restricted alternative.
embeddings = large_embeddings

In [None]:
def clean_text(s):
    """Return ``s`` with bracket, bullet and punctuation symbols blanked out.

    Each occurrence of a symbol listed below is substituted with a single
    space; all other characters are left as they are.
    """
    symbols = ('(', ')', '[', ']', '@', '•', '`', '-', '❚❚', '●', '（√',  '×', '。', '＠')
    cleaned = s
    for symbol in symbols:
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned

In [None]:
def tms(source, target, vectors=None):
    """Score how well the tokens of ``source`` are matched by ``target``.

    Tokens without an embedding are dropped. For every remaining source token
    the largest cosine similarity to any remaining target token is taken
    (floored at 0, so negative similarities never count), and the mean of
    those per-token maxima is returned.

    Parameters
    ----------
    source : iterable of str
        Tokens of the source sentence.
    target : iterable of str
        Tokens of the target sentence.
    vectors : mapping, optional
        Anything supporting ``word in vectors`` and ``vectors[word]``.
        Defaults to the notebook-level ``embeddings`` table.

    Returns
    -------
    float
        Mean of the per-source-token maximum similarities, or ``0.0`` when
        either sentence has no token with an embedding.
    """
    if vectors is None:
        vectors = embeddings  # notebook-level default embedding table

    source_vecs = [vectors[w] for w in source if w in vectors]
    target_vecs = [vectors[w] for w in target if w in vectors]
    if not source_vecs or not target_vecs:
        # Nothing to compare; avoids np.mean([]) returning NaN with a warning.
        return 0.0

    # Target norms do not depend on the source token, so compute them once.
    target_norms = [np.linalg.norm(vec) for vec in target_vecs]

    sim_max = []
    for src_vec in source_vecs:
        src_norm = np.linalg.norm(src_vec)
        best = 0
        for tgt_vec, tgt_norm in zip(target_vecs, target_norms):
            denom = src_norm * tgt_norm
            if denom == 0:
                # Cosine similarity is undefined for a zero vector.
                continue
            sim = np.dot(src_vec, tgt_vec) / denom
            if sim > best:
                best = sim
        sim_max.append(best)

    return np.mean(sim_max, axis=0)

In [None]:
# Read (Japanese text, English text, score) triples from the first sheet of
# the old data workbook; row 0 is skipped.
trans_data = xlrd.open_workbook('../data/olddata.xlsx')
sheet = trans_data.sheet_by_index(0)
instances = [
    (cells[0], cells[1], float(cells[2]))
    for cells in (sheet.row_values(idx, 0, sheet.ncols)
                  for idx in range(1, sheet.nrows))
]
print("Total number of instances = %d" % len(instances))

# Score every pair: clean and lowercase both sides, tokenise (MeCab for the
# first column, NLTK for the second), de-duplicate the tokens and pass them
# to tms().
similarity = []
for ja_text, en_text, _score in instances:
    ja_tokens = mecab.parse(clean_text(ja_text).lower().strip('\n')).split()
    en_tokens = nltk.word_tokenize(clean_text(en_text).lower().strip())
    similarity.append(tms(list(set(ja_tokens)), list(set(en_tokens))))

similarities = np.array(similarity)

# One predicted similarity per line, in instance order.
with open("../data/pred-sims-tms.csv", "w") as out_file:
    out_file.writelines("%f\n" % val for val in similarities)