In [7]:
import gensim
import numpy as np
import scipy
import scipy.spatial
import MeCab
import nltk
import xlrd
import string
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="ticks")

mecab = MeCab.Tagger("-Owakati")

In [19]:
# Select all words in the data file and compute the vocabulary. 
# Write the cross-lingual word embeddings for those words to a separate file.
# This will speed up loading word embeddings and save memory.

data_files = ["../data/olddata.xlsx", "../data/newdata.xlsx"]
vocab = set()
for fname in data_files:
    trans_data = xlrd.open_workbook(fname)
    sheet = trans_data.sheet_by_index(0)  
    for l in range(1, sheet.nrows):
        # tokenise Japanese texts
        rows = sheet.row_values(l, 0, sheet.ncols)
        token_ja = mecab.parse(rows[0].lower())
        vocab = vocab.union(set(token_ja.strip().split()))    
        # tokenise English texts
        vocab = vocab.union(set(nltk.word_tokenize(rows[1].lower())))

stop_words = ['(', ')', '[', ']', '@', '•', '`', '-', '❚❚', '●', '（√',  '×', '。', '＠']
add_words = ['I', 'like', 'hate', 'cat', 'cats', 'dog', 'dogs', 'banana', '好き', '嫌い', '猫', '犬', '私']
vocab = vocab - set(stop_words)
vocab = vocab.union(set(add_words))
print("No of unique words in the vocabulary = %d" % len(vocab))

# write the vocabulary to a file for debugging purposes
with open("../data/vocab.txt", 'w') as vocab_file:
    for word in vocab:
        vocab_file.write("%s\n" % word)

# Lets select the cross-lingual word embeddings for those words in the vocabulary.
cross_in_embeds_fname = "../data/ja-en.txt"
cross_out_embeds_fname = "../data/ja-en.sel"
first_line = True

with open(cross_in_embeds_fname) as cross_in:
    with open(cross_out_embeds_fname, 'w') as cross_out:
        for line in cross_in:
            if first_line:
                dim = int(line.split()[1])
                cross_out.write("%d %d\n" % (len(vocab), dim))
                first_line = False
            elif line.split()[0].lower() in vocab:
                cross_out.write(line)
    


No of unique words in the vocabulary = 2155


In [8]:
# Load the cross-lingual word embeddings.
#large_embeddings = gensim.models.KeyedVectors.load_word2vec_format('../data/ja-en.txt')
small_embeddings = gensim.models.KeyedVectors.load_word2vec_format('../data/ja-en.sel')

In [9]:
embeddings = small_embeddings

In [10]:
def clean_text(s):
    stop_words = ['(', ')', '[', ']', '@', '•', '`', '-', '❚❚', '●', '（√',  '×', '。', '＠']
    for ch in stop_words:
        s = s.replace(ch, ' ')
    return s

In [11]:
def wmd(source, target):
    distance = embeddings.wmdistance(source, target)
    return (distance, 0)

In [12]:
def mwmd(source, target):
    # remove words that are not in the vocabulary from source and target.
    source = list(filter(lambda x: x in embeddings, source))
    target = list(filter(lambda x: x in embeddings, target))
     
    n = len(source)
    m = len(target)
    
    # compute distances between words
    C = np.zeros((n, m), dtype=float)
    for i in range(n):
        for j in range(m):
            first, second = embeddings[source[i]],  embeddings[target[j]]
            first_norm, second_norm = np.linalg.norm(first), np.linalg.norm(second)
            if first_norm > 0:
                first = first / first_norm
            if second_norm > 0:
                second = second / second_norm            
            C[i,j] = scipy.spatial.distance.euclidean(first, second)
    
    # Initialise variables
    x = np.zeros(n + n*m, dtype=float)
    T = x[n:].reshape(n,m)
    y = x[:n]
    
    c = np.zeros_like(x)
    c[:n] = 1.0
    
    # Inequality constraints
    b_ub = np.zeros(n*m, dtype=float)
    A_ub = np.zeros((n*m, n + n*m), dtype=float)    
    for p in range(n*m):
        for q in range(n + n*m):
            if p % n == q:
                A_ub[p, q % n] = -1.0
            if (p // n) + 2 * (p % n) + n == q:
                A_ub[p,q] = C[p % n, p // n]    
    #print(A_ub)
    
    # Equality constraints for Eq. 5 (Columns in T must be stochastic)
    CA_eq = np.zeros((n, n + n*m), dtype=float)
    Cb_eq = np.ones(n, dtype=float)
    for p in range(n):
        for q in range(n + m*p, n + m + m*p):
            CA_eq[p,q] = 1.0
            
    # Equality constraints for Eq. 4 (Rows in T must be stochastic)
    RA_eq = np.zeros((m, n + n*m), dtype=float)
    Rb_eq = np.ones(m, dtype=float)
    for p in range(m):
        for q in range(n, n + n*m):
            if p == (q - n) % m:
                RA_eq[p,q] = 1.0
    
    # Double stochasticity
    #A_eq = np.concatenate((CA_eq, RA_eq), axis=0)
    #b_eq = np.concatenate((Cb_eq, Rb_eq), axis=0)    
    
    res = scipy.optimize.linprog(c, A_ub, b_ub, CA_eq, Cb_eq, method='simplex', options={'maxiter':10000})
    #res = scipy.optimize.linprog(c, A_ub, b_ub, method='simplex')
    status = {0 : "Optimization terminated successfully",
              1 : "Iteration limit reached",
              2 : "Problem appears to be infeasible",
              3 : "Problem appears to be unbounded",
              4 : "Serious numerical difficulties encountered"}
    if res.status > 0:
        print("\x1b[31m %s \x1b[0m" % status[res.status])
    
    if res.status == 2:
        # Infeasible problem. Drop equality constrains and try again.
        res = scipy.optimize.linprog(c, A_ub, b_ub, method='simplex') 
        distance_y = np.sum(res.x[:n])
        distance_TC = C.flatten().dot(res.x[n:])
        return (distance_y, 2)        
    
    if res.status == 0:        
        print("No of iterations to optimisation = %d" % res.nit)
        # objective is the sum of y_i.
        distance_y = np.sum(res.x[:n])
        #print("sum y = %f" % distance_y)
        distance_TC = C.flatten().dot(res.x[n:])
        #print("sum TC = %f" % distance_TC)
        return (distance_y, res.status)
    else:
        return (0, res.status)    
    
    

In [None]:
# We will compute the correlation between human ratings and semantic distances over all instances

trans_data = xlrd.open_workbook('../data/olddata.xlsx')  #open the Excel spreadsheet as workbook
sheet = trans_data.sheet_by_index(0)  
instances = []
for l in range(1, sheet.nrows):
    # tokenise Japanese texts
    rows = sheet.row_values(l, 0, sheet.ncols)
    instances.append((rows[0], rows[1], float(rows[2])))
print("Total number of instances = %d" % len(instances))

# 1000 random integers between 0 and 50

human_ratings = []
distances = []
bad_count = 0
for x in instances:
    source = list(set(mecab.parse(clean_text(x[0]).lower().strip('\n')).split()))
    target = list(set(nltk.word_tokenize(clean_text(x[1]).lower().strip())))
    res = mwmd(source, target)
    if res[1] > 0:
        bad_count += 1
    else:
        distances.append(res[0])
        human_ratings.append(x[2])

print("Failed cases = %d" % bad_count)

# convert distances to similarity and scale to [0,1]
human_ratings = np.array(human_ratings)
#human_ratings = 1.0 - (human_ratings / np.max(human_ratings))
human_ratings = human_ratings
distances = np.array(distances)
distances = 1.0 - (distances / np.max(distances))
spr = scipy.stats.spearmanr(human_ratings, distances)
pearson = scipy.stats.pearsonr(human_ratings, distances)
print("Spearman Full", spr)
print("Pearson Full", pearson)

# Plot linear regression line
fit = np.polyfit(human_ratings, distances, 1)
fit_fn = np.poly1d(fit) 
plt.plot(human_ratings, fit_fn(human_ratings), '--k')

sortinds = np.argsort(human_ratings)
distances = distances[sortinds]
human_ratings = human_ratings[sortinds]
N = len(sortinds) // 2
low_human, high_human = human_ratings[: N], human_ratings[N:]
low_sim, high_sim = distances[:N], distances[N:]
print("Sperman Low", scipy.stats.spearmanr(low_human, low_sim))
print("Sperman High", scipy.stats.spearmanr(high_human, high_sim))
print("Pearson Low", scipy.stats.pearsonr(low_human, low_sim))
print("Pearson High", scipy.stats.pearsonr(high_human, high_sim))

# Compute accuracy. For low_human, predicted value must be less than or equal, 
# and for high_human predicted value must be greater than or equal to be correct.

corrects = 0
for (x,y) in zip(low_human, low_sim):
    if fit_fn(x) >= y:
        corrects += 1
for (x,y) in zip(high_human, high_sim):
    if fit_fn(x) <= y:
        corrects += 1
print("Accuracy = ", float(100 * corrects) / float(len(distances)))
plt.plot(low_human, low_sim, 'b*', high_human, high_sim, 'r+')
plt.xlabel("Human Ratings")
plt.ylabel("Translation Quality")
plt.title("Spearman = %f, Pearson = %f" % (spr[0], pearson[0]))
plt.show()


Total number of instances = 30
No of iterations to optimisation = 218
No of iterations to optimisation = 362
No of iterations to optimisation = 302
No of iterations to optimisation = 438
No of iterations to optimisation = 167
No of iterations to optimisation = 163
No of iterations to optimisation = 298
No of iterations to optimisation = 308
No of iterations to optimisation = 376
No of iterations to optimisation = 349
No of iterations to optimisation = 490
No of iterations to optimisation = 347
No of iterations to optimisation = 378
No of iterations to optimisation = 452


In [7]:
# We provide a simple UI for entering source (Japanese) and target (English) texts to compare.

def Comparison(Source_Ja, Target_En):
    source = list(set(mecab.parse(Source_Ja.lower().strip('\n')).split()))
    target = list(set(nltk.word_tokenize(Target_En.lower().strip())))
    print(source, target)
    distance = mwmd(source, target)[0]
    print("Semantic distance = %f\n" % distance)

interact_manual(Comparison, Source_Ja='私は猫が好きです', Target_En="I like cats");

The installed widget Javascript is the wrong version.


In [None]:
# Process a dataset, predict similarities and save to a file.
trans_data = xlrd.open_workbook('../data/newdata.xlsx')  
sheet = trans_data.sheet_by_index(0)  
scores = []
for l in range(1, sheet.nrows):
    rows = sheet.row_values(l, 0, sheet.ncols)
    source = list(set(mecab.parse(clean_text(rows[0]).lower().strip('\n')).split()))
    target = list(set(nltk.word_tokenize(clean_text(rows[1]).lower().strip())))
    #res = mwmd(source, target)
    res = wmd(source, target)
    val = -1 if res[1] > 0 else res[0]
    scores.append(val)

scores = np.array(scores)
max_val = np.max(scores)
print("max val", max_val)
scores = 1.0 - (scores / max_val)
with open("../data/pred-sims.csv", "w") as out_file:
    for val in scores:
        print(val)
        out_file.write("%f\n" % val)
  
    


TODO:
-	L2 normalised, l1 normalised vs. non-normalised (3 options)
-	Y_sum vs TC_sum (2 options)
-	Full vocabulary vs. restricted vocabulary (2 options)
-	Row stochasticity, column stochasticity (2 options)

- Do 3 x 2 x 2 x 2 = 24 experiments and produce the correlation plots. Prepare a table summarising the results (Spearman, Pearson for Full, High and Low, and accuracy)
24 rows and 7 columns excel sheet!
Decide which setting is the best.

* Once the answer to this question is known, we will score the newdataset using wmd and the best version of the proposed method and get humans to judge.
