In [1]:
# much of this code, particularly the encoder decoder code, is taken from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# the rest is adapted for this project but is still pretty similar

from io import open
import unicodedata
import string
import re
import random
import os

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

In [64]:
# for start and end of string
SOS_token = 0
EOS_token = 1

# Alphabet class (works with both pinyin and English)
class Alphabet:
    def __init__(self, name):
        self.name = name
        self.letter2index = {}
        self.letter2count = {}
        self.index2letter = {0: "SOS", 1: "EOS"}
        self.n_letters = 2
        
    def add_name(self, name):
        """
        Adds the characters of a name to the alphabet by iterating over them
        and updating the appropriate counts
        """
        # for pinyin we can use the syllables instead of the raw letters. 
        name = name.split(" ") # they'll still fall into the "letter" category though
            
        for letter in name: 
            if letter not in self.letter2index:
                self.letter2index[letter] = self.n_letters
                self.letter2count[letter] = 1
                self.index2letter[self.n_letters] = letter
                self.n_letters += 1
            else:
                self.letter2count[letter] += 1

                    
            


In [89]:
data_file = os.path.join("..", "data", "TransliterationSearchAndData.csv")

def normalize(s):
    """
    Right now just converts a string to lowercase but could be something more later
    (such as removing spaces)
    """
    s = re.sub(r"([-.·])", r"", s) # remove punctuation that seems to have seeped in (including chinese dash)
    return s.lower()

def read_alphabets():
    """
    Creates two alphabets, one for search output names and the other for ground truth pinyin
    Iterates through data file to initialize those alphabets
    """
    input_alph = Alphabet("SearchOutput")
    output_alph = Alphabet("Pinyin")
    pairs = []
    
    df = pd.read_csv(data_file)
    for row_i, row in df.iterrows():
        pinyin, search_output = row
        search_output = normalize(search_output)
        pinyin = normalize(pinyin)
        input_alph.add_name(search_output)
        output_alph.add_name(pinyin) # includes spaces
        pairs.append([search_output, pinyin])
     
    print("Input names have: {} characters".format(input_alph.n_letters))
    print("Output names have: {} characters".format(output_alph.n_letters))

    return input_alph, output_alph, pairs
        
srch_alph, pin_alph, pairs = read_alphabets()

Input names have: 148 characters
Output names have: 279 characters


In [66]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        # Embedding is really just a lookup table that takes an index input and returns some k-dimensional vector
        # input_size is size of table, hidden size is number of weights associated with each vector. We really only
        # need this to be a one-hot vector for our purposes so we can probably don't need to explicity represent an 
        # embedding. The smaller our embedding dimension the more information we're giving up
        self.embedding = nn.Embedding(input_size, hidden_size) 
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)#, device=device)

In [67]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # still need an embedding
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1) # assume because we have 1-d data
        
    def forward(self, x, hidden):
        output = self.embedding(x).view(1, 1, -1)
        output = F.relu(output) # regularization thing
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0])) # output is only going to have a single thing, so this is legal i guess
        return output, hidden
     

In [78]:
def indexesFromName(alphabet, name):
    return [alphabet.letter2index[l] for l in name.split(" ")]

def tensorFromName(alphabet, name):
    indexes = indexesFromName(alphabet, name)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromName(srch_alph, pair[0])
    target_tensor = tensorFromName(pin_alph, pair[1])
    return (input_tensor, target_tensor)


In [69]:
# how often we use the target input as input to our decoder rather than our decoder's guess
# while training 
teacher_forcing_ratio = 0.5 
MAX_LENGTH = 20


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
         criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden() # just 0's
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
    
    loss = 0 # mission accomplished ;)
    
    # actually run the thing that encodes
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        
    
    # now its decoder time - this part changes somewhat if you add attention
    decoder_input = torch.tensor([[SOS_token]])
    decoder_hidden = encoder_hidden # no need for an init function
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # target is next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di] # bc we're using teacher focing
            
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1) # returns a tuple of the largest value and its index as tensors
            decoder_input = topi.squeeze().detach() # I'm not totally sure what this does
            loss += criterion(decoder_output, target_tensor[di])
            
            if decoder_input.item() == EOS_token:
                break # we're done with this sentence - we don't have to do this above bc it goes to the end of the string automatically
    
    
    # backprop
    loss.backward()
    
    # run SGD which is in the encoder/decoder_optimizer object
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item()/target_length # not sure what this is, but we can see I guess


In [70]:
# copied directly for profiling...
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [71]:
# Actually do the training:

def trainIters(encoder, decoder, training_pairs, n_iters, print_every=1000, plot_every=100, learning_rate = 0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(training_pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss() # this is... negative log likelihood loss
                             # it's the same as cross-entropy loss bc of the log softmax in the last layer
    
    for iter_i in range(1, n_iters + 1):
        training_pair = training_pairs[iter_i -1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        
        print_loss_total += loss
        plot_loss_total += loss
        
        if iter_i % print_every == 0:
            print_loss_avg = print_loss_total / print_every # calc avg loss
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter_i / n_iters),
                                         iter_i, iter_i / n_iters * 100, print_loss_avg))
        # for plotting loss
        if iter_i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [72]:
%matplotlib inline

#shamelessly copied from tutorial... yikes
import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.show()

In [73]:
# Same as training, just no targets - just rum the thing through the network

def evaluate(encoder, decoder, name, max_len=MAX_LENGTH):
    with torch.no_grad(): # not totally sure what this does tbh - probably stops from updating gradients like we do in training because we are done with training
        input_tensor = tensorFromName(srch_alph, name)
        input_length = input_tensor.size(0) # just the size of the first dimension
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_len, encoder.hidden_size)
        
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(
                input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0] # is a vector 
        
        # decoder - would have to change if added attention
        decoder_input = torch.tensor([[SOS_token]])
        decoder_hidden = encoder_hidden
        decoded_name = []
        
        for di in range(max_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            
            # transliterate to actual words 
            if topi.item() == EOS_token:
                decoded_name.append("<EOS>")
                break
            else:
                decoded_name.append(pin_alph.index2letter[topi.item()])
            
            decoder_input = topi.squeeze().detach()
            
            
        return decoded_name

In [74]:
def evaluateRandomLines(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_name = evaluate(encoder, decoder, pair[0])
        print('<', ''.join(output_name[:-1]))
        print()

In [75]:
def evaluateAllLines(encoder, decoder, eval_pairs, spaces = False):
    distance = 0
    diff_count = 0
    for pair in eval_pairs:
        output_name = evaluate(encoder, decoder, pair[0])
        
        # maintain spaces
        if spaces:
            output_name  = ''.join(output_name[:-1])
            target_name = pair[1]
            print(output_name, target_name)
        else:
            # remove the space for edit distance calculations for consistency with baseline
            output_name = ''.join(filter(lambda l: l != ' ', output_name[:-1])) # need to get rid of the <EOS> string at end
            target_name = ''.join(filter(lambda l: l != ' ', pair[1]))
        if output_name != target_name:
            
            diff_count += 1
            distance += edit_distance_pinyin(target_name, output_name)
    print("Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)".format(len(eval_pairs), diff_count, distance/len(eval_pairs), distance/diff_count))


In [76]:
def train_and_evaluate(encoder, decoder):
    srch_names = [pair[0] for pair in pairs]
    pinyin_names = [pair[1] for pair in pairs]
    train_srch, test_srch, train_pin, test_pin = train_test_split(srch_names, pinyin_names, test_size=100)
    train_pairs = list(zip(train_srch, train_pin))
    test_pairs = list(zip(test_srch, test_pin))
    #trainIters(encoder, decoder, train_pairs, 75000, print_every=5000)
    
    evaluateAllLines(encoder, decoder, test_pairs)
    return train_pairs, test_pairs
    

In [86]:
# now actually do the thing!
hidden_size = 20
encoder = EncoderRNN(srch_alph.n_letters, hidden_size)
decoder = DecoderRNN(hidden_size, pin_alph.n_letters)

train, test = train_and_evaluate(encoder, decoder)

Out of 100 names, 100 were different, with an average edit distance of 53.67 (53.67 for just the different pairs)


In [87]:
evaluateRandomLines(encoder, decoder, 10)

> bā ér è lè
= dá ruì ěr
< kēnánjīkǎnpānjīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎn

> bì bā
= lì wá
< kēyínànànànànànànànànànànànànànànànànà

> bā cuì è
= jié kè
< kēnánjīkǎnpānjīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎn

> bo sà
= luó shā
< kēnánjīkǎnpānjīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎn

> bo sī è
= luó xī
< kēyíyuējījīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎnsòng

> bān
= yī bāng
< kēyíyuējījīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎnsòng

> de ān
= dí ēn
< kēyíyuējījīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎnsòng

> bā ruì bo
= mǎ lǐ ào
< kēyíyuējījīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎnsòng

> sài yà
= dài xī
< kēyíyuējījīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎnsòng

> bā bài è lè lè
= jiā bǐ lǐ lā
< kēnánjīkǎnpānjīkǎnsòngkǎnsòngkǎnsòngtīngpānjīkǎnpānjīkǎn



In [77]:
evaluateAllLines(encoder2, decoder2, [('zev', '')], spaces = True)

wéidì 
Out of 1 names, 1 were different, with an average edit distance of 5.0 (5.0 for just the different pairs)


In [34]:
# now actually do the thing!
hidden_size = 20
#encoder2 = EncoderRNN(eng_alph.n_letters, hidden_size)
#decoder2 = DecoderRNN(hidden_size, pin_alph.n_letters)

train, test = train_and_evaluate(encoder2, decoder2)
#trainIters(encoder2, decoder2, 75000, print_every=5000)

Out of 100 names, 93 were different, with an average edit distance of 3.54 (3.806451612903226 for just the different pairs)


In [74]:
evaluateRandomLines(encoder2, decoder2, 10)

> sampson
= xīn pǔ sēn
< shāmǔsēn

> boyd
= bù dé
< bùdé

> bernice
= bǎi nī sī
< bùlánní

> denise
= dān nī sī
< wéinísī

> martina
= mǎ dì nà
< mǎdìnà

> maurice
= mó lǐ sī
< mǎlǐsī

> cathy
= kǎi xī
< kǎixī

> janice
= zhān nī sī
< zhēnnīsī

> flora
= fú luō lā
< fúlālā

> trina
= cuī nà
< tètènà



In [88]:
import datetime
def save_model(encoder, decoder):
    torch.save(encoder.state_dict(), os.path.join("..", "models", "{date:%Y-%m-%d-%H:%M:%S}-encoder".format(date=datetime.datetime.now())))
    torch.save(decoder.state_dict(), os.path.join("..", "models", "{date:%Y-%m-%d-%H:%M:%S}-decoder".format(date=datetime.datetime.now())))

save_model(encoder1, decoder1)

In [84]:
def load_model(encoder_path, decoder_path):
    encoder1 = EncoderRNN(srch_alph.n_letters, hidden_size)
    decoder1 = DecoderRNN(hidden_size, pin_alph.n_letters)
    encoder1.load_state_dict(torch.load(encoder_path))
    decoder1.load_state_dict(torch.load(decoder_path))
    return encoder1, decoder1

'abcde'

In [32]:
sorted(pin_alph.index2letter.values())

['EOS',
 'SOS',
 'bo',
 'bài',
 'bào',
 'bái',
 'bèi',
 'bì',
 'bó',
 'bù',
 'bā',
 'bān',
 'bāng',
 'běn',
 'bīn',
 'bō',
 'bǎi',
 'bǎo',
 'bǐ',
 'chá',
 'chè',
 'cuì',
 'cuī',
 'cài',
 'de',
 'diān',
 'duō',
 'dà',
 'dài',
 'dào',
 'dá',
 'dèng',
 'dé',
 'dì',
 'dí',
 'dù',
 'dùn',
 'dān',
 'dāng',
 'dēng',
 'dī',
 'dīng',
 'dōng',
 'dōu',
 'dū',
 'dūn',
 'fàn',
 'fán',
 'fèi',
 'fù',
 'fú',
 'fāng',
 'fēi',
 'fēn',
 'fěi',
 'fū',
 'fǎ',
 'guān',
 'guǒ',
 'gài',
 'gè',
 'gé',
 'gān',
 'gāo',
 'gē',
 'gēn',
 'gū',
 'gǔ',
 'huá',
 'huò',
 'hàn',
 'hán',
 'háo',
 'hè',
 'hé',
 'hú',
 'hā',
 'hēng',
 'hū',
 'hǎi',
 'hǎn',
 'hǎo',
 'jié',
 'jiā',
 'jiǎ',
 'jiǎn',
 'jì',
 'jí',
 'jī',
 'jīn',
 'kuí',
 'kè',
 'kòu',
 'kù',
 'kāng',
 'kē',
 'kě',
 'kěn',
 'kūn',
 'kǎ',
 'kǎi',
 'kǎn',
 'kǎo',
 'lián',
 'liáng',
 'liè',
 'luò',
 'luó',
 'luō',
 'lài',
 'lái',
 'lán',
 'láo',
 'lè',
 'léi',
 'lì',
 'lìlù',
 'lín',
 'líng',
 'lóng',
 'lù',
 'lú',
 'lún',
 'lüè',
 'lā',
 'lēi',
 'lěi',
 'lǎng',


In [53]:
#df = pd.read_csv(data_file)
max([len(x) for x in df['first name']])

11

In [37]:
"../models/decoder-2018-11-14-23:32:47"

AttributeError: 'Series' object has no attribute 'contains'

In [15]:
# specifically for pinyin - doesn't penalize wrong tones as much
def edit_distance_pinyin(str1, str2):
    cache = {}
    def recurse(str1, str2):
        # base cases
        if len(str1) == 0:
            return len(str2)
        elif len(str2) == 0:
            return len(str1)
        
        if cache.get((str1, str2), -1) != -1:
            return cache[(str1, str2)]
        # recursive case
        if str1[0] == str2[0]:
            ed = recurse(str1[1:], str2[1:])
            cache[(str1, str2)] = ed
            return ed
        if have_diff_tones(str1[0], str2[0]):
            ed = 0.5 + recurse(str1[1:], str2[1:])
            cache[(str1, str2)] = ed
            return ed
        else:
            # min of insert into 1, insert into 2, replace
            ed = 1 + min(recurse(str1, str2[1:]), recurse(str1[1:], str2), recurse(str1[1:], str2[1:]))
            cache[(str1, str2)] = ed
            return ed

    return recurse(str1, str2) #- common_chars(str1, str2)

In [16]:
# specifically for pinyin 
def have_diff_tones(v1, v2):
    result = False
    if v1 in 'āáǎàa' and v2 in 'āáǎàa':
        return True
    elif v1 in 'ēéěèe' and v2 in 'ēéěèe':
        return True
    elif v1 in 'īíǐìi' and v2 in 'īíǐìi':
        return True
    elif v1 in 'ōóǒòo' and v2 in 'ōóǒòo':
        return True
    elif v1 in 'ūúǔùu' and v2 in 'ūúǔùu':
        return True
    elif v1 in 'ǖǘǚǜü' and v2 in 'ǖǘǚǜü':
        return True
    else:
        return False

In [17]:
edit_distance_pinyin('ben', 'běn')

0.5