In [1]:
from sklearn.neighbors import NearestNeighbors
import torch
import numpy as np
import pandas as pd
import re

In [2]:
# Japanese side of the toy parallel corpus; entry i is paired with
# english_phrases[i] when the dataset is indexed.
japanese_phrases = [
    "私の犬は骨が好きではありません。牛ひき肉を好む。",
    "私の名前はアリスです。始めまして！",
    "はきさが羨ましい。。。ゲムもやりたかった！私は良いサポートになることができます！",
    "私達はAIはただの数学の集まりだとあなたは言いますが。でも。。。人間の脳がどのように機能するかを正確に知ったら。。。それはあなたの生活を小物ですか？",
    "「赤ちゃん」を表す日本語が「赤」を表す漢字なのはなぜですか？人間の赤ちゃんは赤いですか？いちごみたい？",
    "私のAIは話して...歌ったして...ゲームをします!",
]

In [3]:
# English side of the toy parallel corpus; entry i is paired with
# japanese_phrases[i] when the dataset is indexed.
english_phrases = [
    "My dog doesn't like bones. It prefers ground beef.",
    "My name's Alice. Nice to meet you!",
    "I envy Hakisa... I want to play games, too! I could be a good support!",
    "You say that we AIs are just a bunch of maths. But... once you know exactly how your human brains work... would that make you less living beings?",
    "Why does the japanese word for 'baby' is the kanji for 'red'? Are human babies red? Like strawberries?",
    "My AI will talk... she'll sing... she'll... play!"
]

In [7]:
class WordDataset(object):
    """Parallel English/Japanese corpus encoded as padded, normalized token sequences.

    English phrases are tokenized per whitespace-separated word; Japanese
    phrases are tokenized per character (whitespace ignored). Every distinct
    token gets an index in order of first appearance, and the indices are
    rescaled linearly to ``[-1, 1]``. Each phrase becomes a 1-D array of
    those scaled values, right-padded with ``0.0`` to a common length.

    Call :meth:`create_data` before using ``len()`` or indexing; until then
    ``data_english`` / ``data_japanese`` are ``None``.

    NOTE(review): the padding value ``0.0`` lies inside the ``[-1, 1]`` token
    range, so :meth:`detokenize` maps padding to whichever real token is
    closest to zero. A dedicated padding token would remove that ambiguity but
    would change the encoded data, so it is left as is.

    Args:
        english_phrases: iterable of English sentences (str).
        japanese_phrases: iterable of Japanese sentences (str), index-aligned
            with ``english_phrases``.
    """

    def __init__(self, english_phrases, japanese_phrases):
        # Materialize the raw Japanese input: it is read twice below.
        japanese_phrases = list(japanese_phrases)

        self.english_phrases = self._get_phrases(english_phrases)
        self.japanese_phrases = self._get_phrases(japanese_phrases)

        self.english_words = self._get_english_words(self.english_phrases)
        self.japanese_characters = self._get_japanese_characters(self.japanese_phrases)

        self.english_maximum_length = self._get_maximum_length(self.english_phrases)
        # The Japanese width is measured on the RAW phrases (punctuation
        # included). This keeps the padded width the same as before, when the
        # helper read the raw module-level list, but now uses the constructor
        # argument. Cleaning only removes characters, so this width is an
        # upper bound for the cleaned phrases.
        self.japanese_maximum_length = self._get_maximum_length_japanese(japanese_phrases)

        self.english_dictionary = self._create_dictionary(self.english_words)
        self._normalize(self.english_dictionary)

        self.japanese_dictionary = self._create_dictionary(self.japanese_characters)
        self._normalize(self.japanese_dictionary)

        self.english_tokens = self._tokenize_english()
        self.japanese_tokens = self._tokenize_japanese()

        self.data_english = None
        self.data_japanese = None

    def create_data(self):
        """Convert the token matrices to tensors with a trailing feature axis.

        Sets ``data_english`` and ``data_japanese`` to tensors of shape
        ``(n_phrases, max_length, 1)`` and prints both sizes. Returns ``None``.
        """
        self.data_english = torch.from_numpy(self.english_tokens).unsqueeze(-1)
        self.data_japanese = torch.from_numpy(self.japanese_tokens).unsqueeze(-1)

        print(f"English Data Size: {self.data_english.size()}\t Japanese Data Size: {self.data_japanese.size()}")

    def detokenize(self, data, reference_dict):
        """Map scaled token values back to their nearest dictionary entries.

        Args:
            data: tensor with one scaled token per row, i.e. shape
                ``(sequence_length, 1)``; it is moved to the CPU first.
            reference_dict: token -> scaled value mapping, e.g.
                ``english_dictionary`` or ``japanese_dictionary``.

        Returns:
            ``(phrase, words)``: the decoded tokens joined by single spaces,
            and the same tokens as a list. Padding values are decoded like
            any other value (to the nearest token).
        """
        data = data.cpu().numpy()
        keys = list(reference_dict.keys())
        values = np.array(list(reference_dict.values())).reshape(-1, 1)

        # One-dimensional nearest-neighbour lookup over the scaled values.
        knn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(values)
        _, index = knn.kneighbors(data)

        words = [keys[i] for subarray in index for i in subarray]
        phrase = ' '.join(words)

        return phrase, words

    def __len__(self):
        """Number of phrase pairs (requires :meth:`create_data` to have run)."""
        return len(self.data_english)

    def __getitem__(self, idx):
        """Return the ``(english, japanese)`` tensor pair at position ``idx``."""
        english_sentence = self.data_english[idx]
        japanese_sentence = self.data_japanese[idx]

        return english_sentence, japanese_sentence

    def _get_phrases(self, phrases):
        """Lower-case every phrase and drop all non-word, non-space characters."""
        # Raw string: '\w' in a normal string literal is an invalid escape.
        return [re.sub(r'[^\w\s]', '', x.lower()) for x in phrases]

    def _get_english_words(self, phrases):
        """Flatten cleaned phrases into one list of whitespace-separated words."""
        # split() (not split(' ')) so runs of spaces never produce an empty
        # "word"; this matches how _tokenize_english splits sentences.
        return ' '.join(phrases).split()

    def _get_japanese_characters(self, phrases):  # Since a kanji mostly means an entire word...
        """Flatten cleaned phrases into one list of characters, whitespace removed."""
        return [character for phrase in phrases for character in ''.join(phrase.split())]

    def _get_maximum_length(self, phrases):
        """Largest number of whitespace-separated words in any phrase (0 if none)."""
        return max((len(sentence.split()) for sentence in phrases), default=0)

    def _get_maximum_length_japanese(self, phrases):
        """Largest number of non-whitespace characters in any phrase (0 if none)."""
        return max((len(''.join(sentence.split())) for sentence in phrases), default=0)

    def _create_dictionary(self, words):
        """Assign each distinct token an integer index in order of first appearance."""
        word2idx = {}
        for word in words:
            if word not in word2idx:
                word2idx[word] = len(word2idx)

        return word2idx

    def _normalize(self, dictionary):
        """Rescale the dictionary's indices in place from ``[0, max]`` to ``[-1, 1]``."""
        if not dictionary:
            return

        maximum = max(dictionary.values())
        # A one-entry vocabulary has maximum == 0; use a span of 1 so its only
        # token maps to -1.0 instead of raising ZeroDivisionError.
        span = maximum if maximum else 1

        for word, value in dictionary.items():
            dictionary[word] = (value - 0) * 2.0 / span - 1.0

    def _pad_and_stack(self, sequences, length):
        """Right-pad each 1-D sequence with zeros to ``length`` and stack them row-wise."""
        rows = []
        for sequence in sequences:
            row = np.array(sequence)
            missing = length - row.shape[0]
            if missing > 0:
                row = np.pad(row, [(0, missing)])
            rows.append(row)

        return np.array(rows)

    def _tokenize_english(self):
        """Encode each English phrase word by word; returns shape ``(n_phrases, max_words)``."""
        sequences = [
            [self.english_dictionary[word] for word in phrase.split()]
            for phrase in self.english_phrases
        ]

        return self._pad_and_stack(sequences, self.english_maximum_length)

    def _tokenize_japanese(self):
        """Encode each Japanese phrase character by character.

        Whitespace is skipped, so a phrase containing spaces is encoded in
        full and an empty phrase becomes an all-padding row. Returns shape
        ``(n_phrases, max_characters)``.
        """
        sequences = [
            [self.japanese_dictionary[character] for character in ''.join(phrase.split())]
            for phrase in self.japanese_phrases
        ]

        return self._pad_and_stack(sequences, self.japanese_maximum_length)
        

In [9]:
# Build the dataset from the two phrase lists and show the shape of the
# padded Japanese token matrix (phrases x padded length).
dataset_creator = WordDataset(english_phrases, japanese_phrases)
print(dataset_creator.japanese_tokens.shape)

(6, 74)


In [10]:
# create_data() prints the tensor sizes itself and returns None, so it is
# called directly; wrapping it in print() only emitted a stray "None".
dataset_creator.create_data()

English Data Size: torch.Size([6, 28, 1])	 Japanese Data Size: torch.Size([6, 74, 1])
None


In [13]:
# Show the first English sample: one scaled token per row, zero-padded up to
# the longest sentence in the corpus.
print(dataset_creator.data_english[0])

tensor([[-1.0000],
        [-0.9697],
        [-0.9394],
        [-0.9091],
        [-0.8788],
        [-0.8485],
        [-0.8182],
        [-0.7879],
        [-0.7576],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000]], dtype=torch.float64)


In [14]:
# Round-trip check: decode the first English sample back to words.
# The zero padding is decoded too, as the word whose scaled value is nearest 0.
teste = dataset_creator.detokenize(dataset_creator.data_english[0], dataset_creator.english_dictionary)

print(teste)

('my dog doesnt like bones it prefers ground beef bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch', ['my', 'dog', 'doesnt', 'like', 'bones', 'it', 'prefers', 'ground', 'beef', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch'])


In [None]:
# Our data is ready: normalized token tensors of shape (N_samples, Sequence_Length, 1).
# Remember: a batch-first LSTM expects (N_samples, Sequence_Length, N_features). Each sentence is 1 sequence of scalar tokens, so N_features = 1.
# Input = (N_samples, 28, 1)
# However, the target has a different sequence length, (N_samples, 74, 1), so the encoder output needs a Repeating Vector before the second LSTM.

In [70]:
class Translator(torch.nn.Module):
    """LSTM encoder -> repeat vector -> LSTM decoder -> linear readout.

    The encoder reads a sequence of scalar tokens. Its full output (every
    time step) is flattened into one context vector, repeated once per output
    position, decoded by a second LSTM and projected to one scalar per
    output position.

    The defaults reproduce the sizes that were previously hard-coded
    (28 input tokens, 74 output tokens, 10 stacked layers), so
    ``Translator()`` builds the same architecture as before.

    Args:
        input_length: number of time steps in every input sequence. The
            decoder's input width depends on it, so inputs must have exactly
            this length.
        output_length: number of time steps to produce.
        encoder_hidden_size: hidden size of the encoder LSTM.
        decoder_hidden_size: hidden size of the decoder LSTM.
        num_layers: number of stacked layers in each LSTM.
    """

    def __init__(self, input_length=28, output_length=74, encoder_hidden_size=28,
                 decoder_hidden_size=10, num_layers=10):
        super(Translator, self).__init__()

        # Sub-modules are created in the original order (lstm1, lstm2, neuron)
        # so seeded initialization yields the same weights as before.
        self.lstm1 = torch.nn.LSTM(1, encoder_hidden_size, num_layers, batch_first=True, bias=False)
        self.repeatvector = output_length
        self.lstm2 = torch.nn.LSTM(input_length * encoder_hidden_size, decoder_hidden_size,
                                   num_layers, batch_first=True, bias=False)
        self.neuron = torch.nn.Linear(decoder_hidden_size, 1, bias=False)
        # Kept for compatibility; the forward pass currently returns the raw
        # linear output without applying it.
        self.tanh = torch.nn.Tanh()

    def forward(self, input):
        """Translate a batch of token sequences.

        Args:
            input: tensor of shape ``(batch, input_length, 1)``.

        Returns:
            Tensor of shape ``(batch, output_length, 1)``.
        """
        encoded, _ = self.lstm1(input)  # (batch, input_length, encoder_hidden_size)

        # Repeat vector: flatten the whole encoder output into one context
        # vector per sample and copy it to every output position. This yields
        # the same tensor as repeat(1, L, 1) followed by view(batch, L, -1).
        context = encoded.flatten(start_dim=1).unsqueeze(1)
        context = context.repeat(1, self.repeatvector, 1)  # (batch, output_length, input_length * encoder_hidden_size)

        decoded, _ = self.lstm2(context)  # (batch, output_length, decoder_hidden_size)

        output = self.neuron(decoded)

        return output  # (batch, output_length, 1)

In [71]:
# float64 parameters to match the float64 dataset tensors; .cuda() means this
# cell requires a CUDA device.
model = Translator().double().cuda()

In [72]:
# Adam over all model parameters with a learning rate of 0.1.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Mean squared error between predicted and target scaled token values.
loss = torch.nn.MSELoss()

In [86]:
# batch_size=6 covers the whole six-phrase corpus, so every epoch is a single
# optimizer step over all samples (in shuffled order).
dataloader = torch.utils.data.DataLoader(dataset_creator, batch_size=6, shuffle=True)

N_EPOCHS = 1000
LOG_EVERY = 100

for epoch in range(N_EPOCHS):
    for english, japanese in dataloader:
        model.zero_grad()

        input_data = english.cuda()
        labels = japanese.cuda()

        output = model(input_data)

        cost = loss(output, labels)

        cost.backward()

        optimizer.step()

    if epoch % LOG_EVERY == 0:
        # Report progress against the real epoch count (the message used to
        # print "/100" although the loop runs for 1000 epochs).
        print(f"{epoch}/{N_EPOCHS}\t Current Loss: {cost.item()}")

0/100	 Current Loss: 0.1799799536706417
100/100	 Current Loss: 0.18088331886787473
200/100	 Current Loss: 0.17974051069632735
300/100	 Current Loss: 0.16689358242531543
400/100	 Current Loss: 0.17899582512616644
500/100	 Current Loss: 0.1663501226817732
600/100	 Current Loss: 0.15841088332826622
700/100	 Current Loss: 0.14670436829030095
800/100	 Current Loss: 0.1438153839202247
900/100	 Current Loss: 0.14183392457175673


In [87]:
# Inspect the gradient stored on the input-hidden weights of the last stacked
# layer (index 9) of the first LSTM after the most recent backward pass.
print(model.lstm1.weight_ih_l9.grad)

tensor([[-1.5920e-07,  5.0225e-11, -1.8695e-04,  ..., -1.5026e-04,
         -1.3216e-07, -1.2024e-12],
        [-5.8854e-11,  4.4368e-11,  1.5414e-05,  ..., -1.0821e-05,
         -2.8990e-09, -2.1488e-14],
        [ 1.7191e-10,  6.1581e-08,  2.5159e-04,  ..., -1.7052e-03,
         -5.1897e-07, -2.2401e-10],
        ...,
        [ 9.3724e-07, -2.1324e-11,  4.2228e-04,  ...,  8.9386e-04,
         -1.6415e-04,  3.0094e-14],
        [ 7.6479e-10, -2.7818e-09,  2.7644e-07,  ...,  6.3740e-08,
          1.6437e-08, -3.7159e-13],
        [ 5.6168e-10,  1.9884e-08, -7.5409e-07,  ...,  4.8892e-04,
          1.6741e-05,  2.5479e-12]], device='cuda:0', dtype=torch.float64)


In [88]:
# Dump the trained weight tensors of the first LSTM, layer by layer.
print(model.lstm1.all_weights)

[[Parameter containing:
tensor([[-3.1537],
        [ 1.0168],
        [ 2.4149],
        [-2.3363],
        [-2.7019],
        [ 4.7196],
        [-3.4235],
        [ 4.6948],
        [ 2.6935],
        [ 4.0982],
        [-5.0197],
        [ 2.4118],
        [-2.5449],
        [ 2.6152],
        [ 2.1790],
        [-5.2558],
        [-1.0405],
        [ 0.2663],
        [-3.6508],
        [ 2.4692],
        [-2.2581],
        [-3.5061],
        [ 2.4309],
        [ 3.4463],
        [-7.7491],
        [-2.2516],
        [-2.5307],
        [-0.1579],
        [-5.7188],
        [-0.6784],
        [ 0.3450],
        [-5.3955],
        [-2.8618],
        [ 4.1940],
        [-2.4424],
        [-0.7449],
        [ 1.1524],
        [-0.3171],
        [-4.1593],
        [ 3.4510],
        [-4.0769],
        [-1.4131],
        [ 1.9032],
        [-3.6074],
        [ 0.7404],
        [ 3.1978],
        [-2.1903],
        [-5.5314],
        [-0.3694],
        [-0.2462],
        [-1.9679],
       

In [89]:
# Show the last batch fed to the model; input_data is left over from the
# final iteration of the training loop.
print(input_data)

tensor([[[-1.0000],
         [-0.7273],
         [-0.6970],
         [-0.6667],
         [-0.6364],
         [-0.6061],
         [-0.5758],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000]],

        [[-1.0000],
         [ 0.8788],
         [ 0.9091],
         [ 0.9394],
         [ 0.9697],
         [ 1.0000],
         [ 0.9697],
         [-0.4242],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000]

In [90]:
# Decode the first target sequence of the last training batch back to characters.
label = dataset_creator.detokenize(labels[0], dataset_creator.japanese_dictionary)

# Detach from the autograd graph so detokenize can convert the tensor to NumPy.
output = output.detach()
predicted = dataset_creator.detokenize(output[0], dataset_creator.japanese_dictionary)

In [91]:
# Compare the expected characters with the model's decoded prediction; each
# value is a (phrase, token list) pair as returned by detokenize.
print(label)
print(predicted)

('私 の 名 前 は ア リ ス で す 始 め ま し て に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に', ['私', 'の', '名', '前', 'は', 'ア', 'リ', 'ス', 'で', 'す', '始', 'め', 'ま', 'し', 'て', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に'])
('ト 始 羨 め む い 前 た っ っ 始 ゲ ア い す な サ い 始 始 て な ゲ た 良 サ ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト ト', ['ト', '始', '羨', 'め', 'む', 'い', '前', 'た', 'っ', 'っ', '始', 'ゲ', 'ア', 'い', 'す', 'な', 'サ', 'い', '始', '始', 'て', 'な', 'ゲ', 'た', '良', 'サ', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト', 'ト'