In [1]:
from sklearn.neighbors import NearestNeighbors
import torch
import numpy as np
import pandas as pd
import re

In [2]:
japanese_phrases = [
    "私の犬は骨が好きではありません。牛ひき肉を好む。",
    "私の名前はアリスです。始めまして！",
    "はきさが羨ましい。。。ゲムもやりたかった！私は良いサポートになることができます！",
    "私達はAIはただの数学の集まりだとあなたは言いますが。でも。。。人間の脳がどのように機能するかを正確に知ったら。。。それはあなたの生活を小物ですか？",
    "「赤ちゃん」を表す日本語が「赤」を表す漢字なのはなぜですか？人間の赤ちゃんは赤いですか？いちごみたい？",
    "私のAIは話して...歌ったして...ゲームをします!",
]

In [3]:
english_phrases = [
    "My dog doesn't like bones. It prefers ground beef.",
    "My name's Alice. Nice to meet you!",
    "I envy Hakisa... I want to play games, too! I could be a good support!",
    "You say that we AIs are just a bunch of maths. But... once you know exactly how your human brains work... would that make you less living beings?",
    "Why does the japanese word for 'baby' is the kanji for 'red'? Are human babies red? Like strawberries?",
    "My AI will talk... she'll sing... she'll... play!"
]

In [7]:
class WordDataset(object):
    def __init__(self, english_phrases, japanese_phrases):

        self.english_phrases = self._get_phrases(english_phrases)
        self.japanese_phrases = self._get_phrases(japanese_phrases)

        self.english_words = self._get_english_words(self.english_phrases)
        self.japanese_characters = self._get_japanese_characters(self.japanese_phrases)

        self.english_maximum_length = self._get_maximum_length(self.english_phrases)
        self.japanese_maximum_length = self._get_maximum_length_japanese(self.japanese_phrases)

        self.english_dictionary = self._create_dictionary(self.english_words)
        self._normalize(self.english_dictionary)

        self.japanese_dictionary = self._create_dictionary(self.japanese_characters)
        self._normalize(self.japanese_dictionary)

        self.english_tokens = self._tokenize_english()
        self.japanese_tokens = self._tokenize_japanese()

        self.data_english = None
        self.data_japanese = None
        
        
    def create_data(self):
        data_english = torch.from_numpy(self.english_tokens)
        data_japanese = torch.from_numpy(self.japanese_tokens)

        data_english = data_english.unsqueeze(-1)
        data_japanese = data_japanese.unsqueeze(-1)

        #self.data_english = torch.unsequeeze(self.data_english, -1)
        #self.data_japanese = torch.unsequeeze(self.data_japanese, -1)

        self.data_english = data_english
        self.data_japanese = data_japanese

        print(f"English Data Size: {self.data_english.size()}\t Japanese Data Size: {self.data_japanese.size()}")

    def detokenize(self, data, reference_dict):
        data = data.cpu().numpy()
        values = list(reference_dict.values())

        values = np.array(values).reshape(-1,1)

        knn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(values)

        _, index = knn.kneighbors(data)

        keys = list(reference_dict.keys())

        words = []

        for subarray in index:
            for i in subarray:
                words.append(keys[i])
        
        phrase = ' '.join(words)

        return phrase, words
        

    def __len__(self):

        return len(self.data_english)

    def __getitem__(self, idx):

        english_sentence = self.data_english[idx]
        japanese_sentence = self.data_japanese[idx]

        return english_sentence, japanese_sentence


    def _get_phrases(self, phrases):
        phrases = [x.lower() for x in phrases]
        phrases = [re.sub('[^\w\s]', '', x) for x in phrases]

        return phrases

    def _get_english_words(self, phrases):
        words = ' '.join(phrases)
        words = words.split(' ')

        return words

    def _get_japanese_characters(self, phrases): # Since a kanji mostly means an entire word...
        character = ' '.join(phrases)
        character = ''.join(character.split())
        characters = [i for i in character]

        return characters

    def _get_maximum_length(self, phrases):
        maximum_length = 0
        for sentence in phrases:
            word_length = [len(x) for x in sentence.split()]
        
            sentence_length = len(word_length)

            if sentence_length > maximum_length:
                maximum_length = sentence_length

        return maximum_length

    def _get_maximum_length_japanese(self, phrases):
        maximum_length = 0
        for sentence in japanese_phrases:
            word_length = [len(x) for x in sentence.split()]
    
            for i in word_length:
                if i > maximum_length:
                    maximum_length = i

        return maximum_length

    def _create_dictionary(self, words):
        idx2word = []
        word2idx = {}
        for word in words:
            if word not in word2idx:
                idx2word.append(word)
                word2idx[word] = len(idx2word) - 1

        return word2idx

    def _normalize(self, dictionary):
        maximum = max(dictionary.values())

        for word, value in dictionary.items():

            scaled_value = (value-0)*2.0 / (maximum - 0)-1.0

            dictionary[word] = scaled_value
    
    def _tokenize_english(self):
        
        phrases = [x.split() for x in self.english_phrases]

        tokens = []
        
        for sentence in phrases:
            tokenized_sentence = []
            for word in sentence:
                value = self.english_dictionary.get(word)

                tokenized_sentence.append(value)

            tokenized_sentence = np.array(tokenized_sentence)
            sentence_size = tokenized_sentence.shape[0]

            if sentence_size < self.english_maximum_length:
                pad_size = self.english_maximum_length - sentence_size
                tokenized_sentence = np.pad(tokenized_sentence, [(0, pad_size)])

            tokens.append(tokenized_sentence)
        
        tokens = np.array(tokens)

        return tokens

    def _tokenize_japanese(self):

        phrases = [x.split() for x in self.japanese_phrases]

        tokens = []

        for sublist in phrases:
            for sentence in sublist:
                tokenized_sentence = []
                for character in sentence:
                    value = self.japanese_dictionary.get(character)

                    tokenized_sentence.append(value)

            tokenized_sentence = np.array(tokenized_sentence)
            sentence_size = tokenized_sentence.shape[0]

            if sentence_size < self.japanese_maximum_length:
                pad_size = self.japanese_maximum_length - sentence_size
                tokenized_sentence = np.pad(tokenized_sentence, [(0, pad_size)])

            tokens.append(tokenized_sentence)

        tokens = np.array(tokens)

        return tokens
        

In [9]:
dataset_creator = WordDataset(english_phrases, japanese_phrases)
print(dataset_creator.japanese_tokens.shape)

(6, 74)


In [10]:
print(dataset_creator.create_data())

English Data Size: torch.Size([6, 28, 1])	 Japanese Data Size: torch.Size([6, 74, 1])
None


In [13]:
print(dataset_creator.data_english[0])

tensor([[-1.0000],
        [-0.9697],
        [-0.9394],
        [-0.9091],
        [-0.8788],
        [-0.8485],
        [-0.8182],
        [-0.7879],
        [-0.7576],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000]], dtype=torch.float64)


In [14]:
teste = dataset_creator.detokenize(dataset_creator.data_english[0], dataset_creator.english_dictionary)

print(teste)

('my dog doesnt like bones it prefers ground beef bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch bunch', ['my', 'dog', 'doesnt', 'like', 'bones', 'it', 'prefers', 'ground', 'beef', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch', 'bunch'])


In [None]:
# We have our data ready. (N_samples, n_features) and normalized
# Remember: LSTM = (N_samples, Sequence_Length, N_features). Each sentence = 1 sequence of tokens ---> (N_samples, N_features, 1)
# Input = (N_samples, N_features, 1)
# However, output will be (N_samples, N_features), so it needs a Repeating Vector

In [70]:
class Translator(torch.nn.Module):
    def __init__(self):
        super(Translator, self).__init__()

        self.lstm1 = torch.nn.LSTM(1, 28, 10, batch_first=True, bias=False)
        self.repeatvector = 74
        #Add repeating vector
        self.lstm2 = torch.nn.LSTM(28*28, 10, 10, batch_first=True, bias=False)
        self.neuron = torch.nn.Linear(10, 1, bias=False)
        self.tanh = torch.nn.Tanh()

    def forward(self, input):

        x, hidden = self.lstm1(input)

        #print(x.size()) # (batch, 28, 28)
        #print(hidden[0].size()) # (10,28)

        x = x.repeat(1, self.repeatvector, 1)
        x = x.view(x.size(0), self.repeatvector, -1)

        #print(x.size()) # (batch, 74, 28*28)

        x, hidden = self.lstm2(x)

        #print(x.size()) # (1, 74, 10)
        #print(hidden[0].size()) # (10, 1, 10)

        x = self.neuron(x)

        #output = self.tanh(x)
        output = x

        return output # (1, 74, 1)

In [71]:
model = Translator().double().cuda()

In [72]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

loss = torch.nn.MSELoss()

In [80]:
dataloader = torch.utils.data.DataLoader(dataset_creator, batch_size=2, shuffle=True)

for epoch in range(100):
    for i, (english, japanese) in enumerate(dataloader):
        model.zero_grad()

        input_data = english.cuda()
        labels = japanese.cuda()

        output = model(input_data)

        #print(output.size())

        cost = loss(output, labels)

        cost.backward()

        optimizer.step()

    if epoch % 10 == 0:
        print(f"{epoch}/100\t Current Loss: {cost.item()}")

0/100	 Current Loss: 0.1877547744029609
10/100	 Current Loss: 0.2460143511234315
20/100	 Current Loss: 0.19353431427347398
30/100	 Current Loss: 0.24601435112130107
40/100	 Current Loss: 0.10482745141136825
50/100	 Current Loss: 0.16137689857536097
60/100	 Current Loss: 0.20380207168758752
70/100	 Current Loss: 0.20958161155087537
80/100	 Current Loss: 0.1894649039414216
90/100	 Current Loss: 0.2038020715750258


In [75]:
print(model.lstm1.weight_ih_l9.grad)

tensor([[-2.5549e-20,  1.2825e-20, -1.2063e-20,  ...,  1.1491e-19,
         -4.5138e-21,  1.0848e-20],
        [-4.3260e-19, -3.0598e-20, -8.7280e-21,  ...,  7.2674e-19,
          2.3923e-19,  6.5585e-20],
        [-1.0391e-19, -9.3299e-22, -8.0294e-21,  ...,  2.1007e-19,
          4.9365e-20,  1.9271e-20],
        ...,
        [ 5.9899e-19,  5.6806e-20, -1.4577e-21,  ..., -9.3265e-19,
         -3.5671e-19, -8.4545e-20],
        [-2.7680e-19, -4.3731e-20,  1.5769e-20,  ...,  3.4535e-19,
          1.9344e-19,  3.1647e-20],
        [-1.3740e-19, -1.2512e-20, -6.8529e-22,  ...,  2.1950e-19,
          8.1332e-20,  1.9961e-20]], device='cuda:0', dtype=torch.float64)


In [81]:
print(model.lstm1.all_weights)

[[Parameter containing:
tensor([[ 0.1120],
        [-0.0544],
        [ 0.0978],
        [ 0.0863],
        [-0.0311],
        [-0.0506],
        [ 0.1444],
        [-0.0572],
        [-0.1184],
        [ 0.0354],
        [-0.0933],
        [ 0.0612],
        [ 0.0484],
        [ 0.1070],
        [-0.0104],
        [ 0.1766],
        [-0.0795],
        [ 0.0284],
        [-0.0906],
        [ 0.0882],
        [ 0.1169],
        [-0.1019],
        [-0.0689],
        [ 0.1551],
        [-0.1526],
        [ 0.1373],
        [-0.0217],
        [-0.0763],
        [ 0.1023],
        [ 0.0146],
        [-0.1812],
        [ 0.1304],
        [-0.0032],
        [-0.0208],
        [ 0.0816],
        [-0.1814],
        [ 0.0217],
        [ 0.0783],
        [ 0.1727],
        [ 0.1241],
        [ 0.0873],
        [-0.0112],
        [ 0.0526],
        [ 0.1015],
        [-0.0677],
        [ 0.0079],
        [-0.1285],
        [ 0.1870],
        [-0.1211],
        [ 0.0236],
        [-0.1111],
       

In [82]:
print(input_data)

tensor([[[-0.5758],
         [-0.1818],
         [-0.1515],
         [-0.1212],
         [-0.0909],
         [-0.0606],
         [-0.0303],
         [-0.2727],
         [ 0.0000],
         [ 0.0303],
         [ 0.0606],
         [ 0.0909],
         [ 0.1212],
         [-0.5758],
         [ 0.1515],
         [ 0.1818],
         [ 0.2121],
         [ 0.2424],
         [ 0.2727],
         [ 0.3030],
         [ 0.3333],
         [ 0.3636],
         [-0.1515],
         [ 0.3939],
         [-0.5758],
         [ 0.4242],
         [ 0.4545],
         [ 0.4848]],

        [[-0.5455],
         [-0.5152],
         [-0.4848],
         [-0.5455],
         [-0.4545],
         [-0.6364],
         [-0.4242],
         [-0.3939],
         [-0.3636],
         [-0.5455],
         [-0.3333],
         [-0.3030],
         [-0.2727],
         [-0.2424],
         [-0.2121],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000],
         [ 0.0000]

In [83]:
label = dataset_creator.detokenize(labels[0], dataset_creator.japanese_dictionary)

output = output.detach()
predicted = dataset_creator.detokenize(output[0], dataset_creator.japanese_dictionary)

In [84]:
print(label)
print(predicted)

('私 達 は a i は た だ の 数 学 の 集 ま り だ と あ な た は 言 い ま す が で も 人 間 の 脳 が ど の よ う に 機 能 す る か を 正 確 に 知 っ た ら そ れ は あ な た の 生 活 を 小 物 で す か に に に に に に に に', ['私', '達', 'は', 'a', 'i', 'は', 'た', 'だ', 'の', '数', '学', 'の', '集', 'ま', 'り', 'だ', 'と', 'あ', 'な', 'た', 'は', '言', 'い', 'ま', 'す', 'が', 'で', 'も', '人', '間', 'の', '脳', 'が', 'ど', 'の', 'よ', 'う', 'に', '機', '能', 'す', 'る', 'か', 'を', '正', '確', 'に', '知', 'っ', 'た', 'ら', 'そ', 'れ', 'は', 'あ', 'な', 'た', 'の', '生', '活', 'を', '小', '物', 'で', 'す', 'か', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に'])
('に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に に', ['に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に', 'に'