In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

!pip3 install unidecode
from unidecode import unidecode
from string import ascii_letters



In [2]:
!curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  5130k      0 --:--:-- --:--:-- --:--:-- 5126k
Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [11]:
data_dir = 'data/names'

langMap = {f.split('.')[0]: torch.tensor(i) for i, f in enumerate(os.listdir(data_dir))}
totalLangs = len(langMap)
langMap

{'Czech': tensor(0),
 'German': tensor(1),
 'Arabic': tensor(2),
 'Japanese': tensor(3),
 'Chinese': tensor(4),
 'Vietnamese': tensor(5),
 'Russian': tensor(6),
 'French': tensor(7),
 'Irish': tensor(8),
 'English': tensor(9),
 'Spanish': tensor(10),
 'Greek': tensor(11),
 'Italian': tensor(12),
 'Portuguese': tensor(13),
 'Scottish': tensor(14),
 'Dutch': tensor(15),
 'Korean': tensor(16),
 'Polish': tensor(17)}

In [12]:
unidecode('Ślusàrski')

'Slusarski'

In [13]:
stoi = {s: i for i, s in enumerate(ascii_letters + " .,;!?:'\"-")}
vocabSize = len(stoi)

In [14]:
# seq_len, vocab_size
def wtot(w):
  t = torch.zeros(len(w), vocabSize)
  for i, c in enumerate(w):
    t[i, stoi[c]] = 1
  return t

In [15]:
wtot('a-')

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 1.]])

In [16]:
tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.strip()) for line in f]
        print(names)
        for name in names:
            try:
                tensor_names.append(wtot(name))
                target_langs.append(langMap[lang])
            except KeyError:
                pass


['Abl', 'Adsit', 'Ajdrna', 'Alt', 'Antonowitsch', 'Antonowitz', 'Bacon', 'Ballalatak', 'Ballaltick', 'Bartonova', 'Bastl', 'Baroch', 'Benesch', 'Betlach', 'Biganska', 'Bilek', 'Blahut', 'Blazek', 'Blazek', 'Blazejovsky', 'Blecha', 'Bleskan', 'Blober', 'Bock', 'Bohac', 'Bohunovsky', 'Bolcar', 'Borovka', 'Borovski', 'Borowski', 'Borovsky', 'Brabbery', 'Brezovjak', 'Brousil', 'Bruckner', 'Buchta', 'Cablikova', 'Camfrlova', 'Cap', 'Cerda', 'Cermak', 'Chermak', 'Cermak', 'Cernochova', 'Cernohous', 'Cerny', 'Cerney', 'Cerny', 'Cerv', 'Cervenka', 'Chalupka', 'Charlott', 'Chemlik', 'Chicken', 'Chilar', 'Chromy', 'Cihak', 'Clineburg', 'Klineberg', 'Cober', 'Colling', 'Cvacek', 'Czabal', 'Damell', 'Demall', 'Dehmel', 'Dana', 'Dejmal', 'Dempko', 'Demko', 'Dinko', 'Divoky', 'Dolejsi', 'Dolezal', 'Doljs', 'Dopita', 'Drassal', 'Driml', 'Duyava', 'Dvorak', 'Dziadik', 'Egr', 'Entler', 'Faltysek', 'Faltejsek', 'Fencl', 'Fenyo', 'Fillipova', 'Finfera', 'Finferovy', 'Finke', 'Fojtikova', 'Fremut', 'Fried

In [17]:
class MyDataset(Dataset):
  def __init__(self, tensor_names, target_langs):
    self.names = tensor_names
    self.langs = target_langs

  def __len__(self):
    return len(self.names)
  
  def __getitem__(self, index):
    return self.names[index], self.langs[index]

In [18]:
# Create dataloader with traain test split
dataset = MyDataset(tensor_names, target_langs)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [19]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")


Train: 16056
Test: 4014


In [20]:
class SimpleRNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.get_hidden = nn.Sequential(
      nn.Linear(input_size+hidden_size, hidden_size),
      nn.ReLU(),
    )
    self.get_output = nn.Sequential(
      nn.Linear(input_size+hidden_size, output_size),
    )
  
  def forward(self, x, hidden):
    data_plus_hidden = torch.cat((x, hidden), 1)
    hid = self.get_hidden(data_plus_hidden)
    out = self.get_output(data_plus_hidden)
    return out, hid



In [21]:
hidden_size = 256
learning_rate = 0.001

model = SimpleRNN(vocabSize, hidden_size, totalLangs)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [44]:
EPOCHS = 10
BATCH_SIZE = 1

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS):
  for i, (name, y) in enumerate(train_loader):
    hidden = torch.zeros(hidden_size)
    for x in name[0]:
      output, hidden = model(x.view(1, -1), hidden.view(1, -1))

    loss = loss_fn(output, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i % 100 == 0):
      print(f'Epoch: {epoch}, Step: {i/len(train_loader)}, Loss: {loss.item()}')
  

Epoch: 0, Step: 0.0, Loss: 2.8641743659973145
Epoch: 0, Step: 0.00622820129546587, Loss: 1.378955602645874
Epoch: 0, Step: 0.01245640259093174, Loss: 7.898468017578125
Epoch: 0, Step: 0.01868460388639761, Loss: 2.039006233215332
Epoch: 0, Step: 0.02491280518186348, Loss: 1.6596019268035889
Epoch: 0, Step: 0.03114100647732935, Loss: 0.45666739344596863
Epoch: 0, Step: 0.03736920777279522, Loss: 2.012503147125244
Epoch: 0, Step: 0.04359740906826109, Loss: 1.86020827293396
Epoch: 0, Step: 0.04982561036372696, Loss: 10.93112564086914
Epoch: 0, Step: 0.05605381165919283, Loss: 0.12147118151187897
Epoch: 0, Step: 0.0622820129546587, Loss: 1.6468024253845215
Epoch: 0, Step: 0.06851021425012456, Loss: 1.7105584144592285
Epoch: 0, Step: 0.07473841554559044, Loss: 0.7593142986297607
Epoch: 0, Step: 0.0809666168410563, Loss: 2.260262966156006
Epoch: 0, Step: 0.08719481813652218, Loss: 4.70727014541626
Epoch: 0, Step: 0.09342301943198804, Loss: 2.80104923248291
Epoch: 0, Step: 0.09965122072745392,

In [45]:
correct = 0
for (name, y) in test_loader:
  hidden = torch.zeros(hidden_size)
  for x in name[0]:
     output, hidden = model(x.view(1, -1), hidden.view(1, -1))
  if output.argmax() == y:
     correct += 1

In [47]:
correct/len(test_loader)

0.8014449427005481

In [48]:
torch.save(model.state_dict(), 'model.pth')

In [22]:
mymodel = SimpleRNN(vocabSize, hidden_size, totalLangs)
mymodel.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [23]:
def predict(name):
  hidden = torch.zeros(hidden_size)
  for x in wtot(name):
    output, hidden = mymodel(x.view(1, -1), hidden.view(1, -1))
  return list(langMap.keys())[output.argmax()]

In [46]:
predict('tim')

'English'