# Классификация языков

In [3]:
with open("/content/exam/dutch.txt", 'r') as infile:
  dutch_text = infile.read()

with open("/content/exam/hungarian.txt", 'r') as infile:
  hungarian_text = infile.read()

with open("/content/exam/portugese.txt", 'r') as infile:
  portugese_text = infile.read()

In [8]:
from collections import Counter
class Vocabulary:
  def __init__(self, lists):
    self.UNKNOWN = "UNK"

    tokens = []
    for sub_list in lists:
      tokens.extend(sub_list)
      counter = [ item[0] for item in Counter(tokens).most_common(1000)]
      self.index2word = ["UNK"] + counter
      self.word2index = {item : index for index, item in enumerate(self.index2word)}

In [10]:
def split_data(dutch_text, hungarian_text, portugese_text):
  dutch_text = dutch_text.split('\n')
  hungarian_text = hungarian_text.split('\n')
  portugese_text = portugese_text.split('\n')

  train_text = {"dutch" : dutch_text[:int(len(dutch_text) * 0.8)], "hungarian" : hungarian_text[:int(len(hungarian_text) * 0.8)], "portugese" : portugese_text[:int(len(portugese_text) * 0.8)]}
  test_text = {"dutch" : dutch_text[int(len(dutch_text) * 0.8) :], "hungarian" : hungarian_text[int(len(hungarian_text) * 0.8) :], "portugese" : portugese_text[int(len(portugese_text) * 0.8) :]}

  vocab = Vocabulary(train_text.values())

  train_tokens = dict()
  test_tokens = dict()
  for language in train_text.keys():
    train_tokens[language] = [vocab.word2index.get(item, 0) for item in train_text[language]]
    test_tokens[language] = [vocab.word2index.get(item, 0) for item in test_text[language]]

  return train_tokens, test_tokens, vocab

In [11]:
train_tokens, test_tokens, vocab = split_data(dutch_text, hungarian_text, portugese_text)

In [23]:
import torch
import torch.nn as nn
class Net(nn.Module):
  def __init__(self, embedding_input: int):
    super().__init__()
    self.embadding = nn.Embedding(embedding_input, 64)
    self.linear_1 = nn.Linear(64, 16)
    self.linear_2 = nn.Linear(16, 3)

  def forward(self, token):
    relu = nn.ReLU()
    output = self.embadding(token)
    output = relu(output)
    output = self.linear_1(output)
    output = relu(output)
    output = self.linear_2(output)
    output = nn.Sigmoid()(output)
    return output

In [17]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
  def __init__(self, tokens_by_languages):
    super().__init__()
    self.target_codes = {
        "dutch" : 0,
        "hungarian": 1,
        "portugese": 2
    }
    self.data = []
    self.target = []

    for lang in tokens_by_languages.keys():
      for token in  tokens_by_languages[lang]:
        self.data.append(token)
        self.target.append(self.target_codes[lang])

  def __getitem__(self, index):
    return torch.tensor(self.data[index]), torch.tensor(self.target[index])

  def __len__(self):
    return len(self.target)

In [18]:
train_dataset = MyDataset(train_tokens)
test_dataset = MyDataset(test_tokens)
from torch.utils.data import Dataset, DataLoader
train_loader = DataLoader(train_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [24]:
import numpy as np
EPOCHS = 5
model = Net(len(vocab.index2word))
optim = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
for e in range(EPOCHS):
  train_loss = []
  test_loss = []
  for batch in train_loader:
    optim.zero_grad()
    X, Y = batch
    output = model(X)
    loss = criterion(output, Y)
    loss.backward()
    optim.step()
    train_loss.append(loss.item())
  for batch in test_loader:
    X, Y = batch
    output = model(X)
    loss = criterion(output, Y)
    test_loss.append(loss.item())

  print("Epoch", e, "Train", np.mean(train_loss), "Test", np.mean(test_loss))

Epoch 0 Train 0.8910402947739923 Test 1.1970766858315804
Epoch 1 Train 1.0681929307108369 Test 1.1975187813731987
Epoch 2 Train 0.9236955717941602 Test 1.1974039069363769
Epoch 3 Train 1.305106542006198 Test 1.2257074576028635
Epoch 4 Train 1.2784567605204913 Test 1.2151526239556325


In [None]:
входной вектор длины (макс. количество символов)
1) Embedding - превращаем код символа в вектор - выдает тензор размера (макс. количество символов X эмбеддинг одного символа)
2) BI LSTM - распознаем зависимости между символами - выдаёт тензор (макс. количество символов X 2*эмбеддинг одного символа)
3) Pooling или сумма вдоль оси - сжимаем матрицу в вектор - выдаём тензор длины (макс. количество символов)
4) Linear - превращаем вектор в распределение вероятностей - выдаёт тензор длины 3
5) Sigmoid - ограничиваем числа от 0 до 1