In [None]:
# import all relevant libraries for preprocessing and gensim 
import os
import random
import re

import numpy as np
import pandas as pd

In [None]:
# Read in the feature table.
# The directory defaults to the original local checkout but can be overridden
# through the KAT_DATA_DIR environment variable, so the notebook is not tied to
# one machine's absolute path.
DATA_DIR = os.environ.get("KAT_DATA_DIR", r"C:\Users\Home\Desktop\Python Scripts\kat-master")
um = pd.read_csv(os.path.join(DATA_DIR, "um_features.csv"))
um.head()

In [None]:
# The 33 Georgian letters used as the character inventory,
# stored as a list with one single-character string per letter.
georgian_chars = list("აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ")

In [None]:
# Character vocabulary (char -> index): the two boundary markers take
# indices 0 and 1, the sorted Georgian letters follow from index 2 onwards.
char2idx = {"<bos>": 0, "<eos>": 1}
char2idx.update(
    (ch, pos) for pos, ch in enumerate(sorted(georgian_chars), start=2)
)

print(char2idx)

In [None]:
# Tag vocabulary, step 1: the 'tag' column holds ';'-delimited strings,
# so break each one into a list of individual tags.
um['tag'] = um['tag'].str.split(pat=';')
um['tag']

In [None]:
def _without_v(tags):
    """Return a copy of ``tags`` with every "V" entry removed."""
    return [t for t in tags if t != "V"]


# drop the "V" tag from every row's tag list
um['tag'] = um['tag'].apply(_without_v)
um['tag']

In [None]:
# Collect every distinct tag that occurs in any row's tag list.
all_tags = list({tag for row_tags in um['tag'] for tag in row_tags})
# Tag vocabulary (tag -> index): indices follow the sorted tag order, starting at 0.
tag2idx = dict(zip(sorted(all_tags), range(len(all_tags))))

tag2idx

In [None]:
# Split every lemma into a list of its individual characters.
# list() is used instead of .str.split(''): an empty separator only works
# because pandas falls back to a regex split, which also yields empty strings
# at both ends that then have to be filtered out. list() produces the same
# character lists directly, and re-running this cell on already-split lemmas
# leaves them unchanged instead of failing.
um['lemma'] = um['lemma'].apply(list)
um['lemma']

In [None]:
def tokenize(row):
    """Build the input token sequence for one row.

    The sequence is "<bos>", the items of ``row['lemma']``, the items of
    ``row['tag']``, and finally "<eos>".

    Args:
        row: mapping-like object (e.g. a DataFrame row) with iterable
            'lemma' and 'tag' entries.

    Returns:
        A new list of tokens.
    """
    return ["<bos>", *row['lemma'], *row['tag'], "<eos>"]

# build the token sequence for every row (row-wise apply)
um['tokens'] = um.apply(tokenize, axis='columns')
um['tokens']

In [None]:
# Build one unified vocabulary for X: boundary markers, then the sorted
# Georgian letters, then the sorted tags.
# The boundary markers must be spelled exactly as tokenize() emits them
# ("<bos>" / "<eos>"); with the bare 'bos' / 'eos' spelling they could never
# be looked up in this dict.
symbols = ['<bos>', '<eos>'] + sorted(georgian_chars) + sorted(all_tags)
token2idx = {sym: i for i, sym in enumerate(symbols)}
# A tag that is spelled like a letter or a marker would silently collapse two
# symbols into one entry and leave a gap in the index range.
assert len(token2idx) == len(symbols), "duplicate symbol in unified vocabulary"
token2idx

In [None]:
# Read in the target table.
# The directory defaults to the original local checkout but can be overridden
# through the KAT_DATA_DIR environment variable, so the notebook is not tied to
# one machine's absolute path.
DATA_DIR = os.environ.get("KAT_DATA_DIR", r"C:\Users\Home\Desktop\Python Scripts\kat-master")
target = pd.read_csv(os.path.join(DATA_DIR, "um_target.csv"))
target.head()

In [None]:
def tokenize_target(row):
    """Build the target token sequence for one row.

    The sequence is "<bos>", the items obtained by iterating over
    ``row['form']`` (the individual characters when it is a string),
    and finally "<eos>".

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an iterable
            'form' entry.

    Returns:
        A new list of tokens.
    """
    return ["<bos>", *row['form'], "<eos>"]
# tokenize every row of the target data (row-wise apply)
target['tokens'] = target.apply(tokenize_target, axis='columns')
target['tokens']

In [None]:
# number of entries in the unified input vocabulary
len(token2idx)

In [None]:
# Model inputs (X) and targets (y): the token sequences of both tables as arrays.
# NOTE(review): this pairs X[i] with y[i] by position, which presumably relies on
# both CSV files listing the same items in the same row order — confirm.
X, y = um['tokens'].values, target['tokens'].values
# sanity check: show the first input and target sequence
print(X[0], y[0], sep="\n")

In [None]:
# Train/test split: hold out 20% of the pairs for testing;
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# import the necessary libraries for the model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Select the compute device: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# implement the dataset class
class CustomDataset(Dataset):
    """Pairs up two parallel, indexable collections as a torch ``Dataset``.

    Item ``idx`` is the tuple ``(torch.tensor(X[idx]), torch.tensor(y[idx]))``.

    NOTE(review): ``torch.tensor`` needs numeric data, so the elements of
    ``X`` and ``y`` presumably have to be index-encoded sequences rather than
    raw token strings — confirm the encoding step happens before this class
    is used.
    """

    def __init__(self, X, y):
        # Keep references to the inputs; nothing is copied or converted here.
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is taken from X only.
        return len(self.X)

    def __getitem__(self, idx):
        # Convert lazily, one example at a time.
        src_tensor = torch.tensor(self.X[idx])
        trg_tensor = torch.tensor(self.y[idx])
        return src_tensor, trg_tensor

In [None]:
# Wrap both splits in datasets, then batch them 32 examples at a time;
# only the training loader shuffles.
# NOTE(review): no collate_fn is passed, so the default batching is used, which
# presumably requires all sequences in a batch to have the same length —
# confirm that padding is handled somewhere.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# implement encoder
class Encoder(nn.Module):
    """Embeds a batch of index sequences and encodes them with an LSTM.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        emb_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden and cell state.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # batch_first=True: Seq2Seq treats axis 0 as the batch and axis 1 as
        # time (it reads trg.shape[0] as batch size and slices trg[:, t]).
        # With the time-major default the LSTM would read the batch axis as
        # time and return states sized by sequence length instead of batch
        # size, which the decoder cannot consume.
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        """Encode ``x`` and return the final LSTM state.

        Args:
            x: integer index tensor laid out as (batch, seq_len).

        Returns:
            Tuple ``(hidden, cell)``, each of shape (1, batch, hidden_dim).
        """
        embedded = self.embedding(x)
        # The per-step outputs are not needed; only the final state is returned.
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [None]:
# implement decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder: embedding -> LSTM -> linear projection.

    Args:
        output_dim: number of rows of the embedding table and size of the
            prediction vector produced per example.
        emb_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden and cell state.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim)
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one step.

        Args:
            x: index tensor holding the current input token of each example.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            Tuple ``(prediction, hidden, cell)``: the projected scores for
            this step and the updated LSTM state.
        """
        # Add a leading length-1 axis so the LSTM sees a one-step sequence.
        step_input = self.embedding(x).unsqueeze(0)
        step_output, next_state = self.rnn(step_input, (hidden, cell))
        next_hidden, next_cell = next_state
        # Remove the length-1 axis again before projecting.
        scores = self.fc_out(step_output.squeeze(0))
        return scores, next_hidden, next_cell

In [None]:
# implement seq2seq model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one position at a time.

    Args:
        encoder: module whose call returns the initial ``(hidden, cell)``
            state for the decoder.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(prediction, hidden, cell)``; it must expose ``fc_out`` with
            an ``out_features`` attribute.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the encoder once, then decode step by step.

        Args:
            src: source batch passed unchanged to the encoder.
            trg: target index tensor laid out as (batch, trg_len); column 0
                provides the first decoder input.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction
                as the next decoder input.

        Returns:
            Tensor of shape (batch, trg_len, output_dim) holding the decoder
            predictions; position 0 is never written and stays all zeros.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        output_dim = self.decoder.fc_out.out_features

        # Allocate the buffer directly on the target device.
        outputs = torch.zeros(batch_size, trg_len, output_dim, device=self.device)

        hidden, cell = self.encoder(src)

        # The first decoder input is the first target column.
        x = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1)
            # `random` is the stdlib module; it has to be imported at the top
            # of the notebook, otherwise this line raises NameError.
            use_teacher = random.random() < teacher_forcing_ratio
            x = trg[:, t] if use_teacher else top1

        return outputs