In [30]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset


In [2]:
# Display the installed PyTorch version string (the cell's last expression is its output)
torch.__version__

'2.5.1'

In [3]:
# Check whether PyTorch reports a usable CUDA device (evaluates to a bool)
torch.cuda.is_available()

True

In [24]:
# load the dataset
# The data directory is named once so relocating the project means editing a single line;
# the individual files are joined onto it instead of repeating the absolute path.
DATA_DIR = Path(r'C:\Users\Home\Desktop\Python Scripts\kat-master')
um_X = pd.read_csv(DATA_DIR / 'um_features.csv')  # feature table (has 'lemma' and 'tag' columns)
um_y = pd.read_csv(DATA_DIR / 'um_target.csv')    # target table

In [25]:
# Split the 'tag' column into 4 separate columns
# The split is positional: at most 3 splits on ';', so everything after the third ';'
# stays together in the last column.
# NOTE(review): the one-hot column names further down (e.g. a 'Mood' value that looks like
# an ARG tag, 'Tense' values that still contain ';', and a missing 'Tense') suggest tags do
# not all have the same number of parts, so positional columns may be misaligned - confirm.
TAG_COLUMNS = ['POS', 'ARG', 'Mood', 'Tense']
if 'tag' in um_X.columns:  # guard: re-running the cell after 'tag' is gone is a no-op
    tag_parts = um_X['tag'].str.split(';', n=3, expand=True)
    # Rebind instead of dropping in place so the frame loaded earlier is not mutated.
    um_X = um_X.drop(columns=['tag'])
    um_X[TAG_COLUMNS] = tag_parts

In [26]:
# Preview the first rows to check the columns produced by the tag split
um_X.head()

Unnamed: 0,lemma,POS,ARG,Mood,Tense
0,შეუძლია,V,ARGNO1S,IND,PRS
1,შეუძლია,V,ARGNO2S,IND,PRS
2,შეუძლია,V,ARGNO3S,IND,PRS
3,შეუძლია,V,ARGNO1P,IND,PRS
4,შეუძლია,V,ARGNO2P,IND,PRS


In [29]:
# number of unique values for each column
# Each pair is (label to print, column to count); the loop prints one line per pair.
for label, column in (
    ('unique values in POS:', 'POS'),
    ('unique values in ARG:', 'ARG'),
    ('unique values in Mood:', 'Mood'),
    ('unique values in Tense:', 'Tense'),
):
    print(label, um_X[column].nunique())

unique values in POS: 1
unique values in ARG: 7
unique values in Mood: 13
unique values in Tense: 18


In [41]:
# One-hot encode the categorical tag columns into a dense array, then wrap the result in a
# DataFrame whose column names come from the fitted encoder.
FEATURE_COLUMNS = ["ARG", "Mood", "Tense"]
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(um_X[FEATURE_COLUMNS])
X = encoder.transform(um_X[FEATURE_COLUMNS])

one_hot_names = encoder.get_feature_names_out(FEATURE_COLUMNS)
X_df = pd.DataFrame(data=X, columns=one_hot_names)

In [42]:
# Preview the first rows of the one-hot encoded feature frame
X_df.head()

Unnamed: 0,ARG_ARGNO1P,ARG_ARGNO1S,ARG_ARGNO2P,ARG_ARGNO2S,ARG_ARGNO3P,ARG_ARGNO3S,ARG_V.MSDR,Mood_ARGAC1P,Mood_ARGAC1S,Mood_ARGAC2P,...,Tense_IPFV,Tense_OPT,Tense_PRF,Tense_PRS,Tense_PST;PFV,Tense_PST;PRF,Tense_SBJV;FUT,Tense_SBJV;PRF,Tense_SBJV;PRS,Tense_None
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# set up model variables
X = X_df
y = um_y  # target variable
# Split the data into training and testing sets (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# embed y values using the pretrained char2vec model
from gensim.models import KeyedVectors

# Load the pretrained char2vec model
CHAR2VEC_PATH = r'C:\Users\Home\Desktop\Python Scripts\kat-master\char2vec.bin'
char2vec_model = KeyedVectors.load_word2vec_format(CHAR2VEC_PATH, binary=True)


def embed_strings(strings, model):
    """Embed each string as the mean of its characters' vectors.

    Args:
        strings: iterable of strings to embed.
        model: keyed-vector model supporting ``model[char]``, ``model.key_to_index``
            and ``model.vector_size``.

    Returns:
        DataFrame with one row per input string and ``model.vector_size`` columns
        named ``char2vec_0``, ``char2vec_1``, ... Characters absent from the model
        are skipped; a string with no known characters becomes an all-zero row.
    """
    dim = model.vector_size  # read the width from the model instead of assuming 100
    rows = []
    for string in strings:
        vectors = [model[char] for char in string if char in model.key_to_index]
        if vectors:
            rows.append(sum(vectors) / len(vectors))
        else:
            rows.append([0.0] * dim)  # Use a zero vector if no characters are found
    return pd.DataFrame(rows, columns=[f'char2vec_{i}' for i in range(dim)])


# Convert the target variable to lists of strings and embed BOTH splits, so the train and
# test targets live in the same numeric space.
y_train_list = y_train['target'].tolist()
y_test_list = y_test['target'].tolist()
y_train_vectors_df = embed_strings(y_train_list, char2vec_model)
y_test_vectors_df = embed_strings(y_test_list, char2vec_model)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
# The targets are built from the embedded vectors: the raw target frames hold strings,
# which cannot be converted to a float32 tensor.
y_train_tensor = torch.tensor(y_train_vectors_df.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_vectors_df.values, dtype=torch.float32)
# Create a custom dataset class
class CustomDataset(Dataset):
    """Pairs a feature container with a label container, indexed in lockstep."""

    def __init__(self, features, labels):
        """Store the two containers as given; no copying or validation is done."""
        self.features, self.labels = features, labels

    def __len__(self):
        """Return the number of samples, taken from the feature container only."""
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features[idx], labels[idx])`` pair."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target
    
# Create DataLoader objects for training and testing data
# Both loaders use batches of 32; only the training loader shuffles.
train_dataset = CustomDataset(features=X_train_tensor, labels=y_train_tensor)
test_dataset = CustomDataset(features=X_test_tensor, labels=y_test_tensor)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Home\\Desktop\\Python Scripts\\kat-master\\char2vec.bin'

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm

# Hyperparameters
# Number of sequences per batch (passed to the DataLoader below).
BATCH_SIZE = 128
# Width of the character embedding (first layer of CharRNN).
EMBED_DIM = 100
# Hidden size of the LSTM.
HIDDEN_DIM = 128
# Number of stacked LSTM layers.
NUM_LAYERS = 2
# Number of full passes over the training data.
EPOCHS = 10
# Sequences are space-padded to this many characters by CharDataset.
MAX_LENGTH = 40
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare your dataset
class CharDataset(Dataset):
    """Next-character prediction dataset over fixed-length character sequences.

    Every item is an ``(input, target)`` pair of ``torch.long`` index tensors, each of
    length ``max_length - 1``; ``target`` is ``input`` shifted left by one character.

    Args:
        sequences: indexable collection of strings.
        char2idx: mapping from character to integer index. It must contain every
            character that occurs in ``sequences`` and, because short texts are
            padded with spaces, the space character as well (``KeyError`` otherwise).
        max_length: number of characters each text is truncated or padded to.
    """

    def __init__(self, sequences, char2idx, max_length):
        self.sequences = sequences
        self.char2idx = char2idx
        self.max_length = max_length

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the index-encoded ``(input, target)`` pair for sequence ``idx``."""
        text = self.sequences[idx]
        # Truncate before padding: ljust() never shortens, so an over-long text would
        # otherwise give a longer tensor than its neighbours and break batch stacking.
        fixed = text[:self.max_length].ljust(self.max_length)
        indices = [self.char2idx[c] for c in fixed]
        # Explicit dtype keeps the tensors integer even when the slices are empty.
        input_seq = torch.tensor(indices[:-1], dtype=torch.long)
        target_seq = torch.tensor(indices[1:], dtype=torch.long)
        return input_seq, target_seq

# Model Definition
class CharRNN(nn.Module):
    """Character-level model: embedding, then a stacked LSTM, then a linear layer
    that maps every time step back to vocabulary-sized scores."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: the LSTM takes and returns (batch, seq, feature) tensors.
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(scores, state)`` where ``state`` is the LSTM state after ``x``.

        ``hidden`` is forwarded to the LSTM unchanged; ``None`` lets the LSTM start
        from its default initial state.
        """
        token_vectors = self.embedding(x)
        lstm_out, next_state = self.rnn(token_vectors, hidden)
        scores = self.fc(lstm_out)
        return scores, next_state

# Read and prepare the data
# Each training string is "<lemma> <form>".
# NOTE(review): this needs X_train to carry a raw 'lemma' column and y_train a 'form'
# column. The split earlier in this notebook is built from the one-hot feature frame and
# reads the target from a 'target' column - confirm which frames are meant here.
train_data = X_train['lemma'] + " " + y_train['form']
sequences = list(train_data)
if not sequences:
    # Fail early with a clear message instead of a division by zero after the first epoch.
    raise ValueError("No training sequences: X_train / y_train are empty")

# Build character vocabulary
# The " " separator above guarantees the space used for padding by CharDataset is in the vocabulary.
chars = sorted(set("".join(sequences)))
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}

# Dataset and DataLoader
dataset = CharDataset(sequences, char2idx, MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Model, loss, optimizer
model = CharRNN(len(char2idx), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# TensorBoard
writer = SummaryWriter()

# Training loop
# try/finally makes sure the TensorBoard writer is closed even when training is interrupted
# or raises, so buffered events are not lost.
try:
    step = 0
    num_batches = len(dataloader)
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for inputs, targets in tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs, _ = model(inputs)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) so every position is one sample.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()  # read once; each .item() call copies from the device
            total_loss += batch_loss
            writer.add_scalar('Loss/train', batch_loss, step)
            step += 1

        epoch_avg = total_loss / num_batches
        print(f"Epoch {epoch+1} Loss: {epoch_avg:.4f}")
        writer.add_scalar('Loss/epoch_avg', epoch_avg, epoch)

    # Save model
    torch.save(model.state_dict(), "char_rnn_model.pt")
finally:
    writer.close()

ModuleNotFoundError: No module named 'tensorboard'