In [155]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data import Subset
import numpy as np

import pandas as pd
import os
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random


In [156]:
kaggle_env = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') == 'Interactive'

In [157]:
if not kaggle_env:
    dottless_text_file = 'dotless_text.txt'
    original_text_file = 'original_text.txt'

# load the data
with open(dottless_text_file, 'r', encoding='utf-8') as f:
    dottless_text_list = f.readlines()

with open(original_text_file, 'r', encoding='utf-8') as f:
    original_text_list = f.readlines()

In [158]:
def get_vocab(data):
    vocab = set()
    for sentence in data:
        for letter in sentence:
            vocab.update(letter)
            
    vocab = sorted(vocab)
    
    char2index = {'<PAD>': 0}
    index2char = {0: '<PAD>'}
    for index, char in enumerate(vocab, len(index2char)):
        char2index[char] = index
        index2char[index] = char

    return vocab, char2index, index2char

# Create a vocab
dottless_chars, dottless_char2index, dottless_index2char = get_vocab(dottless_text_list)
original_chars, original_char2index, original_index2char = get_vocab(original_text_list)

# Convert to index
dottless_text_list_encoded = []
for sentence in dottless_text_list: 
    dottless_text_list_encoded.append([dottless_char2index[char] for char in sentence])

original_text_list_encoded = []
for sentence in original_text_list:
    original_text_list_encoded.append([original_char2index[char] for char in sentence])

In [159]:
def pad_sequence(x, max_len, pad_token_index=0):
    padded = np.full((max_len), fill_value=pad_token_index)
    if len(x) > max_len: padded[:] = x[:max_len]
    else: padded[:len(x)] = x
    return padded

# pad the sequences
max_length = 512
dottless_text_list_padded = [pad_sequence(sequence, max_length) for sequence in dottless_text_list_encoded]
original_text_list_padded = [pad_sequence(sequence, max_length) for sequence in original_text_list_encoded]

In [160]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(dottless_text_list_padded, original_text_list_padded, test_size=0.1, random_state=42)
 

In [161]:
class ArabicDottizationDataset(Dataset):
    def __init__(self, dottless_text, original_text):
        self.dottless_text = dottless_text
        self.original_text = original_text
         
    def __len__(self):
        return len(self.dottless_text)

    def __getitem__(self, index):     
        return torch.tensor(self.dottless_text[index]), torch.tensor(self.original_text[index])


In [162]:
train_dataset = ArabicDottizationDataset(x_train, y_train)
test_dataset = ArabicDottizationDataset(x_test, y_test)
  

# The Model

In [163]:
class ArabicDottizationModel(nn.Module):
    def __init__(self, dotless_vocab_size, dotted_vocab_size, embedding_dim, hidden_size):
        super().__init__()
        
        self.embedding = nn.Embedding(dotless_vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, dotted_vocab_size)
        
        
    def forward(self, dotless_sentence):
        embeddings = self.embedding(dotless_sentence)  
        embeddings = embeddings.transpose(0, 1)  
        output, _ = self.rnn(embeddings)  
        output = self.fc(output)  
        output = output.transpose(0, 1)  
        return output


In [164]:
embed_size = 512
hidden_size = 256
dottless_vocab_size = len(dottless_chars)
dotted_vocab_size = len(original_chars)
learning_rate = 1e-4
num_epochs = 20
batch_size = 128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [165]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = ArabicDottizationModel(dottless_vocab_size, dotted_vocab_size, embed_size, hidden_size).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=original_char2index['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [166]:
print(dotted_vocab_size)

129


In [None]:
# loop over the dataset multiple times
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in tqdm(
        enumerate(train_loader), total=len(train_loader), leave=False
    ):
        inputs, labels = inputs.to(device), labels.to(device).long()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
         
        outputs = outputs.reshape(-1, dotted_vocab_size) 
        labels = labels.reshape(-1)
         
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print('Loss: {}'.format(running_loss))

print('Finished Training')

  0%|          | 0/79 [00:00<?, ?it/s]

torch.Size([65536, 129]) torch.Size([65536])
torch.Size([65536, 129]) torch.Size([65536])


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Example dataset: [(sentence, POS tags)]
data = [
    (["The", "dog", "barked"], ["DET", "NOUN", "VERB"]),
    (["A", "cat", "meowed", 'NOW'], ["DET", "NOUN", "VERB", 'DET'])
]

# Vocabulary & Tag mapping
word_to_ix = {word: i for sent, _ in data for word in sent}
tag_to_ix = {"DET": 0, "NOUN": 1, "VERB": 2}

# Parameters
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# Model
class RNNTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        rnn_out, _ = self.rnn(embeds.view(len(sentence), 1, -1))
        tag_space = self.fc(rnn_out.view(len(sentence), -1))
        return tag_space

model = RNNTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Convert sentence/tags to tensor
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[word] for word in seq]
    return torch.tensor(idxs, dtype=torch.long)

# Training
for epoch in range(100):
    for sentence, tags in data:
        model.zero_grad()
        inputs = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

# Test
with torch.no_grad():
    test_sentence = ["A", "dog", "barked"]
    test_input = prepare_sequence(test_sentence, word_to_ix)
    outputs = model(test_input)
    predictions = torch.argmax(outputs, dim=1)
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    print("Predicted tags:", [ix_to_tag[ix.item()] for ix in predictions])


Predicted tags: ['DET', 'NOUN', 'VERB']
