In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from indexer import ArabicCorpusIndexer
from lang import Lang
from lang import SOS_token 
from lang import EOS_token 
from lang import PAD_token 

from torch.nn.utils.rnn import pad_sequence

# LSTM model

In [2]:
class LSTMqa(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_size,num_layers,dropout):
        super(LSTMqa,self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        # Encoder
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(input_size= embedding_dim, hidden_size = self.hidden_size, num_layers = self.num_layers,dropout = self.dropout,batch_first=True)
        
        # Decoder
        self.decoder_lstm = nn.LSTM(input_size= embedding_dim, hidden_size = self.hidden_size, num_layers = self.num_layers,dropout = self.dropout,batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size )
        
    def forward(self, questions_seq, answers_seq):
        # Encoder
        embed = self.embedding(questions_seq)
        q_seq, (hidden, cell) = self.encoder_lstm(embed)

        # Decoder - handle None input for answers during inference
        if answers_seq is not None:
            decoder_input = self.embedding(answers_seq)
        else:
            # Start with the same hidden and cell states
            decoder_input = torch.zeros_like(embed)
        a_seq, _ = self.decoder_lstm(decoder_input, (hidden, cell))

        output = self.linear(a_seq)
        return output


In [3]:
lang = Lang()

## Numerical Encoding (Indexing)

In [4]:
def start_end_tokens(sentence,lang = Lang()):
    lang.addSentence(sentence)
    index2word = []
    #for string in sentence.split():
        # Gives every word an index nice
    
        # Adds an sos and eos tokens to each string
    index2word.append("SOS" + ' ' +  sentence + ' ' + "EOS")   
    
    
    return index2word


# corpus = [
#      "هذا نص عربي",
#      "هذا نص اخر",
#      "نص باللغة العربية",
#      "احبك يا امي"
#  ]
# sentences = start_end_tokens(corpus,lang)
# print(f"index2word :{sentences} ")

    

In [5]:
def token_to_number(strings, lang = Lang()):
    """
    This function convert tokens to it's index from
    word2index dict

    Args:
        strings (list of strings): _description_
        lang (tokenizer):Holds the tokenized words and different dict

    Returns:
        A list of Lists of the transformed strings to indexes
    """
    # TO_DO: make it retrun each string to it's own not a whole string
    # يرجع كل جملة لحالها مب كلها مدموجة في بعض 
    indices = []
    for string in strings:
        sentences_list = []
        split = string.split()
        for i,word in enumerate(split):  
            if word in lang.word2index.keys():
                sentences_list.append(lang.word2index[word])
            elif word == "SOS": 
                sentences_list.append(SOS_token)
            elif word == "EOS": 
                sentences_list.append(EOS_token)
        indices.append(sentences_list)
                


    return indices

# tokenized = token_to_number(sentences,lang)
# print(tokenized)

# Dataset class for the dataloader

In [6]:
# TO-DO: change data loader to accommodate the changes in the tokneizer
# Change how get item function works by making it use the functions 
# start_end_tokens & token_to_number
class relDataset(Dataset):
    def __init__(self, csv_file,lang): 
        # Transform to pandas object 
        self.file_out = pd.read_excel(csv_file)
        self.file_out = self.file_out[['Question_Text','Full_Answer']]
        self.lang = lang
    
    
    def __len__(self):
        return len(self.file_out)
        
        
    def __getitem__(self,idx):
        question = self.file_out.iloc[idx]['Question_Text']
        answer = self.file_out.iloc[idx]['Full_Answer']
        
        # self.lang.addSentence(question)
        # self.lang.addSentence(answer)
        # Comment
        q_lst = start_end_tokens(question,self.lang)
        a_lst = start_end_tokens(answer,self.lang)
        
        q_indexes = token_to_number(q_lst,lang)
        a_indexes = token_to_number(a_lst,lang)
        # INDEXER CLASS
        
        # q_word = question.split()
        # a_word = answer.split()
        
        # q_index = [self.indexer.get_index(word) for word in q_word]
        # a_index = [self.indexer.get_index(word) for word in a_word]
        
        # END OF INDEXER CLASS
        
        # question_indices = [self.lang.word2index[word] for word in question.split() if word in self.lang.word2index]
        # answer_indices = [self.lang.word2index[word] for word in answer.split() if word in self.lang.word2index]

        # To tensor 
        
        question_tensor = torch.tensor(q_indexes)
        answer_tensor = torch.tensor(a_indexes)
        
        return {"questions": question_tensor,"answer":answer_tensor}
    
        

In [7]:
lang.word2index

{'SOS': 0, 'EOS': 1, '<PAD>': 2}

In [8]:
dataset = relDataset('HAQA.xlsx',lang)
sample = dataset.__getitem__(idx=2)
# Print the returned sample
print(sample)


{'questions': tensor([[0, 3, 4, 5, 6, 7, 1]]), 'answer': tensor([[ 0,  8,  9, 10, 11, 12, 13, 14,  9, 15, 16, 17, 18,  8,  9, 10, 11, 12,
         19, 20, 21, 22, 23, 13, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37,  1]])}


In [9]:
lang.index2word

{0: 'SOS',
 1: 'EOS',
 2: '<PAD>',
 3: 'ما',
 4: 'هو',
 5: 'الإحسان',
 6: 'في',
 7: 'العبادة؟',
 8: 'مراقبة',
 9: 'الله',
 10: 'وحده',
 11: 'الَّذِي',
 12: 'يرانا',
 13: '(',
 14: 'إنّ',
 15: 'كَانَ',
 16: 'عليكم',
 17: 'رقيباً)',
 18: '[النّساء].',
 19: '(الَّذِي',
 20: 'يراك',
 21: 'حينَ',
 22: 'تقومُ)',
 23: '[الشّعراء].',
 24: 'الإحسانُ',
 25: 'أَنْ',
 26: 'تعبُدَ',
 27: 'اللهَ',
 28: 'كأنّك',
 29: 'تراه',
 30: 'فإن',
 31: 'لَمْ',
 32: 'تكن',
 33: 'تراهُ',
 34: 'فإنَّه',
 35: 'يراك)',
 36: '(رواه',
 37: 'مسلم)'}

In [10]:
from torch.nn.utils.rnn import pad_sequence
import torch

def custom_collate_fn(batch):
    # Extract questions and answers and handle possible dimension issues
    questions = [item['questions'].squeeze(0) for item in batch if item['questions'].nelement() > 0]
    answers = [item['answer'].squeeze(0) for item in batch if item['answer'].nelement() > 0]

    pad_token_index = lang.word2index['<PAD>']

    # Check if any sequences are empty and handle them appropriately
    if len(questions) == 0 or len(answers) == 0:
        print("Warning: Received an empty list of sequences.")
        return None  # Or handle as needed

    # Pad sequences
    questions_padded = pad_sequence(questions, batch_first=True, padding_value=pad_token_index)
    answers_padded = pad_sequence(answers, batch_first=True, padding_value=pad_token_index)

    return {'questions': questions_padded, 'answer': answers_padded}


In [11]:
lang.n_words

38

In [12]:
HAQA_ds = relDataset('HAQA.xlsx',lang)

In [13]:
for x in HAQA_ds:
    print(x)


{'questions': tensor([[ 0, 38, 39, 40,  1]]), 'answer': tensor([[ 0, 41, 42,  9, 43, 44, 45, 13, 46, 47, 48, 49, 27, 50, 51, 52, 53, 54,
         13, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 36, 37,  1]])}
{'questions': tensor([[ 0, 65, 39,  9, 66, 67,  1]]), 'answer': tensor([[ 0, 68, 69, 70, 71, 72, 73, 74, 75, 70, 76, 77, 71, 78, 79, 80, 81, 13,
         82, 27, 83, 84, 85, 86, 87, 88, 89, 90, 91,  1]])}
{'questions': tensor([[0, 3, 4, 5, 6, 7, 1]]), 'answer': tensor([[ 0,  8,  9, 10, 11, 12, 13, 14,  9, 15, 16, 17, 18,  8,  9, 10, 11, 12,
         19, 20, 21, 22, 23, 13, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37,  1]])}
{'questions': tensor([[ 0,  3, 92, 93, 94, 95, 40,  1]]), 'answer': tensor([[  0,  96,  97,  98,  48,   9,  13,  99, 100,  27,   4, 101, 102, 103,
         104, 105, 106, 107, 108, 109,  13,  76, 110, 111,  94,  48,   9, 112,
         113, 114,  86, 115,   9, 116, 117, 118,  36,  37,   1]])}
{'questions': tensor([[  0,   3,   4, 119,   6, 120,  

In [14]:
lang.n_words

25450

In [15]:
print(HAQA_ds.lang.index2word.get(0))

SOS


In [16]:
for idx, sample in enumerate(HAQA_ds):
    print(idx,sample)

0 {'questions': tensor([[ 0, 38, 39, 40,  1]]), 'answer': tensor([[ 0, 41, 42,  9, 43, 44, 45, 13, 46, 47, 48, 49, 27, 50, 51, 52, 53, 54,
         13, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 36, 37,  1]])}
1 {'questions': tensor([[ 0, 65, 39,  9, 66, 67,  1]]), 'answer': tensor([[ 0, 68, 69, 70, 71, 72, 73, 74, 75, 70, 76, 77, 71, 78, 79, 80, 81, 13,
         82, 27, 83, 84, 85, 86, 87, 88, 89, 90, 91,  1]])}
2 {'questions': tensor([[0, 3, 4, 5, 6, 7, 1]]), 'answer': tensor([[ 0,  8,  9, 10, 11, 12, 13, 14,  9, 15, 16, 17, 18,  8,  9, 10, 11, 12,
         19, 20, 21, 22, 23, 13, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37,  1]])}
3 {'questions': tensor([[ 0,  3, 92, 93, 94, 95, 40,  1]]), 'answer': tensor([[  0,  96,  97,  98,  48,   9,  13,  99, 100,  27,   4, 101, 102, 103,
         104, 105, 106, 107, 108, 109,  13,  76, 110, 111,  94,  48,   9, 112,
         113, 114,  86, 115,   9, 116, 117, 118,  36,  37,   1]])}
4 {'questions': tensor([[  0,   3,   4, 119,  

In [17]:
dataloader = DataLoader(HAQA_ds, batch_size=2, shuffle=True,collate_fn=custom_collate_fn)

In [18]:
for batch in dataloader:
    print(batch['answer'].shape)

torch.Size([2, 86])
torch.Size([2, 360])
torch.Size([2, 47])
torch.Size([2, 66])
torch.Size([2, 75])
torch.Size([2, 40])
torch.Size([2, 38])
torch.Size([2, 244])
torch.Size([2, 162])
torch.Size([2, 70])
torch.Size([2, 32])
torch.Size([2, 360])
torch.Size([2, 243])
torch.Size([2, 101])
torch.Size([2, 86])
torch.Size([2, 56])
torch.Size([2, 101])
torch.Size([2, 56])
torch.Size([2, 46])
torch.Size([2, 59])
torch.Size([2, 81])
torch.Size([2, 193])
torch.Size([2, 78])
torch.Size([2, 64])
torch.Size([2, 82])
torch.Size([2, 52])
torch.Size([2, 95])
torch.Size([2, 55])
torch.Size([2, 37])
torch.Size([2, 71])
torch.Size([2, 56])
torch.Size([2, 154])
torch.Size([2, 76])
torch.Size([2, 92])
torch.Size([2, 126])
torch.Size([2, 288])
torch.Size([2, 109])
torch.Size([2, 304])
torch.Size([2, 69])
torch.Size([2, 58])
torch.Size([2, 75])
torch.Size([2, 55])
torch.Size([2, 43])
torch.Size([2, 47])
torch.Size([2, 103])
torch.Size([2, 132])
torch.Size([2, 40])
torch.Size([2, 120])
torch.Size([2, 44])
torc

In [19]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [20]:
from torch import optim


model = LSTMqa(vocab_size=lang.n_words, embedding_dim=8, hidden_size=4, num_layers=2, dropout=0.5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.to(device)


def train(model, dataloader, criterion, optimizer, num_epochs=10, device=device):
    model.train()  # Set the model to training mode
    
    for epoch in range(num_epochs):
        total_loss = 0
        
        for batch in dataloader:
            # Move data to the appropriate device
            questions = batch['questions'].to(device)
            answers = batch['answer'].to(device)
            
            optimizer.zero_grad()
            outputs = model(questions, answers)
            outputs = outputs.view(-1, outputs.shape[-1])
            answers = answers.view(-1)
            loss = criterion(outputs, answers)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

# Call train function
train(model, dataloader, criterion, optimizer, num_epochs=2, device=device)




  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/2], Loss: 8.7242


KeyboardInterrupt: 

In [None]:
lang.index2word[1234]

'ماهو'

In [None]:
lang.word2index

{'SOS': 0,
 'EOS': 1,
 '<PAD>': 2,
 'ما': 3,
 'هو': 4,
 'الإحسان': 5,
 'في': 6,
 'العبادة؟': 7,
 'مراقبة': 8,
 'الله': 9,
 'وحده': 10,
 'الَّذِي': 11,
 'يرانا': 12,
 '(': 13,
 'إنّ': 14,
 'كَانَ': 15,
 'عليكم': 16,
 'رقيباً)': 17,
 '[النّساء].': 18,
 '(الَّذِي': 19,
 'يراك': 20,
 'حينَ': 21,
 'تقومُ)': 22,
 '[الشّعراء].': 23,
 'الإحسانُ': 24,
 'أَنْ': 25,
 'تعبُدَ': 26,
 'اللهَ': 27,
 'كأنّك': 28,
 'تراه': 29,
 'فإن': 30,
 'لَمْ': 31,
 'تكن': 32,
 'تراهُ': 33,
 'فإنَّه': 34,
 'يراك)': 35,
 '(رواه': 36,
 'مسلم)': 37,
 'كيف': 38,
 'نعبد': 39,
 'الله؟': 40,
 'كَمَا': 41,
 'أمرنا': 42,
 'ورسوله': 43,
 'مَعَ': 44,
 'الإخلاص': 45,
 'وَمَا': 46,
 'أُمروا': 47,
 'إِلاَّ': 48,
 'ليَعبدوا': 49,
 'مخلصينَ': 50,
 'لَهُ': 51,
 'الدّين)': 52,
 '[البيّنة:': 53,
 '5].': 54,
 'مَن': 55,
 'عمِلَ': 56,
 'عمَلاً': 57,
 'لَيْسَ': 58,
 'عَلَيهِ': 59,
 'أمْرُنا': 60,
 'فَهُوَ': 61,
 'رَدٌّ)': 62,
 '[أَيْ': 63,
 'مردود]': 64,
 'هل': 65,
 'خوفا': 66,
 'وطمعا؟': 67,
 'نعم': 68,
 'نعبده': 69,
 'خوفاً': 70,
 'وطم

In [None]:
def generate_answer(model, question_tensor, lang, max_len=80):
    device = next(model.parameters()).device
    model.eval()

    sos_token_id = lang.word2index["SOS"]
    eos_token_id = lang.word2index["EOS"]
    answer_seq = [sos_token_id]
    answer_tensor = torch.tensor([answer_seq], dtype=torch.long, device=device)

    for _ in range(max_len):
        with torch.no_grad():
            output = model(question_tensor, answer_tensor)
            next_token_id = output[:, -1, :].argmax(dim=-1).item()
            answer_seq.append(next_token_id)
            answer_tensor = torch.tensor([answer_seq], dtype=torch.long, device=device)

            if next_token_id == eos_token_id:
                break

    answer_words = [lang.index2word[idx] for idx in answer_seq if idx in lang.index2word]
    return ' '.join(answer_words)



# Example usage
question = "ماهو الاحسان؟"
question_tokens = start_end_tokens(question, lang)
question_indices = token_to_number(question_tokens, lang)
question_tensor = torch.tensor(question_indices, dtype=torch.long) 
question_tensor = question_tensor.to(device)




In [None]:
question_tensor.shape

torch.Size([1, 4])

In [None]:
# Generate answer
answer = generate_answer(model, question_tensor, lang)
print(answer)

: 

In [21]:
import torch
print(torch.__version__)
print("CUDA Available: ", torch.cuda.is_available())


2.2.2+cpu
CUDA Available:  False
