In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
import math

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

from constants import *
from encoding_decoding_lookup import *

## Training our model

### Dataset

### Diacritizer

# check steps in model_train.py

In [3]:
# # just like the lab, but with tf, we would do init with all needed variables and a forward fn
# #what we need: self, window_size, lstm_size, dropout_rate,embedding_size
# class Diacritizer:
#     def __init__(self)
#     def __init__(self, vocab_size=35181, embedding_dim=50, hidden_size=50, n_classes=len(tag_map)):
    

In [4]:
# not sure if we need vocab size since we use window
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional


class Diacritizer(Model):
    def __init__(self, embedding_size=DEFAULT_EMBEDDING_SIZE, lstm_size=DEFAULT_LSTM_SIZE, window_size=DEFAULT_WINDOW_SIZE,dropout_rate=DEFAULT_DROPOUT_RATE):
        super(Diacritizer, self).__init__()

        # in the initialization, we define: input, embeddings, and output
        # in lab 5 we had a linear where number of neurons= number of classes but here we have dense

        ######################################## Creating the layers of our model #####################

        # Step 1: define the input
        # input would be the fixed window size after lookup 
        # Input in tf defines the input layer with shape=(window_size,)
        # it represents the input data.
        self.inputs = Input(shape=(window_size,), name='input')


        # Step 2: define the embeddings
        # it converts the integer index to dense vectors with fixed size
        # input dim is dim of the letters (length of valid input letters we have)
        # the output is the dense vector (embeddings size which is 128)
        # the ( ) at the end mean that this layer would be applied to the input layer
        self.embedding = Embedding(input_dim= len(SORTED_VALID_INPUT_LETTERS) + 1, output_dim= embedding_size, name='embedding')(self.inputs)
        

        # Step 3: Define the Bidirectional LSTM layers (we have 4 layers of each class and an initial layer)
        
        # initial layer that would be applied to the embeddings
        self.initial_layer = Bidirectional(LSTM(lstm_size, dropout=dropout_rate, return_sequences=True),
                                      name='initial_layer')(self.embedding)
        
        # first layer is sukun layer that would be applied on initial layer
        self.sukoon_layer = Bidirectional(LSTM(lstm_size, dropout=dropout_rate, return_sequences=True),
                                     name='sukoon_layer')(self.initial_layer)
        
        # sec layer is shadda layer that would be applied to sukun layer
        self.shadda_layer = Bidirectional(LSTM(lstm_size, dropout=dropout_rate, return_sequences=True),
                                     name='shadda_layer')(self.sukoon_layer)
        
        # third layer is sec layer that would be applied to shadda layer
        self.secondary_diacritics_layer = Bidirectional(LSTM(lstm_size, dropout=dropout_rate, return_sequences=True),
                                                   name='secondary_diacritics_layer')(self.shadda_layer)
        
        # forth layer is primary layer that would be applied to sec layer
        self.primary_diacritics_layer = Bidirectional(LSTM(lstm_size, dropout=dropout_rate, return_sequences=True),
                                                 name='primary_diacritics_layer')(self.secondary_diacritics_layer)

        # Step 4: Define the output layers (we have 4 outputs with diff dense)
        # dense functions define the activation functions in tensor flow
        # where dense=1 means sigmoid and dense =2 means softmax

        # Sukun output would have a sigmoid as its binary and would come from the sukun layer
        # dense= 1 means single neuorn we just wanna know if 0 no sukun or 1 yes sukun
        # SIGMOID
        self.sukoon_output = Dense(1, activation='sigmoid',name='sukoon_output')(self.sukoon_layer)

        # shadda output
        # SIGMOID
        self.shadda_output = Dense(1,activation='sigmoid',   name='shadda_output')(self.shadda_layer)

        # sec output
        # dense= 4 as we have multi classification task, where each neuron assigned to one class
        # we have 0,1,2,3 as per the lookup table
        # SOFTMAX
        self.secondary_diacritics_output = Dense(4, activation='softmax' , name='secondary_diacritics_output')(self.secondary_diacritics_layer)

        # prim output
        # SOFTMAX
        self.primary_diacritics_output = Dense(4,  activation='softmax' ,name='primary_diacritics_output')(self.primary_diacritics_layer)
       

    def forward(self, inputs):
        # Step 4: Define the forward pass through the layers
        initial_layer = self.initial_layer(inputs)
        sukoon_layer = self.sukoon_layer(initial_layer)
        shadda_layer = self.shadda_layer(sukoon_layer)
        secondary_diacritics_layer = self.secondary_diacritics_layer(shadda_layer)
        primary_diacritics_layer = self.primary_diacritics_layer(secondary_diacritics_layer)


        # Step 5: Separate outputs for each diacritic
        sukoon_output = self.sukoon_output(sukoon_layer)
        shadda_output = self.shadda_output(shadda_layer)
        secondary_diacritics_output = self.secondary_diacritics_output(secondary_diacritics_layer)
        primary_diacritics_output = self.primary_diacritics_output(primary_diacritics_layer)
       

        return primary_diacritics_output, secondary_diacritics_output, shadda_output, sukoon_output


In [5]:
model = Diacritizer()
print(model)

<__main__.Diacritizer object at 0x000002A1E0C6F760>


## Input/output preprocessing

In [6]:
def read_sentences_from_file(file1_path, file2_path):
    sentences_with_diacritics = []
    sentences_without_diacritics = []

    with open(file1_path, 'r', encoding='utf-8') as file:
        for line in file:
            sentences_with_diacritics.append(line)

    with open(file2_path, 'r', encoding='utf-8') as file:
        for line in file:
            sentences_without_diacritics.append(line)

    return sentences_with_diacritics, sentences_without_diacritics

# Example usage:
file_path_with_diacritics = '../data/sentences_with_diacritics.txt'
file_path_without_diacritics = '../data/sentences_without_diacritics.txt'

sentences_with_diacritics, sentences_without_diacritics = read_sentences_from_file(file_path_with_diacritics,file_path_without_diacritics)
print("Sentences with Diacritics:", sentences_with_diacritics[0])
print("Sentences without Diacritics:", sentences_without_diacritics[0])
print("Length of Sentences with Diacritics:", len(sentences_with_diacritics))
print("Length of Sentences without Diacritics:", len(sentences_without_diacritics))


Sentences with Diacritics: قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَمَّدٌ قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ هُوَ كَالزِّنْدِيقِ إذَا عَمِلَ السِّحْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ 

Sentences without Diacritics: قوله أو قطع الأول يده إلخ قال الزركشي ابن عرفة قوله بلفظ يقتضيه كإنكار غير حديث بالإسلام وجوب ما علم وجوبه من الدين ضرورة كإلقاء مصحف بقذر وشد زنار ابن عرفة قول ابن شاس أو بفعل يتضمنه هو كلبس الزنار وإلقاء المصحف في صريح النجاسة والسجود للصنم ونحو ذلك وسحر محمد قول مالك وأصحابه أن الساحر كافر بالله تعالى قال مالك هو كالزنديق 

In [7]:
print(sentences_with_diacritics[0][0:8])
print(sentences_without_diacritics[0][0:5])

قَوْلُهُ
قوله 


In [8]:
arabicDictionary=['ى', 'ع', 'ظ', 'ح', 'ر', 'س', 'ي', 'ش', 'ض', 'ق', ' ', 'ث', 'ل', 'ص', 'ط', 'ك', 'آ', 'م', 'ا', 'إ', 'ه', 'ز', 'ء', 'أ', 'ف', 'ؤ', 'غ', 'ج', 'ئ', 'د', 'ة', 'خ', 'و', 'ب', 'ذ', 'ت', 'ن']

In [9]:
def getDiacriticsForSentence(listOfSentencesWithDiacritics):
    #now that we have two separated lists we need to get the diacritics list
    short_vowels_list=list()
    double_case_endings_list=list()
    shadda_list=list()
    sukoon_list=list()
    counter=0
    # letters_counter=0
    for word in listOfSentencesWithDiacritics:
        while counter<len(word):
            if word[counter] in arabicDictionary: #checking if the character is a letter
                # letters_counter+=1
                if (counter+1)<len(word):
                    #checking if the next character is also a letter, then that means that the diacritics of the current letter is none so add empty string to the list
                    if word[counter +1] in arabicDictionary:
                        short_vowels_list.append("")
                        double_case_endings_list.append("")
                        shadda_list.append("")
                        sukoon_list.append("")
                        counter+=2
                        # letters_counter+=1
                        continue
                counter+=1 #if it is the end of the word (no more letters) or the next character is a diacritics -> continue looping
                continue
            else:
                if word[counter] in SHORT_VOWELS:
                    short_vowels_list.append(word[counter])
                    double_case_endings_list.append("")
                    shadda_list.append("")
                    sukoon_list.append("")
                elif word[counter] in DOUBLE_CASE_ENDINGS:
                    double_case_endings_list.append(word[counter])
                    short_vowels_list.append("")
                    shadda_list.append("")
                    sukoon_list.append("")
                elif word[counter] == SHADDA:
                    shadda_list.append(word[counter])
                    short_vowels_list.append("")
                    double_case_endings_list.append("")
                    sukoon_list.append("")
                else:
                    sukoon_list.append(word[counter])
                    short_vowels_list.append("")
                    double_case_endings_list.append("")
                    shadda_list.append("")
                counter+=1
        counter=0
    # print(letters_counter)
    return short_vowels_list,double_case_endings_list,shadda_list,sukoon_list

In [10]:
def getDiacriticsForDataSet(sentences_with_diacritics):
    short_vowels_list=list()
    double_case_endings_list=list()
    shadda_list=list()
    sukoon_list=list()
    for sentence in sentences_with_diacritics:
        list_to_be_sent= sentence.split(" ")
        if ("\n") in list_to_be_sent:
            list_to_be_sent.remove("\n")
        sv_list,dce_list,sh_list,su_list= getDiacriticsForSentence(list_to_be_sent)
        short_vowels_list.append(sv_list)
        double_case_endings_list.append(dce_list)
        shadda_list.append(sh_list)
        sukoon_list.append(su_list)
    return short_vowels_list,double_case_endings_list,shadda_list,sukoon_list


In [11]:
short_vowels_list,double_case_endings_list,shadda_list,sukoon_list = getDiacriticsForDataSet(sentences_with_diacritics)

In [12]:
# print(len(sentences_with_diacritics[0].split(" ")))
print(len(shadda_list[0]))
print(shadda_list[0]) #this shows the diacritics of the first sentence
# print(" ".join(short_vowels_list[0])) #diacritics not in a list

296
['', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', 'ّ', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ّ', '', '', 'ّ', '', '', '', '', '', '', '', '', '', '', 

In [13]:
encoded_shadda_list= encodeShaddaList(shadda_list)
encoded_sukoon_list = encodeSukoonList(sukoon_list)

In [14]:
print(encoded_shadda_list[0])
print(encoded_sukoon_list[0])
print(len(encoded_shadda_list))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [15]:
encoded_sentence=[ENCODE_SHORT_VOWELS_LOOKUP.lookup(tf.constant(char)).numpy() for char in sentences_with_diacritics[0][0]]
print(encoded_sentence)

[0]


In [16]:
print(len(sentences_without_diacritics))
print(len(short_vowels_list))
print(len(double_case_endings_list))
print(len(sukoon_list))
print(len(shadda_list))

40836
40836
40836
40836
40836


In [17]:
def encode_input_sentence(sentence):
    encoded_sentence=[ENCODE_LETTERS_LOOKUP.lookup(tf.constant(char)).numpy() for char in sentence]
    return encoded_sentence

def encode_input_sentences(sentences):
        # loop on each sentence and encode it
        encoded_input_sentences=[]
        for sentence in sentences:
              encoded_input_sentences.append(encode_input_sentence(sentence))
        return encoded_input_sentences

def encode_output_sentence_for_short_vowels(sentence):
    encoded_sentence=[ENCODE_SHORT_VOWELS_LOOKUP.lookup(tf.constant(char)).numpy() for char in sentence]
    return encoded_sentence

def encode_output_sentence_for_double_case_endings(sentence):
    encoded_sentence=[ENCODE_DOUBLE_CASE_ENDINGS_LOOKUP.lookup(tf.constant(char)).numpy() for char in sentence]
    return encoded_sentence

# def encode_output_sentence_for_sukun_and_shadda(sentence):
#     encoded_sentence=[ENCODE_BINARY_LOOKUP.lookup(tf.constant(char)).numpy() for char in sentence]
#     return encoded_sentence

def encode_output_sentences_for_short_vowels(sentences):
    encoded_output_sentences=[]
    for sentence in sentences:
              encoded_output_sentences.append(encode_output_sentence_for_short_vowels(sentence))
    return encoded_output_sentences

def encode_output_sentences_for_double_case_endings(sentences):
    encoded_output_sentences=[]
    for sentence in sentences:
              encoded_output_sentences.append(encode_output_sentence_for_double_case_endings(sentence))
    return encoded_output_sentences

# def encode_output_sentences_for_sukun_and_shadda(sentences):
#     encoded_output_sentences=[]
#     for sentence in sentences:
#               encoded_output_sentences.append(encode_output_sentence_for_sukun_and_shadda(sentence))
#     return encoded_output_sentences

def encodeShaddaList(shadda_list):
    encoded_shadda_list=list()
    for sentence in shadda_list:
        list_shadda_encoded=[1 if char==SHADDA else 0 for char in sentence]
        encoded_shadda_list.append(list_shadda_encoded)
    return encoded_shadda_list

def encodeSukoonList(sukoon_list):
    encoded_sukoon_list=list()
    for sentence in sukoon_list:
        list_sukoon_encoded=[1 if char==SUKOON else 0 for char in sentence]
        encoded_sukoon_list.append(list_sukoon_encoded)
    return encoded_sukoon_list




In [18]:
# encoded_input= encode_input_sentences(sentences_without_diacritics)
# encoded_gold_output_for_short_vowels= encode_output_sentences_for_short_vowels(short_vowels_list)
# encoded_gold_output_for_double_case_endings= encode_output_sentences_for_double_case_endings(double_case_endings_list)
# encoded_gold_output_for_shada= encodeShaddaList(shadda_list)
# encoded_gold_output_for_sukun = encodeSukoonList(sukoon_list)


In [19]:
# encoded_input_df = pd.DataFrame(encoded_input)
# csv_filename = 'encoded_input.csv'
# encoded_input_df.to_csv(csv_filename, index=True)

In [20]:
# encoded_golden_short_vowels_df = pd.DataFrame(encoded_gold_output_for_short_vowels)
# csv_filename = 'encoded_golden_short_vowels.csv'
# encoded_golden_short_vowels_df.to_csv(csv_filename, index=True)

In [21]:
# encoded_golden_double_case_endings_df = pd.DataFrame(encoded_gold_output_for_double_case_endings)
# csv_filename = 'encoded_golden_double_case_endings.csv'
# encoded_golden_double_case_endings_df.to_csv(csv_filename, index=True)

In [22]:
# encoded_golden_shadda_df = pd.DataFrame(encoded_gold_output_for_shada)
# csv_filename = 'encoded_golden_shadda.csv'
# encoded_golden_shadda_df.to_csv(csv_filename, index=True)

In [23]:
# encoded_golden_sukkun_df = pd.DataFrame(encoded_gold_output_for_sukun)
# csv_filename = 'encoded_golden_sukkun.csv'
# encoded_golden_sukkun_df.to_csv(csv_filename, index=True)

In [24]:
# def readCSVtoListofLists(filepath):
#     major_list = []
#     df=pd.read_csv(filepath)
#     # Iterate through DataFrame rows and create a list from each row
#     for index, row in df.iterrows():
#         row_list = row.values.tolist()  # Convert row to a list
#         row_list.pop(0)
#         row_list = [x for x in row_list if not math.isnan(x)]
#         row_list = [int(num) for num in row_list]
#         major_list.append(row_list)  # Append the list to the major list
#     return major_list
def readCSVtoListofLists(filepath):
    major_list = []
    df=pd.read_csv(filepath)
    df=df.fillna(0)
    # Iterate through DataFrame rows and create a list from each row
    for index, row in df.iterrows():
        row_list = row.values.tolist()  # Convert row to a list
        row_list.pop(0)
        # row_list = [x if x != np.nan else 0 for x in row_list]
        row_list = [int(num) for num in row_list]
        major_list.append(row_list)  # Append the list to the major list
    return major_list

In [25]:
encoded_input = readCSVtoListofLists("./encoded_input.csv")

In [26]:
encoded_gold_output_for_short_vowels=readCSVtoListofLists("./encoded_golden_short_vowels.csv")

In [27]:
encoded_gold_output_for_double_case_endings=readCSVtoListofLists("./encoded_golden_double_case_endings.csv")

In [28]:
encoded_gold_output_for_shadda=readCSVtoListofLists("./encoded_golden_shadda.csv")

In [29]:
encoded_gold_output_for_sukun=readCSVtoListofLists("./encoded_golden_sukkun.csv")

In [30]:
# print(sentences_without_diacritics[0][-1])
print(sentences_with_diacritics[0])
print(sentences_without_diacritics[0])
print("Encoded Input:", encoded_input[0])
print("Encoded Gold Output Primary Class:", encoded_gold_output_for_short_vowels[0])
print("Encoded Gold Output Sec Class:", encoded_gold_output_for_double_case_endings[0])
print("Encoded Gold Output Sukun Class:", encoded_gold_output_for_sukun[0])
print("Encoded Gold Output Shadda Class:", encoded_gold_output_for_shadda[0])

قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَمَّدٌ قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ هُوَ كَالزِّنْدِيقِ إذَا عَمِلَ السِّحْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ 

قوله أو قطع الأول يده إلخ قال الزركشي ابن عرفة قوله بلفظ يقتضيه كإنكار غير حديث بالإسلام وجوب ما علم وجوبه من الدين ضرورة كإلقاء مصحف بقذر وشد زنار ابن عرفة قول ابن شاس أو بفعل يتضمنه هو كلبس الزنار وإلقاء المصحف في صريح النجاسة والسجود للصنم ونحو ذلك وسحر محمد قول مالك وأصحابه أن الساحر كافر بالله تعالى قال مالك هو كالزنديق إذا عمل السحر بنفسه قتل ولم يستتب 

Encoded Input: [30, 3

In [42]:
max_length = max(len(lst) for lst in encoded_input)
# Pad the lists to the maximum length
padded_short_vowels = [lst + [0] * (max_length - len(lst)) for lst in encoded_gold_output_for_short_vowels]

# max_length = max(len(lst) for lst in encoded_gold_output_for_double_case_endings)
# Pad the lists to the maximum length
padded_double_case_endings = [lst + [0] * (max_length - len(lst)) for lst in encoded_gold_output_for_double_case_endings]

# max_length = max(len(lst) for lst in encoded_gold_output_for_sukun)
# Pad the lists to the maximum length
padded_sukun = [lst + [0] * (max_length - len(lst)) for lst in encoded_gold_output_for_sukun]

# max_length = max(len(lst) for lst in encoded_gold_output_for_shadda)
# Pad the lists to the maximum length
padded_shadda = [lst + [0] * (max_length - len(lst)) for lst in encoded_gold_output_for_shadda]

In [61]:
dataset = tf.data.Dataset.from_tensor_slices((encoded_input[0:1000],
                                              (padded_short_vowels[0:1000],
                                               padded_double_case_endings[0:1000],
                                               padded_shadda[0:1000],
                                               padded_sukun[0:1000])))



In [62]:
for line in dataset.take(5):  # Take and print the first 5 elements
    print(line[0]) #input
    print(line[1][0]) #short vowels
    break

tf.Tensor([30 36 32 ...  0  0  0], shape=(3482,), dtype=int32)
tf.Tensor([1 0 2 ... 0 0 0], shape=(3482,), dtype=int32)


In [65]:
dataset1 = dataset.concatenate(tf.data.Dataset.from_tensor_slices((
            tf.zeros((1, 7), tf.int32),
            tuple(tf.zeros((1,7), tf.int32) for _ in range(4))
        )))

zip_data = lambda x, y: tf.data.Dataset.zip((x, y))
dataset1 = dataset1.unbatch().window(21, 7, drop_remainder=True) \
            .flat_map(zip_data).batch(21, drop_remainder=True)
size = dataset1.reduce(0, lambda old, new: old + 1).numpy()


In [66]:
counter=0
for line in dataset1.take(5):  # Take and print the first 5 elements
    print({'dataset': line, 'size': size})
    counter+=1
    if counter==4:
        break

{'dataset': (<tf.Tensor: shape=(21,), dtype=int32, numpy=
array([30, 36, 32, 35,  1,  5, 36,  1, 30, 25, 27,  1,  9, 32,  5, 36, 32,
        1, 38, 17, 35])>, (<tf.Tensor: shape=(21,), dtype=int32, numpy=array([1, 0, 2, 2, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 2, 1, 1, 2, 0, 1, 0])>, <tf.Tensor: shape=(21,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>, <tf.Tensor: shape=(21,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])>, <tf.Tensor: shape=(21,), dtype=int32, numpy=array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])>)), 'size': 497427}
{'dataset': (<tf.Tensor: shape=(21,), dtype=int32, numpy=
array([ 1, 30, 25, 27,  1,  9, 32,  5, 36, 32,  1, 38, 17, 35,  1,  7, 32,
       16,  1, 30,  9])>, (<tf.Tensor: shape=(21,), dtype=int32, numpy=array([1, 1, 0, 0, 1, 0, 1, 2, 1, 1, 2, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0])>, <tf.Tensor: shape=(21,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0,

## Division to batches where each batch takes 100 line (row)

## Input/output concat 

## Sliding Window Approach

## Batch and input preparation

## Training

## Evaluation

--- 