In [1]:
# pip installs

In [2]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
import statistics
from tqdm import tqdm

from zipfile import ZipFile

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


# Data preprocessing

### Here we load the data and keep only the texts that include vowel punctuation (nikud)

In [3]:
import requests
import os
import json
# Path to the JSON file that describes the dataset
data_json_path = 'data/books.json'

# Root directory where the downloaded files will be saved
texts_path = 'data/texts'


# Create the directory if it does not exist. exist_ok=True folds the
# "check, then create" pair into one call that is safe to re-run.
os.makedirs(texts_path, exist_ok=True)


# Load the JSON dataset description.
# (The variable keeps its original name because the download cell refers to it.)
with open(data_json_path, 'r', encoding='utf-8') as f:
    jason_data = json.load(f)

# download the files and save them in a folder

#### Uncomment / re-comment the cell below as needed

In [4]:
# # Loop through the json dataset and download the files
# for entry in tqdm(jason_data):
#     try:
#         # Download the Nikud Meteg file
#         if entry['fileName'] + '__nikud_meteg' in os.listdir(texts_path):
#             continue
#         nikud_meteg_url = entry['nikudMetegFileURL']
#         nikud_meteg_local_path = os.path.join(texts_path, entry['fileName'] + '__nikud_meteg.zip')
#         nikud_meteg_response = requests.get(nikud_meteg_url)
#         with open(nikud_meteg_local_path, 'wb') as f:
#             f.write(nikud_meteg_response.content)

#             # Unzip the Nikud Meteg file
#             with ZipFile(nikud_meteg_local_path, 'r') as zipObj:
#                 zipObj.extractall(os.path.join(texts_path, entry['fileName'] + '__nikud_meteg'))
#     except Exception as e:
#         print(f"Error reading file {entry['fileName']}: {e}")
#         continue


# # iterate through the texts folder and delete the zip folders
# for file in tqdm(os.listdir(texts_path)):
#     if file.endswith(".zip"):
#         os.remove(os.path.join(texts_path, file))

            


# Author files

### Create a dictionary whose keys are authors and whose values are lists containing all of that author's files

In [5]:
# Define a method to create the author files dictionary
def create_author_files_dict(author_files, root=None):
    """
    Map each author directory to the list of `.txt` files it contains.

    Parameters
    ----------
    author_files : iterable of str
        Entry names located directly under the texts root directory.
    root : str, optional
        Directory that contains the author directories. Defaults to the
        notebook-level `texts_path` when omitted.

    Returns
    -------
    dict
        `{author_directory_name: [txt_file_name, ...]}`. File names are sorted
        so the result does not depend on the order `os.listdir` returns.
        Entries that are not directories are left out.
    """
    base_dir = texts_path if root is None else root
    author_files_dict = {}
    for file in author_files:
        author_dir = os.path.join(base_dir, file)
        # Stray non-directory entries (for example a leftover .zip download)
        # cannot be listed; skip them instead of letting os.listdir raise.
        if not os.path.isdir(author_dir):
            continue
        author_files_dict[file] = sorted(
            text_file_name
            for text_file_name in os.listdir(author_dir)
            if text_file_name.endswith('.txt')
        )
    return author_files_dict

# Every entry directly under texts_path is passed on as an author folder
author_files = os.listdir(texts_path)
# Dictionary built by create_author_files_dict from those entries
author_files_dict = create_author_files_dict(author_files)

# Functions to clean the data

In [6]:
# Nikud unicode range (https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet)

# Read a txt file from the author files dictionary
def read_txt_file(file_path):
    """
    Open the file at `file_path` as UTF-8 text and return its entire content as one string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Characters treated as nikud marks: the contiguous range U+05B0-U+05C2 used
# originally, plus U+05C7 (HEBREW POINT QAMATS QATAN), which sits outside that
# range and was therefore left in the "clean" text and counted as a letter.
# NOTE(review): the range also covers U+05BE (maqaf) and U+05C0 (paseq), which
# are punctuation rather than vowel points; that behavior is kept unchanged.
# The pattern is compiled once and shared so both functions always agree.
_NIKUD_RE = re.compile(r'[\u05B0-\u05C2\u05C7]')


def remove_nikud(string):
    """Return `string` with every nikud mark removed."""
    return _NIKUD_RE.sub("", string)


def get_nikud(word):
    """
    Return the nikud attached to each non-nikud character of `word`.

    The result has exactly one entry per character that is not a nikud mark,
    in order, so `len(get_nikud(w)) == len(remove_nikud(w))`. Each entry is the
    concatenation of the marks that follow that character ('' when there are
    none). Marks that appear before the first such character have nothing to
    attach to and are dropped. An empty word yields an empty list.
    """
    nikud_arr = []
    for char in word:
        if _NIKUD_RE.match(char):
            # Attach the mark to the most recent base character, if any
            if nikud_arr:
                nikud_arr[-1] += char
        else:
            # A new base character starts with no marks
            nikud_arr.append('')
    return nikud_arr

def add_nikud(word, nikud):
    """Rebuild a pointed word by following each character of `word` with its entry from `nikud`."""
    pieces = [word[idx] + nikud[idx] for idx in range(len(word))]
    return ''.join(pieces)

# Split data to sentences and save to CSV

In [7]:
# Load one sample file and preview how it breaks into sentences on newlines / periods
sample_file_path = os.path.join(
    texts_path, 'afikeiyam1__nikud_meteg', 'afikeiyam1-002__nikud_meteg.txt'
)
text = read_txt_file(sample_file_path)
text_after_split = re.split(r'\n|\.', text)
print(text_after_split)
print(text)

['פֶּתַח דָּבָר  יִתְבָּרֵךְ הַבּוֹרֵא וְיִשְׁתַּבַּח הַיּוֹצֵר אֲשֶׁר מֵעוּדַי גָּבַר עָלַי חַסְדּוֹ לָשׂוּם חֶלְקִי בֵּין חוֹבְשֵׁי ביהמ"ד לְהִתְחַמֵּם כְּנֶגֶד אוּרָן שֶׁל חֲכָמִים', ' וְלֵיֽהָנוֹת מִזִּיו הַתּוֹרָה', ' וַאֲשֶׁר לֹא עָזַב חַסְדּוֹ זֶה מִמֶּנִּי עַד הַיּוֹם', ' וְהִיא שֶׁעָמְדָה לִי לְחַדֵּשׁ חִיֽדּוּשִׁים בְּעִנְיָנִים שׁוֹנִים בְּשַׁ"ס וּפוֹסְקִים', " כְּיָד ה' הַטּוֹבָה עָלַי", ' וָתָ"ל כִּי בְּכׇל מָקוֹם שָׂמְתִּי לִי לְמַטָּרָה', ' לְבָרֵר דִּין וַהֲלָכָה וַחֲקִירָה חֲדָשָׁה', ' אוֹ לַעֲמֹוֽד עַל בֵּיֽרוּר ד\' הָרִאשׁוֹנִים זַ"ל', ' וְלֹא לְפַלְפֵּל בְּפִלְפּוּל שֶׁל הֶבֶל לְאֵין קֵץ וְתַכְלִית', ' כַּאֲשֶׁר עֵינֶיךָ הַקּוֹרֵא תֶּחֱזֶינָה מֵישָׁרִים', ' וּמַה מְאֹד יָגֵל לִבִּי וְתַעֲלֹוֽזְנָה כִּלְיוֹתַי בְּצֵאתָם עַתָּה לְאוֹר עוֹלָם', ' וְיוּחֲקוּ בְּסֵפֶר מִלֵּי', ' זֶה הַיּוֹם שֶׁקִּוִּיתִי', ' כִּי יָפוּצוּ מֵעֵינוֹתַי הַחוּצָה', ' כִּי אׇמְנָם ת"ל הַרְבֵּה מִדְּבָרַי הי\' לְמַרְאֵה עֵינַי גְּדוֹלֵי יִשְׂרָאֵל יחי\' אֲשֶׁר נִשֵּׂאתִי וְנָתַתִּי עִמָּהֶם בּ

## Create a unified csv of all sentences

In [8]:
import csv

# Header row of the unified CSV: one row per sentence, in this column order
columns = ['text_with_nikud', 'text_without_nikud', 'nikud', 'author', 'file_name', 'sentence_num']
# The block below builds data/full_data.csv and is commented out so that the
# file is not regenerated on every run.
# NOTE(review): when re-enabling it, consider opening the file with newline=''
# as the csv module documentation recommends; without it, platforms that
# translate '\n' to '\r\n' can end up with extra blank rows.
# NOTE(review): the `nikud` value written is a Python list, so the CSV will
# presumably hold its text form; the loading cell parses it back - confirm.
# with open('data/full_data.csv', 'w', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(columns)
#     for author in tqdm(author_files_dict):
#         for file in author_files_dict[author]:
#             text = read_txt_file(os.path.join(texts_path, author, file))
#             # split the text into sentences by \n or .
#             sentences = re.split(r'\n|\.', text)
#             for i, sentence in enumerate(sentences):
#                 sentence_words = sentence.split()
#                 # remove the nikud from the sentence
#                 sentence_without_nikud = remove_nikud(sentence)
#                 # get the nikud from the sentence
#                 nikud = list(map(get_nikud, sentence_words))

#                 # add the sentence to the dataframe
#                 writer.writerow([sentence, sentence_without_nikud, nikud, author, file, i])

In [41]:
import ast

# dataframe of the CSV with chunksize of 1000
# data_df_chunks = pd.read_csv('data/full_data.csv', chunksize=1000)

# Read only the first 100000 rows of the CSV; this subset is used for now.
# The 'nikud' column is parsed from its text form back into nested lists with
# ast.literal_eval, which accepts only Python literals. Plain eval would
# execute any expression found in the file.
data_df = pd.read_csv('data/full_data.csv', nrows=100000, converters={'nikud': ast.literal_eval})
print(data_df.head())

                                     text_with_nikud   
0  פֶּתַח דָּבָר  יִתְבָּרֵךְ הַבּוֹרֵא וְיִשְׁתּ...  \
1                     וְלֵיֽהָנוֹת מִזִּיו הַתּוֹרָה   
2   וַאֲשֶׁר לֹא עָזַב חַסְדּוֹ זֶה מִמֶּנִּי עַד...   
3   וְהִיא שֶׁעָמְדָה לִי לְחַדֵּשׁ חִיֽדּוּשִׁים...   
4                          כְּיָד ה' הַטּוֹבָה עָלַי   

                                  text_without_nikud   
0  פתח דבר  יתברך הבורא וישתבח היוצר אשר מעודי גב...  \
1                                 וליהנות מזיו התורה   
2                   ואשר לא עזב חסדו זה ממני עד היום   
3   והיא שעמדה לי לחדש חידושים בענינים שונים בש"ס...   
4                                   כיד ה' הטובה עלי   

                                               nikud                   author   
0  [[ֶּ, ַ, ], [ָּ, ָ, ], [ִ, ְ, ָּ, ֵ, ְ], [ַ, ּ...  afikeiyam1__nikud_meteg  \
1  [[ְ, ֵ, ֽ, ָ, , ֹ, ], [ִ, ִּ, , ], [ַ, ּ, ֹ, ָ...  afikeiyam1__nikud_meteg   
2  [[ַ, ֲ, ֶׁ, ], [ֹ, ], [ָ, ַ, ], [ַ, ְ, ּ, ֹ], ...  afikeiyam1__nikud_meteg   
3 

In [42]:
# Build the two lookup tables between nikud labels and consecutive integer ids,
# assigning ids in order of first appearance in the data.
label_to_id = {}
id_to_label = {}
for sentence_labels in tqdm(data_df['nikud']):
    # each row holds one list per word; walk the words, then their labels
    for word_labels in sentence_labels:
        for label in word_labels:
            # an empty string means the character carries no nikud
            if not label:
                label = "<no_nikud>"
            if label in label_to_id:
                continue
            next_id = len(label_to_id)
            label_to_id[label] = next_id
            id_to_label[next_id] = label

print(label_to_id)



100%|██████████| 100000/100000 [00:04<00:00, 20122.01it/s]

{'ֶּ': 0, 'ַ': 1, '<no_nikud>': 2, 'ָּ': 3, 'ָ': 4, 'ִ': 5, 'ְ': 6, 'ֵ': 7, 'ּ': 8, 'ֹ': 9, 'ְׁ': 10, 'ַּ': 11, 'ֲ': 12, 'ֶׁ': 13, 'ׂ': 14, 'ֶ': 15, 'ֵּ': 16, 'ֵׁ': 17, 'ְּ': 18, 'ֽ': 19, 'ִּ': 20, 'ׁ': 21, 'ִׁ': 22, 'ַׁ': 23, 'ָׂ': 24, 'ָׁ': 25, 'ֱ': 26, 'ְׂ': 27, 'ֵּׂ': 28, 'ָּׂ': 29, 'ּׁ': 30, 'ֳ': 31, 'ֻ': 32, 'ֹּ': 33, 'ֵׂ': 34, 'ִׂ': 35, 'ֶׂ': 36, 'ֵּׁ': 37, 'ַּׁ': 38, 'ְּׁ': 39, 'ֻּ': 40, 'ִּׁ': 41, 'ָּׁ': 42, 'ֻׁ': 43, 'ִּׂ': 44, 'ֹׁ': 45, 'ֶּׁ': 46, 'ַּׂ': 47, 'ַׂ': 48, 'ּׂ': 49, 'ֳּ': 50, 'ְּׂ': 51, 'ֻּׁ': 52, 'ֶּׂ': 53, 'ַָ': 54, 'ֲּ': 55, 'ֹׂ': 56, 'ַָ': 57, 'ֹּׁ': 58, 'ִּֿ': 59, 'ִַ': 60, 'ָֹ': 61, 'ִָ': 62, 'ֹּׂ': 63, 'ֻׂ': 64, 'ֲׁ': 65, 'ִָ': 66, 'ִַ': 67, 'ְִ': 68, 'ֱּׁ': 69, 'ֳׁ': 70, 'ַַ': 71, 'ֱּ': 72, 'ֳּׁ': 73}





#### Load the tokenizer and model
(alephbert-base, with a tokenizer vocabulary restricted to tokens of length <= 1)

In [43]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
# Location of the local tokenizer files. The original machine-specific path is
# kept as the default so existing runs behave the same; set the
# ALEPHBERT_TOKENIZER_PATH environment variable to point somewhere else.
model_path = os.environ.get(
    'ALEPHBERT_TOKENIZER_PATH',
    'C:\\Users\\baruc\\PycharmProjects\\pythonProject\\Punctuation_Restoration\\AlephBERT-main\\AlephBERT-main\\models\\alephbert-base',
)
alephbert_tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): the tokenizer is loaded from the local path while the model is
# loaded by name - confirm the two share the same vocabulary.
alephbert_model = AutoModelForMaskedLM.from_pretrained("onlplab/alephbert-base")

In [44]:
# Round-trip a short sentence through the tokenizer as a sanity check
test = "בדיקה של הדבר הזה"
tokenized = alephbert_tokenizer.tokenize(test)
encoded = alephbert_tokenizer.encode(test)
decoded = alephbert_tokenizer.decode(encoded)

print(test)
for caption, value in (
    ("tokenized: ", tokenized),
    ("encoded: ", encoded),
    ("decoded: ", decoded),
):
    print(caption, value)

בדיקה של הדבר הזה
tokenized:  ['ב', '##ד', '##י', '##ק', '##ה', 'ש', '##ל', 'ה', '##ד', '##ב', '##ר', 'ה', '##ז', '##ה']
encoded:  [2, 177, 1039, 1008, 1013, 1016, 201, 1009, 180, 1039, 1037, 1014, 180, 1075, 1016, 3]
decoded:  [CLS] בדיקה של הדבר הזה [SEP]


# create DataSet class

In [45]:
# create pytorch dataset class for punctuation restoration (returns input(text) and target(nikud))

class PunctuationRestorationDataset(Dataset):
    """
    Dataset that pairs un-pointed text with its per-character nikud label ids.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame read with `.iloc`; must provide the columns
        'text_without_nikud' and 'nikud' (a list of lists of label strings).
    tokenizer :
        Object exposing `encode_plus(...)`; used to encode the text.
    label_to_id : dict
        Maps each label string (and "<no_nikud>") to an integer id.
    max_len : int
        Length every label list is truncated / padded to, and the
        `max_length` passed to the tokenizer.
    """

    def __init__(self, data_df, tokenizer, label_to_id, max_len):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_to_id = label_to_id

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index): # TODO: need to make sure not to look at fraction of words
        """
        Return a dict with keys 'text', 'nikud', 'input_ids', 'attention_mask'.

        'nikud' is a plain list of `max_len` label ids. Positions past the
        real labels hold the padding id `len(self.label_to_id)`.
        'input_ids' and 'attention_mask' are the tokenizer outputs flattened
        to 1-D.

        Raises KeyError when a label is missing from `self.label_to_id`.

        NOTE(review): labels are per character of the words, while the
        encoding is built with add_special_tokens=True, so label positions and
        token positions may not line up - confirm before training.
        NOTE(review): 'nikud' is returned as a list, not a tensor - confirm
        the DataLoader collation produces the batch layout you expect.
        """
        row = self.data.iloc[index]
        text = row['text_without_nikud']
        nikud = row['nikud']  # list of lists of nikud

        # flatten the per-word lists into one per-character list
        nikud = [item for sublist in nikud for item in sublist]

        # replace empty strings with <no_nikud> token
        nikud = [label if label != "" else "<no_nikud>" for label in nikud]
        # replace labels with ids
        nikud = [self.label_to_id[label] for label in nikud]
        # truncate if needed
        nikud = nikud[:self.max_len]
        # pad if needed; the padding id comes from this instance's own mapping
        # rather than from a notebook-level global of the same name
        pad_label_id = len(self.label_to_id)
        nikud = nikud + [pad_label_id] * (self.max_len - len(nikud))
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'nikud': nikud,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


# Build the dataset over the loaded frame and inspect its first sample
dataset = PunctuationRestorationDataset(
    data_df=data_df,
    tokenizer=alephbert_tokenizer,
    label_to_id=label_to_id,
    max_len=100,
)
sample = dataset[0]
print(len(dataset))
print(sample.keys())
# number of characters in the text once spaces are removed
print(len(sample['text'].replace(" ", "")))
print(len(sample['nikud']))
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)




100000
dict_keys(['text', 'nikud', 'input_ids', 'attention_mask'])
88
100
torch.Size([100])
torch.Size([100])


In [58]:
from torch.utils.data import Dataset, DataLoader, random_split

# split dataset to train, val and test
# A seeded generator makes both random splits reproducible across kernel restarts.
split_generator = torch.Generator().manual_seed(42)

# first hold out 10% of the data as the test set ...
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=split_generator)
# ... then carve 20% of the remainder out as the validation set
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size], generator=split_generator)

batch_size = 4

# create dataloader
# Only the training data is shuffled; validation and test batches keep a fixed
# order so evaluation runs are comparable.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Perform EDA on the data

# dataloader and tokenizer

# Define the two models (one with look-ahead, one without)

### train the two models

### Evaluate the models

# Define the dual model class, it will be composed of two models.
#### Whenever the two models disagree, the dual model adds nikud using the look-ahead model's prediction

## Evaluation of the dual model