In [None]:
# pip installs

In [23]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
from torch.nn import Transformer

from sklearn.model_selection import train_test_split
import statistics
from tqdm import tqdm

from zipfile import ZipFile

from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import AutoTokenizer, AutoModelForMaskedLM

import requests
import os
import json

from torch.utils.data import Dataset, DataLoader, random_split
import random

# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


### Define seeds

In [24]:
# define seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Data preprocessing

### here we need to load the data and extract only data with vowels punctuations

In [2]:
# path to the jason file for the dataset
data_json_path = 'data/books.json'

# Root directory where the downloaded files will be saved
texts_path = 'data/texts'


# Create the directory if it does not exist
if not os.path.exists(texts_path):
    os.makedirs(texts_path)


# Load the json dataset
with open(data_json_path, 'r', encoding='utf-8') as f:
    jason_data = json.load(f)

# download the files and save them in a folder

#### remove\add the comment as needed

In [None]:
# # Loop through the json dataset and download the files
# for entry in tqdm(jason_data):
#     try:
#         # Download the Nikud Meteg file
#         if entry['fileName'] + '__nikud_meteg' in os.listdir(texts_path):
#             continue
#         nikud_meteg_url = entry['nikudMetegFileURL']
#         nikud_meteg_local_path = os.path.join(texts_path, entry['fileName'] + '__nikud_meteg.zip')
#         nikud_meteg_response = requests.get(nikud_meteg_url)
#         with open(nikud_meteg_local_path, 'wb') as f:
#             f.write(nikud_meteg_response.content)

#             # Unzip the Nikud Meteg file
#             with ZipFile(nikud_meteg_local_path, 'r') as zipObj:
#                 zipObj.extractall(os.path.join(texts_path, entry['fileName'] + '__nikud_meteg'))
#     except Exception as e:
#         print(f"Error reading file {entry['fileName']}: {e}")
#         continue


# # iterate through the texts folder and delete the zip folders
# for file in tqdm(os.listdir(texts_path)):
#     if file.endswith(".zip"):
#         os.remove(os.path.join(texts_path, file))

            


# Author files

### Create a dictionary whose keys are authors and values are a list containing all it's files

In [3]:
# Define a method to create the author files dictionary
def create_author_files_dict(author_files):
    """
    This function creates a dictionary of author files with a list of their corresponding texts.
    """
    author_files_dict = {}
    for file in author_files:
        author_files_dict[file] = []
        for text_file_name in os.listdir(os.path.join(texts_path, file)):
            if text_file_name.endswith('.txt'):
                author_files_dict[file].append(text_file_name)
    return author_files_dict

author_files = os.listdir(texts_path)
author_files_dict = create_author_files_dict(author_files)

# Functions to clean the data

In [5]:
# Nikud unicode range (https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet)

# Read a txt file from the author files dictionary
def read_txt_file(file_path):
    """
    This function reads a txt file and returns the text as a string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

def remove_nikud(string):
    """Removes the nikud from the given string."""
    nikud = re.compile(r'[\u05B0-\u05C2]')
    return nikud.sub("", string)

def get_nikud(word):
    """Returns the nikud from the given word."""
    nikud = re.compile(r'[\u05B0-\u05C2]')
    current_nikud = ''
    nikud_arr = []
    for i in range(len(word)):
        if i == 0:
            continue
        if nikud.match(word[i]):
            current_nikud += word[i]
        else:
            nikud_arr.append(current_nikud)
            current_nikud = ''
    nikud_arr.append(current_nikud)
    return nikud_arr

def add_nikud(word, nikud):
    """Adds the nikud to the given word."""
    new_word = ''
    for i in range(len(word)):
        new_word += word[i] + nikud[i]
    return new_word

# Split data to sentences and save to CSV

In [None]:
text = read_txt_file(os.path.join(texts_path, 'afikeiyam1__nikud_meteg', 'afikeiyam1-002__nikud_meteg.txt'))
text_after_split = re.split(r'\n|\.', text)
print(text_after_split)
print(text)

## Create a unified csv of all sentences

In [6]:
import csv

columns = ['text_with_nikud', 'text_without_nikud', 'nikud', 'author', 'file_name', 'sentence_num']
# with open('data/full_data.csv', 'w', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(columns)
#     for author in tqdm(author_files_dict):
#         for file in author_files_dict[author]:
#             text = read_txt_file(os.path.join(texts_path, author, file))
#             # split the text into sentences by \n or .
#             sentences = re.split(r'\n|\.', text)
#             for i, sentence in enumerate(sentences):
#                 sentence_words = sentence.split()
#                 # remove the nikud from the sentence
#                 sentence_without_nikud = remove_nikud(sentence)
#                 # get the nikud from the sentence
#                 nikud = list(map(get_nikud, sentence_words))

#                 # add the sentence to the dataframe
#                 writer.writerow([sentence, sentence_without_nikud, nikud, author, file, i])

In [8]:
# dataframe of the CSV with chunksize of 1000
# data_df_chunks = pd.read_csv('data/full_data.csv', chunksize=1000)

# # read only first 100000 rows of the CSV, we will use this for now
# data_df = pd.read_csv('data/full_data.csv', nrows=100000, converters={'nikud': eval})
# print(data_df.head())

# # keep only rows s.t the length of the text (without spaces) is at most 100
# data_df = data_df[data_df['text_without_nikud'].str.replace(' ', '').str.len() <= 100]

# # save the dataframe to a json file
# data_df.to_json('data/short_data_df.json', orient='records', lines=True)





                                     text_with_nikud   
0  פֶּתַח דָּבָר  יִתְבָּרֵךְ הַבּוֹרֵא וְיִשְׁתּ...  \
1                     וְלֵיֽהָנוֹת מִזִּיו הַתּוֹרָה   
2   וַאֲשֶׁר לֹא עָזַב חַסְדּוֹ זֶה מִמֶּנִּי עַד...   
3   וְהִיא שֶׁעָמְדָה לִי לְחַדֵּשׁ חִיֽדּוּשִׁים...   
4                          כְּיָד ה' הַטּוֹבָה עָלַי   

                                  text_without_nikud   
0  פתח דבר  יתברך הבורא וישתבח היוצר אשר מעודי גב...  \
1                                 וליהנות מזיו התורה   
2                   ואשר לא עזב חסדו זה ממני עד היום   
3   והיא שעמדה לי לחדש חידושים בענינים שונים בש"ס...   
4                                   כיד ה' הטובה עלי   

                                               nikud                   author   
0  [[ֶּ, ַ, ], [ָּ, ָ, ], [ִ, ְ, ָּ, ֵ, ְ], [ַ, ּ...  afikeiyam1__nikud_meteg  \
1  [[ְ, ֵ, ֽ, ָ, , ֹ, ], [ִ, ִּ, , ], [ַ, ּ, ֹ, ָ...  afikeiyam1__nikud_meteg   
2  [[ַ, ֲ, ֶׁ, ], [ֹ, ], [ָ, ַ, ], [ַ, ְ, ּ, ֹ], ...  afikeiyam1__nikud_meteg   
3 

In [9]:
# read the json file
data_df = pd.read_json('data/short_data_df.json', orient='records', lines=True)

print(data_df.shape)

(57492, 6)


In [10]:
label_to_id = {}
id_to_label = {}
for label_list in tqdm(data_df['nikud']):
    # flatten the list
    label_list = [item for sublist in label_list for item in sublist]
    for label in label_list:
        if len(label) == 0:
            label = "<no_nikud>"
        if label not in label_to_id:
            label_to_id[label] = len(label_to_id)
            id_to_label[len(id_to_label)] = label

print(label_to_id)



100%|██████████| 57492/57492 [00:00<00:00, 110615.56it/s]

{'ֶּ': 0, 'ַ': 1, '<no_nikud>': 2, 'ָּ': 3, 'ָ': 4, 'ִ': 5, 'ְ': 6, 'ֵ': 7, 'ּ': 8, 'ֹ': 9, 'ְׁ': 10, 'ַּ': 11, 'ֲ': 12, 'ֶׁ': 13, 'ׂ': 14, 'ֶ': 15, 'ֵּ': 16, 'ֵׁ': 17, 'ְּ': 18, 'ֽ': 19, 'ִּ': 20, 'ׁ': 21, 'ִׁ': 22, 'ַׁ': 23, 'ָׂ': 24, 'ָׁ': 25, 'ֱ': 26, 'ְׂ': 27, 'ֵּׂ': 28, 'ָּׂ': 29, 'ּׁ': 30, 'ֳ': 31, 'ֻ': 32, 'ֹּ': 33, 'ִׂ': 34, 'ֵׂ': 35, 'ַּׁ': 36, 'ֵּׁ': 37, 'ְּׁ': 38, 'ֻּ': 39, 'ִּׁ': 40, 'ֶׂ': 41, 'ָּׁ': 42, 'ֻׁ': 43, 'ֹׁ': 44, 'ֶּׁ': 45, 'ַּׂ': 46, 'ּׂ': 47, 'ֳּ': 48, 'ַׂ': 49, 'ְּׂ': 50, 'ִּׂ': 51, 'ֻּׁ': 52, 'ֶּׂ': 53, 'ַָ': 54, 'ֲּ': 55, 'ֹּׁ': 56, 'ַָ': 57, 'ֹּׂ': 58, 'ֹׂ': 59, 'ִַ': 60, 'ֲׁ': 61}





#### download tokenizer and model
(alephbert-base, with vocab of words with len <= 1)

In [11]:
model_path = 'C:\\Users\\baruc\\PycharmProjects\\pythonProject\\Punctuation_Restoration\\AlephBERT-main\\AlephBERT-main\\models\\alephbert-base'
alephbert_tokenizer = AutoTokenizer.from_pretrained(model_path)
alephbert_model = AutoModelForMaskedLM.from_pretrained("onlplab/alephbert-base")

In [12]:
# test the tokenization and detokenization
test = "בדיקה של הדבר הזה"
tokenized = alephbert_tokenizer.tokenize(test)
encoded = alephbert_tokenizer.encode(test)
decoded = alephbert_tokenizer.decode(encoded)
print(test)
print("tokenized: ", tokenized)
print("encoded: ", encoded)
print("decoded: ", decoded)

בדיקה של הדבר הזה
tokenized:  ['ב', '##ד', '##י', '##ק', '##ה', 'ש', '##ל', 'ה', '##ד', '##ב', '##ר', 'ה', '##ז', '##ה']
encoded:  [2, 177, 1039, 1008, 1013, 1016, 201, 1009, 180, 1039, 1037, 1014, 180, 1075, 1016, 3]
decoded:  [CLS] בדיקה של הדבר הזה [SEP]


# create DataSet class

In [35]:
# create pytorch dataset class for punctuation restoration (returns input(text) and target(nikud))

class PunctuationRestorationDataset(Dataset):
    def __init__(self, data_df, tokenizer, label_to_id, max_len):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_to_id = label_to_id 
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index): # TODO: need to make sure not to look at fraction of words
        text = self.data.iloc[index]['text_without_nikud']
        nikud = self.data.iloc[index]['nikud']  # list of lists of nikud

        # flatten nikud list
        nikud = [item for sublist in nikud for item in sublist]

        # replace empty strings with <no_nikud> token
        nikud = [label if label != "" else "<no_nikud>" for label in nikud]
        # replace labels with ids
        nikud = [self.label_to_id[label] for label in nikud]

        # truncate if needed
        #nikud = nikud[:self.max_len] #TODO: check if correct (should be the same as text)

        # pad if needed
        nikud = nikud + [len(label_to_id)] * (self.max_len - len(nikud))

        # convert to tensor
        nikud = torch.tensor(nikud, dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'nikud': nikud,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


# create pytorch dataset class for punctuation restoration (returns input(text) and target(nikud))
dataset = PunctuationRestorationDataset(data_df, alephbert_tokenizer, label_to_id, 100)
sample = dataset[0]
print(len(dataset))
print(sample.keys())
# remove spaces from text
print(sample['text'].replace(" ", ""))
print(len(sample['text'].replace(" ", "")))
print((sample['nikud']))
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)




57492
dict_keys(['text', 'nikud', 'input_ids', 'attention_mask'])
פתחדבריתברךהבוראוישתבחהיוצראשרמעודיגברעליחסדולשוםחלקיביןחובשיביהמ"דלהתחמםכנגדאורןשלחכמים
88
tensor([ 0,  1,  2,  3,  4,  2,  5,  6,  3,  7,  6,  1,  8,  9,  7,  2,  6,  5,
        10, 11, 11,  2,  1,  8,  9,  7,  2, 12, 13,  2,  7,  2,  8,  1,  2,  3,
         1,  2,  4,  1,  2,  1,  6,  8,  9,  4, 14,  8,  2, 15,  6,  5,  2, 16,
         2,  2,  2,  9,  6, 17,  2,  2,  2,  2,  2,  2,  2,  6,  5,  6,  1, 16,
         2, 18, 15, 15,  2,  2,  8,  4,  2, 13,  2, 12,  4,  5,  2,  2, 62, 62,
        62, 62, 62, 62, 62, 62, 62, 62, 62, 62])
torch.Size([100])
torch.Size([100])


In [36]:
# split dataset to train, val and test
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

batch_size = 32

# create dataloader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)



# performe EDAs on the data

In [37]:
# EDA on train, val and test dataset
print("train dataset size: ", len(train_dataset))
print("val dataset size: ", len(val_dataset))
print("test dataset size: ", len(test_dataset))

train dataset size:  41393
val dataset size:  10349
test dataset size:  5750


# Define the two models (one with look-ahead, one without)

### train the two models

### Evaluate the models

In [40]:
# define a charachter level transformer model with the following architecture:
# 1. Embedding layer
# 2. Positional encoding layer
# 3. N Transformer blocks
# 4. Linear layer to predict punctuation



class FullSentenceModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=512, nhead=8, num_layers=6, output_dim=len(label_to_id)):
        super(FullSentenceModel, self).__init__()
        
        # Non-pretrained embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # 6-layer transformer with 8 attention heads
        # self.transformer = Transformer(
        #     d_model=embedding_dim,
        #     nhead=nhead,
        #     num_encoder_layers=num_layers,
        #     num_decoder_layers=num_layers
        # )
        
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead),
            num_layers=num_layers
        )
        
        # Fully connected layer
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, sentence):
        # Pass input through embedding layer
        characters_embeddings = self.embedding(sentence)
        
        # Pass through transformer
        characters_after_transformer_layers = self.transformer_encoder(characters_embeddings)
        
        # Pass through fully connected layer
        predictions = self.fc(characters_after_transformer_layers)
        
        return predictions




In [41]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=len(label_to_id)).to(device)

vocab_size = len(alephbert_tokenizer.vocab)

model = FullSentenceModel(vocab_size).to(device)

# define the training loop
def train(model, dataloader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['nikud'].to(device)
        predictions = model.forward(input_ids)
        predictions = predictions.view(-1, predictions.shape[-1])
        labels = labels.view(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# define the evaluation loop
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            labels = batch['nikud'].to(device)
            predictions = model.forward(input_ids)
            predictions = predictions.view(-1, predictions.shape[-1])
            labels = labels.view(-1)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# define the training and evaluation loop
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, epochs=10):
    for epoch in range(epochs):
        train_loss = train(model, train_dataloader, optimizer, criterion)
        val_loss = evaluate(model, val_dataloader, criterion)
        print(f'Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# train the model
train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, epochs=10)




100%|██████████| 1294/1294 [01:55<00:00, 11.17it/s]
100%|██████████| 324/324 [00:11<00:00, 28.50it/s]


Epoch: 1, Train Loss: 4.1605, Val Loss: 4.1568


100%|██████████| 1294/1294 [01:52<00:00, 11.48it/s]
100%|██████████| 324/324 [00:11<00:00, 29.08it/s]


Epoch: 2, Train Loss: 4.1606, Val Loss: 4.1576


100%|██████████| 1294/1294 [01:53<00:00, 11.45it/s]
100%|██████████| 324/324 [00:11<00:00, 29.11it/s]


Epoch: 3, Train Loss: 4.1607, Val Loss: 4.1565


100%|██████████| 1294/1294 [01:53<00:00, 11.41it/s]
100%|██████████| 324/324 [00:11<00:00, 29.33it/s]


Epoch: 4, Train Loss: 4.1608, Val Loss: 4.1572


100%|██████████| 1294/1294 [01:53<00:00, 11.41it/s]
100%|██████████| 324/324 [00:11<00:00, 29.43it/s]


Epoch: 5, Train Loss: 4.1606, Val Loss: 4.1566


100%|██████████| 1294/1294 [01:52<00:00, 11.48it/s]
100%|██████████| 324/324 [00:10<00:00, 29.73it/s]


Epoch: 6, Train Loss: 4.1605, Val Loss: 4.1567


100%|██████████| 1294/1294 [01:52<00:00, 11.52it/s]
100%|██████████| 324/324 [00:11<00:00, 28.81it/s]


Epoch: 7, Train Loss: 4.1604, Val Loss: 4.1572


100%|██████████| 1294/1294 [01:55<00:00, 11.24it/s]
100%|██████████| 324/324 [00:11<00:00, 29.27it/s]


Epoch: 8, Train Loss: 4.1605, Val Loss: 4.1569


100%|██████████| 1294/1294 [01:52<00:00, 11.49it/s]
100%|██████████| 324/324 [00:11<00:00, 29.28it/s]


Epoch: 9, Train Loss: 4.1606, Val Loss: 4.1570


 12%|█▏        | 156/1294 [00:13<01:38, 11.50it/s]

# Define the dual model class, it will be composed of two models.
#### whenever there is a disagreement between the two models, the model will add nikud using the lookahead model

## Evaluation of the dual model