In [None]:
# pip installs

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
from torch.nn import Transformer

from sklearn.model_selection import train_test_split
import statistics
from tqdm import tqdm

from zipfile import ZipFile

from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import AutoTokenizer, AutoModelForMaskedLM

import requests
import os
import json

from torch.utils.data import Dataset, DataLoader, random_split
import random

# set device: use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# `device` is read by the training / evaluation cells further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

### Define seeds

In [None]:
# Fix one seed and hand it to every random number generator used in this notebook
# (PyTorch, NumPy and Python's `random`) so that runs are repeatable.
seed = 42
for seed_everything in (torch.manual_seed, np.random.seed, random.seed):
    seed_everything(seed)

# Data preprocessing

### Load the data and keep only the texts that include vowel marks (nikud)

In [None]:
# Path to the JSON file that describes the dataset
data_json_path = 'data/books.json'

# Root directory where the downloaded files will be saved
texts_path = 'data/texts'


# Create the directory (including missing parents) if it does not exist.
# `exist_ok=True` replaces the separate exists-check, so there is no window
# between checking and creating in which the call could fail.
os.makedirs(texts_path, exist_ok=True)


# Load the JSON dataset description.
# NOTE(review): the (commented-out) download cell reads 'fileName' and
# 'nikudMetegFileURL' from each entry - confirm against the actual file.
with open(data_json_path, 'r', encoding='utf-8') as f:
    jason_data = json.load(f)

# download the files and save them in a folder

#### remove\add the comment as needed

In [None]:
# # Loop through the json dataset and download the files
# for entry in tqdm(jason_data):
#     try:
#         # Download the Nikud Meteg file
#         if entry['fileName'] + '__nikud_meteg' in os.listdir(texts_path):
#             continue
#         nikud_meteg_url = entry['nikudMetegFileURL']
#         nikud_meteg_local_path = os.path.join(texts_path, entry['fileName'] + '__nikud_meteg.zip')
#         nikud_meteg_response = requests.get(nikud_meteg_url)
#         with open(nikud_meteg_local_path, 'wb') as f:
#             f.write(nikud_meteg_response.content)

#             # Unzip the Nikud Meteg file
#             with ZipFile(nikud_meteg_local_path, 'r') as zipObj:
#                 zipObj.extractall(os.path.join(texts_path, entry['fileName'] + '__nikud_meteg'))
#     except Exception as e:
#         print(f"Error reading file {entry['fileName']}: {e}")
#         continue


# # iterate through the texts folder and delete the zip folders
# for file in tqdm(os.listdir(texts_path)):
#     if file.endswith(".zip"):
#         os.remove(os.path.join(texts_path, file))

# Author files

### Create a dictionary whose keys are authors and whose values are lists of all of that author's files

In [None]:
# Define a method to create the author files dictionary
def create_author_files_dict(author_files, texts_root=None):
    """
    Map each author directory to the list of `.txt` files it contains.

    Args:
        author_files: iterable of entry names located directly under the texts root.
        texts_root: directory that contains the author directories. Defaults to the
            notebook-level `texts_path`.

    Returns:
        dict: `{author_directory_name: [txt_file_name, ...]}` with file names sorted,
        so the result does not depend on the order the filesystem lists them in.

    Entries that are not directories (for example a leftover `.zip` archive) are
    skipped instead of raising `NotADirectoryError`.
    """
    if texts_root is None:
        texts_root = texts_path

    author_files_dict = {}
    for author_dir in author_files:
        author_path = os.path.join(texts_root, author_dir)
        if not os.path.isdir(author_path):
            # Not an extracted author folder - nothing to list.
            continue
        author_files_dict[author_dir] = sorted(
            name for name in os.listdir(author_path) if name.endswith('.txt')
        )
    return author_files_dict

# Every entry directly under `texts_path` is passed on as an author folder name.
author_files = os.listdir(texts_path)
# author folder name -> list of the .txt file names inside it
author_files_dict = create_author_files_dict(author_files)

# Functions to clean the data

In [None]:
# Nikud unicode range (https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet)

# Read a txt file from the author files dictionary
def read_txt_file(file_path):
    """
    Open `file_path` as UTF-8 text and return its entire content as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return source.read()

def remove_nikud(string):
    """Return `string` with every character in the range U+05B0..U+05C2 deleted."""
    return re.sub(r'[\u05B0-\u05C2]', "", string)

def get_nikud(word):
    """
    Return the nikud attached to each base character of `word`.

    The result has exactly one entry per character that is NOT in the nikud range
    (U+05B0..U+05C2), in order; each entry is the (possibly empty) string of nikud
    marks that directly follow that character. The list is therefore aligned with
    `remove_nikud(word)` and can be fed straight into `add_nikud`.

    Edge cases:
        * an empty `word` yields `[]` (previously `['']`, one entry too many);
        * marks that appear before any base character have nothing to attach to
          and are dropped, without shifting the following entries.
    """
    nikud = re.compile(r'[\u05B0-\u05C2]')
    nikud_arr = []
    for char in word:
        if nikud.match(char):
            # Attach the mark to the most recent base character, if there is one.
            if nikud_arr:
                nikud_arr[-1] += char
        else:
            # A new base character starts with no marks.
            nikud_arr.append('')
    return nikud_arr

def add_nikud(word, nikud):
    """
    Interleave `word` with `nikud`: character i of `word` is followed by `nikud[i]`.

    Entries of `nikud` beyond `len(word)` are ignored; a `nikud` shorter than `word`
    raises `IndexError`.
    """
    return ''.join(word[position] + nikud[position] for position in range(len(word)))

### Test nikud functions

In [None]:
# Round-trip demo for the nikud helpers: strip the nikud from a text, extract it
# separately, then re-attach it and print all versions for visual comparison.
text = read_txt_file(os.path.join(texts_path, 'afikeiyam1__nikud_meteg', 'afikeiyam1-002__nikud_meteg.txt'))
# take just the first 100 characters (the whole slice, spaces and newlines included, is treated as one "word")
text = text[:100]
text_no_nikud = remove_nikud(text)
text_nikud = get_nikud(text)
text_with_nikud = add_nikud(text_no_nikud, text_nikud)
print("original text:\n", text)
print("text with added nikud:\n", text_with_nikud)
print("text without nikud:\n", text_no_nikud)
print("nikud array:\n", text_nikud)

## Create a unified csv of all sentences

In [None]:
import csv

# Header row of data/full_data.csv (written by the commented-out export below).
columns = ['text_with_nikud', 'text_without_nikud', 'nikud', 'author', 'file_name', 'sentence_num']
# with open('data/full_data.csv', 'w', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(columns)
#     for author in tqdm(author_files_dict):
#         for file in author_files_dict[author]:
#             text = read_txt_file(os.path.join(texts_path, author, file))
#             # split the text into sentences by \n or .
#             sentences = re.split(r'\n|\.', text)
#             for i, sentence in enumerate(sentences):
#                 sentence_words = sentence.split()
#                 # remove the nikud from the sentence
#                 sentence_without_nikud = remove_nikud(sentence)
#                 # get the nikud from the sentence
#                 nikud = list(map(get_nikud, sentence_words))

#                 # add the sentence to the dataframe
#                 writer.writerow([sentence, sentence_without_nikud, nikud, author, file, i])

#### Load the data from the CSV in chunks and save the first chunk as JSON (the short dataset)

In [None]:
# dataframe of the CSV with chunksize of 1000
# data_df_chunks = pd.read_csv('data/full_data.csv', chunksize=1000)

# # read only first 100000 rows of the CSV, we will use this for now
# data_df = pd.read_csv('data/full_data.csv', nrows=100000, converters={'nikud': eval})
# print(data_df.head())

# # keep only rows s.t the length of the text (without spaces) is at most 100
# data_df = data_df[data_df['text_without_nikud'].str.replace(' ', '').str.len() <= 100]

# # save the dataframe to a json file
# data_df.to_json('data/short_data_df.json', orient='records', lines=True)

##### read the json short data

In [None]:
# Read the short dataset back from its JSON-lines file (one record per line).
data_df = pd.read_json('data/short_data_df.json', orient='records', lines=True)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
print(data_df.shape)
print(data_df.columns)

#### Define dictionary label_to_id and id_to_label


In [None]:
# Build the label vocabulary in both directions. Ids are handed out in order of
# first appearance; an empty nikud string is represented by the "<no_nikud>" label.
label_to_id = {}
id_to_label = {}
for sentence_nikud in tqdm(data_df['nikud']):
    # each row holds one list of nikud strings per word
    for word_nikud in sentence_nikud:
        for mark in word_nikud:
            label = mark if len(mark) != 0 else "<no_nikud>"
            if label in label_to_id:
                continue
            next_id = len(label_to_id)
            label_to_id[label] = next_id
            id_to_label[next_id] = label

print(label_to_id)



#### count labels for label_weights

In [None]:
# Tally how often every label occurs (empty nikud is counted as "<no_nikud>").
label_count = {}
for sentence_nikud in tqdm(data_df['nikud']):
    for word_nikud in sentence_nikud:
        for mark in word_nikud:
            label = "<no_nikud>" if len(mark) == 0 else mark
            label_count[label] = label_count.get(label, 0) + 1

print(label_count)

# Bar chart of the label frequencies, most frequent first
sorted_labels = sorted(label_count.items(), key=lambda pair: pair[1], reverse=True)
print(sorted_labels)
plt.figure(figsize=(8, 4))
plt.xticks(rotation=90)
plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Label Counts')
plt.bar([name for name, _ in sorted_labels], [count for _, count in sorted_labels])
plt.show()

# Inverse-frequency weight per label ...
label_weights = {label: 1 / count for label, count in label_count.items()}

# ... rescaled so that all weights sum to 1
sum_weights = sum(label_weights.values())
label_weights = {label: weight / sum_weights for label, weight in label_weights.items()}

print(label_weights)


#### download tokenizer and model
(alephbert-base, with vocab of words with len <= 1)

In [None]:
# Location of the local AlephBERT tokenizer files. The path can be overridden with
# the ALEPHBERT_MODEL_PATH environment variable so the notebook is not tied to one
# machine; the default is the original author's local checkout.
model_path = os.environ.get(
    'ALEPHBERT_MODEL_PATH',
    'C:\\Users\\baruc\\PycharmProjects\\pythonProject\\Punctuation_Restoration\\AlephBERT-main\\AlephBERT-main\\models\\alephbert-base',
)
# The tokenizer comes from the local path above; the model weights are loaded by name.
alephbert_tokenizer = AutoTokenizer.from_pretrained(model_path)
alephbert_model = AutoModelForMaskedLM.from_pretrained("onlplab/alephbert-base")

In [None]:
# Smoke test of the tokenizer: tokenize, encode and decode one short sentence
# and print every stage next to the input.
test = "בדיקה של הדבר הזה"
tokenized = alephbert_tokenizer.tokenize(test)
encoded = alephbert_tokenizer.encode(test)
decoded = alephbert_tokenizer.decode(encoded)
print(test)
for caption, value in (("tokenized: ", tokenized), ("encoded: ", encoded), ("decoded: ", decoded)):
    print(caption, value)

# create DataSet class

In [None]:
# create pytorch dataset class for punctuation restoration (returns input(text) and target(nikud))

class PunctuationRestorationDataset(Dataset):
    """
    Dataset that pairs a nikud-free sentence (model input) with its nikud labels (target).

    Args:
        data_df: DataFrame with a 'text_without_nikud' column (str) and a 'nikud'
            column (list of per-word lists of nikud strings).
        tokenizer: object exposing `encode_plus(...)` as called in `__getitem__`.
        label_to_id: dict mapping each nikud label (including "<no_nikud>") to an id.
        max_len: fixed length of both the label vector and the token vectors.

    Each item is a dict with keys 'text', 'nikud', 'input_ids' and 'attention_mask'.
    Label positions beyond the real labels are filled with `len(label_to_id)`, which
    is the id the training loss is expected to ignore.
    """

    def __init__(self, data_df, tokenizer, label_to_id, max_len):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_to_id = label_to_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index): # TODO: need to make sure not to look at fraction of words
        row = self.data.iloc[index]
        text = row['text_without_nikud']
        nikud = row['nikud']  # list of lists of nikud

        # flatten, map the empty string to <no_nikud>, and convert labels to ids
        nikud = [
            self.label_to_id[label if label != "" else "<no_nikud>"]
            for word_nikud in nikud
            for label in word_nikud
        ]

        # Use the mapping stored on the instance (not a notebook global) for the pad id,
        # and clip over-long label lists so every item has exactly `max_len` labels and
        # items can be stacked into a batch.
        pad_id = len(self.label_to_id)
        nikud = nikud[:self.max_len]
        nikud = nikud + [pad_id] * (self.max_len - len(nikud))

        # convert to tensor
        nikud = torch.tensor(nikud, dtype=torch.long)

        # NOTE(review): `add_special_tokens=True` presumably prepends/appends special
        # tokens to `input_ids`, while `nikud` starts at the first character - so label i
        # may not line up with token i. Verify the alignment against the tokenizer.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'nikud': nikud,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


# Instantiate the dataset (sequences fixed to 100 positions) and inspect its first item.
dataset = PunctuationRestorationDataset(data_df, alephbert_tokenizer, label_to_id, 100)
sample = dataset[0]
print(len(dataset))
print(sample.keys())
# the sentence with its spaces removed, followed by its length
text_without_spaces = sample['text'].replace(" ", "")
print(text_without_spaces)
print(len(text_without_spaces))
print((sample['nikud']))
for tensor_key in ('input_ids', 'attention_mask'):
    print(sample[tensor_key].shape)




#### split to train,val and test datasets and dataloaders

In [None]:
# Split the dataset into train / val / test: 10% is held out for testing and the
# remaining 90% is divided 80/20 into train and validation.
# A dedicated, seeded generator makes the split identical every time this cell runs,
# independent of how much randomness earlier cells have consumed.
split_generator = torch.Generator().manual_seed(seed)

train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=split_generator)
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size], generator=split_generator)

batch_size = 32

# Create the dataloaders. Only the training data is shuffled; validation and test
# batches are served in a fixed order so repeated evaluations see the same batches.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



# Perform EDA on the data

In [None]:
# Report how many items ended up in each split
for caption, split in (
    ("train dataset size: ", train_dataset),
    ("val dataset size: ", val_dataset),
    ("test dataset size: ", test_dataset),
):
    print(caption, len(split))

# Show the first training item as an example
print("train dataset")
print(train_dataset[0])

# Define the two models (one with look-ahead, one without)

### train the two models

### Evaluate the models

## Full-Sentence Model

In [None]:
# Define a character-level transformer model with the following architecture:
# 1. Embedding layer
# 2. Transformer layer
# 3. Fully connected layer

# Location of the local AlephBERT tokenizer files. The path can be overridden with
# the ALEPHBERT_MODEL_PATH environment variable so the notebook is not tied to one
# machine; the default is the original author's local checkout.
model_path = os.environ.get(
    'ALEPHBERT_MODEL_PATH',
    'C:\\Users\\baruc\\PycharmProjects\\pythonProject\\Punctuation_Restoration\\AlephBERT-main\\AlephBERT-main\\models\\alephbert-base',
)
alephbert_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Token -> id mapping of the tokenizer
alephbert_vocab = alephbert_tokenizer.get_vocab()

# Vocabulary size reported by the tokenizer and the embedding dimension
new_vocab_size = alephbert_tokenizer.vocab_size
embedding_dim = 512

class FullSentenceModel(nn.Module):
    """
    Sequence tagger: token embedding -> nn.Transformer -> linear classifier per position.

    Args:
        vocab_size: number of rows in the token embedding table.
        embedding_dim: model width (`d_model`); must be divisible by `nhead`.
        nhead: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.
        output_dim: number of classes per position. `None` (default) resolves to
            `len(label_to_id)` when the model is constructed, rather than being
            frozen when the class is defined.
        dropout: dropout probability inside the transformer.
        max_len: longest sequence the learned position table supports.

    Input to `forward`: integer ids of shape (batch, seq_len).
    Output: scores of shape (batch, seq_len, output_dim).
    """

    def __init__(self, vocab_size, embedding_dim=512, nhead=8, num_layers=6, output_dim=None, dropout=0.1, max_len=512):
        super(FullSentenceModel, self).__init__()

        if output_dim is None:
            output_dim = len(label_to_id)

        # Non-pretrained embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Learned position embeddings: attention alone is order-agnostic, so without
        # them the model cannot tell where in the sentence a character sits.
        self.position_embedding = nn.Embedding(max_len, embedding_dim)

        # batch_first=True: inputs are (batch, seq, dim). With the default
        # (batch_first=False) the first axis is treated as the sequence axis, i.e.
        # attention would run across the sentences of a batch instead of within one.
        self.transformer = nn.Transformer(
            d_model=embedding_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dropout=dropout,
            activation='relu',
            batch_first=True,
        )

        # Single linear classification head. (The original also built a two-layer head
        # and immediately overwrote it with this one; the dead assignment is removed.)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, sentence):
        # Token embedding plus the embedding of each position 0..seq_len-1
        positions = torch.arange(sentence.shape[1], device=sentence.device)
        characters_embeddings = self.embedding(sentence) + self.position_embedding(positions)

        # The same embeddings are used as encoder input and as decoder input.
        # NOTE(review): padded positions are not masked here - consider passing
        # key-padding masks built from the attention mask.
        characters_after_transformer_layers = self.transformer(characters_embeddings, characters_embeddings)

        # Per-position class scores
        predictions = self.fc(characters_after_transformer_layers)

        return predictions
    


In [None]:
from torch.optim.lr_scheduler import StepLR

# Number of entries in the tokenizer vocabulary; sizes the model's embedding table.
vocab_size = len(alephbert_tokenizer.vocab)

#scheduler = StepLR(optimizer, step_size=1, gamma=0.9)


# Cross-entropy over the nikud labels. Targets equal to len(label_to_id) - the value
# the dataset pads label vectors with - are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=len(label_to_id)).to(device)
#weights = torch.tensor(list(label_weights.values())).to(device)
#criterion = nn.CrossEntropyLoss(ignore_index=len(label_to_id), weight=weights).to(device)

# Per-batch training losses; appended to by `train` and plotted after every epoch.
loss_history = []

# define the training loop
def train(model, dataloader, optimizer, criterion):
    """
    Train `model` for one pass over `dataloader`.

    Each batch is expected to be a dict with 'input_ids' and 'nikud' tensors. Both are
    moved to the notebook-level `device`; predictions and labels are flattened so that
    the criterion sees one row per position. Every batch loss is also appended to the
    notebook-level `loss_history`.

    Returns:
        float: mean batch loss, or NaN when `dataloader` yields no batches.
    """
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['nikud'].to(device)
        # Call the module itself rather than `.forward` so registered hooks run.
        predictions = model(input_ids)
        predictions = predictions.reshape(-1, predictions.shape[-1])
        labels = labels.reshape(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        # Read the scalar once and reuse it for both bookkeeping targets.
        batch_loss = loss.item()
        loss_history.append(batch_loss)
        epoch_loss += batch_loss
    if len(dataloader) == 0:
        return float('nan')
    return epoch_loss / len(dataloader)

# define the evaluation loop
def evaluate(model, dataloader, criterion):
    """
    Compute the mean batch loss of `model` over `dataloader` without updating weights.

    The model is switched to eval mode and gradients are disabled. Each batch is
    expected to be a dict with 'input_ids' and 'nikud' tensors, which are moved to the
    notebook-level `device`.

    Returns:
        float: mean batch loss, or NaN when `dataloader` yields no batches.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            labels = batch['nikud'].to(device)
            # Call the module itself rather than `.forward` so registered hooks run.
            predictions = model(input_ids)
            predictions = predictions.reshape(-1, predictions.shape[-1])
            labels = labels.reshape(-1)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
    if len(dataloader) == 0:
        return float('nan')
    return epoch_loss / len(dataloader)

# define the training and evaluation loop
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, epochs=5):
    """
    Alternate one training pass and one validation pass for `epochs` rounds.

    After every round the two mean losses are printed and the notebook-level
    `loss_history` (per-batch training losses collected so far) is plotted.
    """
    for epoch in range(epochs):
        train_loss = train(model, train_dataloader, optimizer, criterion)
        #scheduler.step()
        val_loss = evaluate(model, val_dataloader, criterion)
        print(f'Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # running curve of all batch losses recorded up to this point
        plt.plot(loss_history)
        plt.xlabel('Batch #')
        plt.ylabel('Loss value')
        plt.title('Loss history')
        plt.show()

def hyperparameter_grid_search():
    """
    Train a fresh FullSentenceModel for 5 epochs on every combination of head count,
    layer count and learning rate, printing the configuration before each run.
    """
    head_options = [2, 4, 8]
    layer_options = [2, 4, 6]
    lr_options = [0.0001, 0.00001]
    # every (heads, layers, learning rate) combination, learning rate varying fastest
    grid = [(n, l, r) for n in head_options for l in layer_options for r in lr_options]
    for n, l, r in grid:
        model = FullSentenceModel(vocab_size, nhead=n, num_layers=l).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=r)
        print(f'nhead: {n}, nlayers: {l}, lr: {r}')
        train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, epochs=5)

# Important reminder: the optimizer must be created separately for each model,
# because it is bound to that model's parameters.
model = FullSentenceModel(vocab_size, nhead=8, num_layers=6).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Train with the default number of epochs of `train_and_evaluate`.
train_and_evaluate(model, train_loader, val_loader, optimizer, criterion)




In [None]:
# Curve of every per-batch training loss recorded in `loss_history`
plt.plot(loss_history)
plt.xlabel('Batch #')
plt.ylabel('Loss value')
plt.title('Loss history')
plt.show()




In [None]:
# Inspect the model on the first batch of the test set (the original iterated over
# `train_loader`, contradicting its own comment).
model.eval()
for batch in test_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        labels = batch['nikud'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Call the module itself rather than `.forward` so registered hooks run.
        predictions = model(input_ids)
        predictions = predictions.reshape(-1, predictions.shape[-1])
        labels = labels.reshape(-1)
        loss = criterion(predictions, labels)
        print(f'Loss: {loss.item():.4f}')
        # Keep only positions that are real tokens AND carry a real label. Positions
        # whose label is the padding id are ignored by the loss, so counting them in
        # the accuracy would make every one of them a guaranteed "miss".
        keep = (attention_mask.reshape(-1) == 1) & (labels != criterion.ignore_index)
        predictions = predictions[keep]
        labels = labels[keep]
        print(f'Predictions shape: {predictions.shape}')
        print(f'Labels shape: {labels.shape}')
        # argmax to get the best prediction
        predictions = torch.argmax(predictions, dim=1)
        print(f'Predictions vs labels: {predictions[:100]} vs {labels[:100]}')
        # accuracy over the kept positions
        accuracy = torch.sum(predictions == labels) / len(labels)
        print(f'Accuracy: {accuracy.item():.4f}')
    # only the first batch is inspected
    break


# TODO: we notice we usually predict 2 (no_nikud) - why? how to fix?...

#### Reading-Direction Model

In [None]:
# just a skeleton, need to be fixed
import torch.nn as nn

class ReadingDirectionModel(nn.Module):
    """
    Character tagger that augments each character embedding with a vector summarising
    the word it belongs to, then runs a unidirectional character-level LSTM.

    Args:
        char_vocab_size: number of rows in the character embedding table.
        char_embedding_dim: size of each character embedding.
        word_hidden_dim: hidden size (per direction) of the word-level BiLSTM.
        char_hidden_dim: hidden size of the character-level LSTM.
        num_labels: number of output classes per character.
        word_lstm_layers: number of layers of the word-level BiLSTM.
        char_lstm_layers: number of layers of the character-level LSTM.
    """

    def __init__(self, char_vocab_size, char_embedding_dim, word_hidden_dim, char_hidden_dim, num_labels, word_lstm_layers=1, char_lstm_layers=4):
        super(ReadingDirectionModel, self).__init__()

        # Character Embedding Layer
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)

        # Word-Level BiLSTM
        self.word_bilstm = nn.LSTM(char_embedding_dim, word_hidden_dim, num_layers=word_lstm_layers, bidirectional=True, batch_first=True)

        # Character-Level LSTM (input: character embedding + both directions of the word vector)
        self.char_lstm = nn.LSTM(char_embedding_dim + 2 * word_hidden_dim, char_hidden_dim, num_layers=char_lstm_layers, batch_first=True)

        # Fully Connected Layer
        self.fc = nn.Linear(char_hidden_dim, num_labels)

        # Dropout Layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, char_sequence, word_boundaries):
        """
        Args:
            char_sequence: integer tensor of shape (batch, seq_len).
            word_boundaries: one sequence per sentence (list or 1-D tensor) of increasing
                character offsets; consecutive offsets `[start, end)` delimit one word.

        Returns:
            Tensor of shape (batch, seq_len, num_labels).

        Characters not covered by any word (before the first / after the last boundary,
        e.g. padding) receive an all-zero word vector, so the boundaries no longer have
        to span exactly `seq_len` for the tensors to line up.
        """
        # Apply Character Embedding
        char_embedded = self.char_embedding(char_sequence)
        batch_size, seq_len, _ = char_embedded.shape

        # One word-context vector per character position, zero where no word applies.
        word_context = char_embedded.new_zeros(batch_size, seq_len, 2 * self.word_bilstm.hidden_size)
        for i, boundaries in enumerate(word_boundaries):
            offsets = [int(offset) for offset in boundaries]
            for start, end in zip(offsets[:-1], offsets[1:]):
                if end <= start:
                    # empty word: nothing to encode
                    continue
                word_output, _ = self.word_bilstm(char_embedded[i, start:end, :].unsqueeze(0))
                # The BiLSTM output at the word's last character is used as the word
                # vector and copied to every character of that word.
                # NOTE(review): at the last step the backward direction has only seen one
                # character - consider using the final hidden states instead.
                word_context[i, start:end, :] = word_output[:, -1, :]

        # Concatenate character embeddings with word embeddings
        char_word_embedded = torch.cat((char_embedded, word_context), dim=2)

        # Apply Character-Level LSTM
        char_lstm_output, _ = self.char_lstm(char_word_embedded)

        # Apply Dropout
        char_lstm_output = self.dropout(char_lstm_output)

        # Apply Fully Connected Layer
        predictions = self.fc(char_lstm_output)

        return predictions

# Hyperparameters (example values only)
# NOTE(review): char_vocab_size and num_labels are placeholders - presumably they should
# come from the tokenizer vocabulary and from len(label_to_id); confirm before training.
char_vocab_size = 128
char_embedding_dim = 32
word_hidden_dim = 16
char_hidden_dim = 512
num_labels = 15

# Creating the model instance (example)
reading_direction_model = ReadingDirectionModel(char_vocab_size, char_embedding_dim, word_hidden_dim, char_hidden_dim, num_labels)


# Define the dual model class, it will be composed of two models.
#### whenever there is a disagreement between the two models, the model will add nikud using the lookahead model

## Evaluation of the dual model