<a href="https://colab.research.google.com/github/Mahmoudlimam/Personal-Projects/blob/main/treebank_chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/kaisdukes/quran-neural-chunker.git

Cloning into 'quran-neural-chunker'...
remote: Enumerating objects: 265, done.[K
remote: Total 265 (delta 0), reused 0 (delta 0), pack-reused 265[K
Receiving objects: 100% (265/265), 14.54 MiB | 21.54 MiB/s, done.
Resolving deltas: 100% (144/144), done.


In [None]:
!cd /kaggle/working/quran-neural-chunker/src

In [None]:
import math
from typing import List
import csv
import numpy as np
import pandas as pd
from dataclasses import dataclass
from pandas import DataFrame
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Fixed sequence length: QuranDataset pads each verse (and its labels) up to this
# many tokens, and BiLSTMModel pads its output back to the same length.
max_length = 128


@dataclass(frozen=True)
class Location:
    """Immutable (hashable) address of a token: chapter, verse and token number."""
    chapter_number: int
    verse_number: int
    token_number: int

    def __str__(self):
        """Format as 'chapter:verse', appending ':token' only when token_number > 0."""
        text = f'{self.chapter_number}:{self.verse_number}'
        if self.token_number > 0:
            text += f':{self.token_number}'
        return text


@dataclass(frozen=True)
class Chunk:
    """A span of tokens identified by the locations of its first and last token.

    Frozen, so instances are hashable and can be compared via sets (see Evaluator).
    """
    # Location of the first token of the chunk.
    start: Location
    # Location of the token that closes the chunk (the one flagged chunk_end == 1
    # in get_chunks); the span is inclusive of this token.
    end: Location


def get_chunks(df: DataFrame):
    """Turn per-token ``chunk_end`` flags into a list of Chunk spans.

    Rows are read in order. A chunk opens at the first row seen after the
    previous chunk closed and closes at the next row whose ``chunk_end`` equals 1.
    Rows after the last closing flag do not produce a chunk.

    Args:
        df: frame with ``chapter_number``, ``verse_number``, ``token_number``
            and ``chunk_end`` columns.

    Returns:
        The chunks in the order they were closed.
    """
    found: List[Chunk] = []
    opened_at = None

    for _, token in df.iterrows():
        here = Location(token['chapter_number'], token['verse_number'], token['token_number'])
        opened_at = here if opened_at is None else opened_at
        if token['chunk_end'] != 1:
            continue
        found.append(Chunk(opened_at, here))
        opened_at = None

    return found


def preprocess(df: DataFrame):
    """Add the derived ``verse_end`` and ``punctuation`` columns to ``df`` in place.

    * ``verse_end`` is 1 for the token with the highest ``token_number`` within
      its (chapter, verse) group and 0 for every other token.
    * ``punctuation`` is the trailing punctuation of the ``translation`` text,
      as extracted by ``_punctuation``.

    Args:
        df: frame with ``chapter_number``, ``verse_number``, ``token_number``
            and ``translation`` columns. It is modified in place; nothing is returned.
    """
    # Use the 'max' alias: passing the builtin ``max`` to transform is deprecated
    # in recent pandas and raises a FutureWarning.
    last_token = df.groupby(['chapter_number', 'verse_number'])['token_number'].transform('max')
    df['verse_end'] = (last_token == df['token_number']).astype(int)

    df['punctuation'] = df['translation'].apply(_punctuation)


PUNCTUATION = [',', '.', '\'', '\"', '!', '?']


def _punctuation(text: str) -> str:
    n = len(text)
    for i in range(n - 1, -1, -1):
        if text[i] not in PUNCTUATION:
            return text[i+1:] if i < n - 1 else ''
    return text


class Embeddings:
    """In-memory lookup table from embedding id to vector, loaded from a text file.

    Args:
        vector_file: path of the vectors file. Defaults to the location of
            ``vectors.txt`` inside the cloned repository on Kaggle.
    """

    def __init__(self, vector_file: str = '/kaggle/working/quran-neural-chunker/data/vectors.txt'):
        self._vector_file = vector_file
        # Maps embedding id -> vector. (Builtin ``dict`` is used in the annotation
        # because ``typing.Dict`` is not imported by this notebook.)
        self._embeddings: dict[int, np.ndarray] = {}
        # Returned for ids that are not present in the file.
        self._default_vector: np.ndarray = np.zeros(256)
        self._load_embeddings()

    def get_vector(self, embeddingId: int):
        """Return the vector stored for ``embeddingId``, or a 256-dim zero vector if unknown."""
        return self._embeddings.get(embeddingId, self._default_vector)

    def _load_embeddings(self):
        """Parse the vectors file into ``self._embeddings``.

        Each non-blank line is split on whitespace. The first field, minus its
        last character, is the integer id (presumably a trailing ':' — confirm
        against the data file). The second field is skipped, and the remaining
        fields are the float components of the vector.
        """
        with open(self._vector_file, 'r') as file:
            for line in file:
                fields = line.strip().split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                embeddingId = int(fields[0][:-1])
                vector = np.array(list(map(float, fields[2:])))
                self._embeddings[embeddingId] = vector

class Evaluator:
    """Accumulates chunk counts over one or more comparisons and derives
    precision, recall and F1 from the running totals."""

    def __init__(self):
        self._expected_chunks = 0
        self._output_chunks = 0
        self._equivalent_chunks = 0

    def compare(self, expected_chunks: List[Chunk], output_chunks: List[Chunk]):
        """Add one comparison to the totals.

        Both lists are de-duplicated first; a chunk counts as equivalent when it
        appears in both sets.
        """
        gold = set(expected_chunks)
        predicted = set(output_chunks)

        self._expected_chunks += len(gold)
        self._output_chunks += len(predicted)
        self._equivalent_chunks += len(gold.intersection(predicted))

    @property
    def precision(self):
        """Equivalent chunks divided by output chunks; 0 when nothing was output."""
        if self._output_chunks == 0:
            return 0
        return self._equivalent_chunks / self._output_chunks

    @property
    def recall(self):
        """Equivalent chunks divided by expected chunks; 0 when nothing was expected."""
        if self._expected_chunks == 0:
            return 0
        return self._equivalent_chunks / self._expected_chunks

    @property
    def f1_score(self):
        """Harmonic mean of precision and recall; 0 when both are 0."""
        p, r = self.precision, self.recall
        if p + r == 0:
            return 0
        return 2 * (p * r) / (p + r)


def load_data(chunks_file: str = '/kaggle/working/quran-neural-chunker/data/quranic-treebank-0.4-chunks.tsv'):
    """Read the treebank chunk annotations into a DataFrame.

    Args:
        chunks_file: path of the tab-separated chunks file. Defaults to the
            location inside the cloned repository on Kaggle; pass another path
            when the repository lives elsewhere (e.g. on Colab).

    Returns:
        The parsed DataFrame. Quote characters are kept literally
        (``csv.QUOTE_NONE``), so quotation marks in text columns are not
        interpreted as field delimiters.
    """
    return pd.read_csv(chunks_file, sep='\t', quoting=csv.QUOTE_NONE)

class TransformerModel(nn.Module):
    """Per-token classifier: positional encoding -> Transformer encoder -> linear head.

    Inputs are batch-first, ``(batch, seq_len, input_size)``, which is the layout
    produced by the DataLoader in this notebook. The encoder is therefore built
    with ``batch_first=True`` so that self-attention runs over the tokens of each
    sequence (not across the samples of a batch) and positions are encoded along
    the sequence axis.

    Args:
        input_size: feature dimension of each token (the encoder's ``d_model``);
            must be divisible by ``nhead``.
        hidden_size: width of the encoder's feed-forward sub-layer.
        num_layers: number of stacked encoder layers.
        output_size: number of classes predicted per token.
        nhead: number of attention heads.
        dropout: dropout probability inside the encoder layers.

    NOTE(review): padded positions are not masked in attention; passing a
    ``src_key_padding_mask`` would require the callers to supply lengths.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, nhead=9, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.pos_encoder = PositionalEncoding(input_size, batch_first=True)
        transformer_layers = nn.TransformerEncoderLayer(
            d_model=input_size,
            nhead=nhead,
            dim_feedforward=hidden_size,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(transformer_layers, num_layers)
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Map ``(batch, seq_len, input_size)`` to logits ``(batch, seq_len, output_size)``."""
        x = self.pos_encoder(x)
        output = self.transformer(x)
        out = self.fc(output)
        return out

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence tensor.

    Args:
        d_model: feature dimension; odd values are supported.
        max_len: longest sequence length for which encodings are precomputed.
        batch_first: when False (default) the input is ``(seq_len, batch, d_model)``;
            when True it is ``(batch, seq_len, d_model)``.
    """

    def __init__(self, d_model, max_len=5000, batch_first=False):
        super(PositionalEncoding, self).__init__()
        self.batch_first = batch_first

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        if d_model % 2 == 0:
            pe[:, 1::2] = torch.cos(position * div_term)
        else:
            # With an odd d_model there is one fewer cosine column than sine columns.
            pe[:, 1::2] = torch.cos(position * div_term[:-1])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of each position along the sequence axis."""
        if self.batch_first:
            # (seq_len, 1, d_model) -> (1, seq_len, d_model) to line up with (batch, seq_len, d_model).
            return x + self.pe[:x.size(1)].transpose(0, 1)
        x = x + self.pe[:x.size(0), :]
        return x

class BiLSTMModel(nn.Module):
    """Per-token classifier: bidirectional LSTM followed by a linear head.

    Args:
        input_size: feature dimension of each token.
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        output_size: number of classes predicted per token.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, output_size)

    def forward(self, x, lengths):
        """Compute per-token logits for a padded batch.

        Args:
            x: padded input of shape ``(batch, seq_len, input_size)``.
            lengths: number of real (unpadded) tokens of each sequence, as a
                CPU tensor or list, as required by ``pack_padded_sequence``.

        Returns:
            Logits of shape ``(batch, seq_len, output_size)``. Positions past a
            sequence's length carry the linear layer's output for a zero vector.
        """
        # Remember the padded length so the output can be restored to the same
        # shape as the input, whatever the longest sequence in the batch is.
        total_length = x.size(1)

        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # The initial hidden and cell states default to zeros.
        packed_output, _ = self.lstm(packed)

        # Unpack and pad back to the input's sequence length in one step.
        output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=total_length)

        out = self.fc(output)
        return out


class QuranDataset(Dataset):
    """Dataset of verses (lists of per-token feature vectors) and per-token labels.

    Every item is padded to a fixed length so that items can be batched.

    Args:
        verses: sequence of verses, each a list of equally sized feature lists.
        labels: sequence of label lists, one label per token of the matching verse.
        max_len: length to pad to. Defaults to the notebook-wide ``max_length``.
    """

    def __init__(self, verses, labels, max_len=None):
        self.verses = verses
        self.labels = labels
        self.max_len = max_length if max_len is None else max_len

    def __len__(self):
        return len(self.verses)

    def __getitem__(self, index):
        """Return ``(features, labels, length)`` for one verse.

        ``features`` is a float32 tensor of shape ``(max_len, n_features)``,
        ``labels`` a tensor of shape ``(max_len,)`` and ``length`` the number of
        real tokens. Padding rows and padding labels are 0. Verses longer than
        ``max_len`` are returned unpadded and untruncated.
        """
        verse = self.verses[index]
        label = self.labels[index]
        length = len(verse)

        missing = self.max_len - length
        if missing > 0:
            # Pad into NEW lists. Extending the stored lists in place would make
            # every later access report ``length == max_len`` instead of the
            # verse's real token count.
            verse = list(verse) + [[0] * len(verse[0])] * missing
            label = list(label) + [0] * missing

        return torch.tensor(verse, dtype=torch.float32), torch.tensor(label), length


def get_verses(df: DataFrame):
    """Build per-verse feature and label sequences and split them into train and test sets.

    Each token's feature vector is its five core columns followed by its word
    embedding, in that order. Tokens are grouped by (chapter, verse) and the
    verses are split 90/10 with a fixed random seed.

    Args:
        df: preprocessed frame. An ``encoded_punctuation`` column (label-encoded
            ``punctuation``) is added to it in place.

    Returns:
        ``(train_verses, test_verses, train_labels, test_labels,
        train_verse_info, test_verse_info)``, each a tuple with one entry per
        verse. ``*_verse_info`` holds ``[chapter, verse, token]`` triples per token.
    """
    le = LabelEncoder()
    df['encoded_punctuation'] = le.fit_transform(df['punctuation'])

    word_vectors = Embeddings()

    core_columns = ['token_number', 'pause_mark', 'irab_end', 'verse_end', 'encoded_punctuation']
    rows = []
    for _, row in df.iterrows():
        embedding_vector = word_vectors.get_vector(row['embedding_id'])
        core_values = row[core_columns].values
        full_vector = np.concatenate([core_values, embedding_vector]).tolist()
        rows.append(full_vector + [row['chunk_end']])

    # Derive the feature count from the data instead of hard-coding 261.
    n_features = len(rows[0]) - 1 if rows else 0
    feature_columns = [f'feature_{i}' for i in range(n_features)]
    # Share df's index so that X.loc[group.index] below selects the matching rows.
    X = pd.DataFrame(rows, columns=feature_columns + ['chunk_end'], index=df.index)

    verses: List[List[int]] = []
    labels: List[int] = []
    verse_info: List[List[int]] = []

    for _, group in df.groupby(['chapter_number', 'verse_number']):
        group_df = X.loc[group.index]
        # Select by the explicit column list: Index.difference() would sort the
        # names lexicographically (feature_0, feature_1, feature_10, ...) and
        # silently scramble the feature order.
        verse = group_df[feature_columns].values.tolist()
        label = group_df['chunk_end'].tolist()

        verses.append(verse)
        labels.append(label)

        verse_info_single = group[['chapter_number', 'verse_number', 'token_number']].values.tolist()
        verse_info.append(verse_info_single)

    temp_data = list(zip(verses, verse_info, labels))
    train_temp, test_temp = train_test_split(temp_data, test_size=0.10, random_state=42)

    train_verses, train_verse_info, train_labels = zip(*train_temp)
    test_verses, test_verse_info, test_labels = zip(*test_temp)

    return train_verses, test_verses, train_labels, test_labels, train_verse_info, test_verse_info


def pack_labels(labels):
    """Right-pad 1-D label tensors with zeros to a common length and stack them.

    Args:
        labels: non-empty sequence of 1-D tensors.

    Returns:
        A tensor of shape ``(len(labels), longest_label_length)``.
    """
    longest = max(len(seq) for seq in labels)
    padded_rows = []
    for seq in labels:
        filler = torch.zeros(longest - len(seq))
        padded_rows.append(torch.cat([seq, filler]))
    return torch.stack(padded_rows)


def train_and_test_lstm():
    """Train the BiLSTM chunker and print chunk-level metrics after every epoch.

    Loads and preprocesses the treebank, splits it into train and test verses,
    then for each epoch runs one pass over the training data (Adam with a cosine
    learning-rate schedule stepped per batch), evaluates on every test verse and
    prints the expected/output chunk counts, precision, recall and F1.

    NOTE(review): the loss is computed over all padded positions as well, since
    padding labels are 0 and no positions are excluded from the criterion.
    """
    df = load_data()
    preprocess(df)

    df.fillna(0, inplace=True)

    input_size = 261
    hidden_size = 512
    num_layers = 2
    output_size = 2
    num_epochs = 50
    batch_size = 256
    learning_rate = 0.001

    train_verses, test_verses, train_labels, test_labels, _, test_verse_info = get_verses(df)
    print(f'Train verse count: {len(train_verses)}')
    print(f'Test verse count: {len(test_verses)}')

    training_data = QuranDataset(train_verses, train_labels)
    testing_data = QuranDataset(test_verses, test_labels)

    train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)

    model = BiLSTMModel(input_size, hidden_size, num_layers, output_size)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    sched = CosineAnnealingLR(optimizer,eta_min=1e-10,T_max=len(train_loader)*num_epochs)

    result_columns = ['chapter_number', 'verse_number', 'token_number', 'chunk_end']

    for epoch in range(num_epochs):
        # train (re-enable training mode: the evaluation below switches to eval mode)
        model.train()
        for verses, labels, lengths in tqdm(train_loader):
            verses = verses.to(device)
            labels = labels.to(device)

            # forward pass
            raw_outputs = model(verses, lengths)
            labels = labels.view(-1)  # reshape labels to be a 1D tensor
            loss = criterion(raw_outputs.view(-1, output_size), labels)

            # backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sched.step()

        # Loss of the last batch of the epoch.
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

        # test
        model.eval()

        # Collect one record per token and build each frame once afterwards;
        # concatenating a one-row frame per token is quadratic in the token count.
        expected_records = []
        output_records = []
        with torch.no_grad():
            for test_index in range(len(testing_data)):
                verse, label, _ = testing_data[test_index]
                verse_info = test_verse_info[test_index]

                # The verse's real token count, so padding rows are not fed to the LSTM as data.
                length = torch.tensor([len(verse_info)])

                raw_output = model(verse.to(device).unsqueeze(0), length)
                _, predicted = torch.max(raw_output, 2)
                predicted = predicted[0].cpu().numpy()
                expected = label.numpy()

                for idx, token in enumerate(verse_info):
                    location = {
                        'chapter_number': token[0],
                        'verse_number': token[1],
                        'token_number': token[2]}
                    expected_records.append({**location, 'chunk_end': expected[idx]})
                    output_records.append({**location, 'chunk_end': predicted[idx]})

        expected_results_df = DataFrame(expected_records, columns=result_columns)
        output_results_df = DataFrame(output_records, columns=result_columns)

        # chunk-level evaluation
        expected_chunks = get_chunks(expected_results_df)
        output_chunks = get_chunks(output_results_df)
        print(f'Expected: {len(expected_chunks)} chunks')
        print(f'Output: {len(output_chunks)} chunks')

        evaluator = Evaluator()
        evaluator.compare(expected_chunks, output_chunks)
        print(f'Precision: {evaluator.precision}')
        print(f'Recall: {evaluator.recall}')
        print(f'F1 score: {evaluator.f1_score}')
        print()

def train_and_test_transformer():
    """Train the Transformer chunker and print chunk-level metrics after every epoch.

    Loads and preprocesses the treebank, splits it into train and test verses,
    then for each epoch runs one pass over the training data (AdamW), evaluates
    on every test verse and prints the expected/output chunk counts, precision,
    recall and F1.

    NOTE(review): the loss is computed over all padded positions as well, since
    padding labels are 0 and no positions are excluded from the criterion.
    """
    df = load_data()
    preprocess(df)

    df.fillna(0, inplace=True)

    input_size = 261
    hidden_size = 261 * 9
    num_layers = 3
    output_size = 2
    num_epochs = 50
    batch_size = 64
    learning_rate = 0.0001

    train_verses, test_verses, train_labels, test_labels, _, test_verse_info = get_verses(df)
    print(f'Train verse count: {len(train_verses)}')
    print(f'Test verse count: {len(test_verses)}')

    training_data = QuranDataset(train_verses, train_labels)
    testing_data = QuranDataset(test_verses, test_labels)

    train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)

    model = TransformerModel(input_size, hidden_size, num_layers, output_size)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    result_columns = ['chapter_number', 'verse_number', 'token_number', 'chunk_end']

    for epoch in range(num_epochs):
        # train: switch back to training mode every epoch. The evaluation below
        # puts the model in eval mode, which would otherwise leave dropout
        # disabled for all epochs after the first.
        model.train()
        for verses, labels, lengths in train_loader:
            verses = verses.to(device)
            labels = labels.to(device)

            # forward pass
            raw_outputs = model(verses)
            labels = labels.view(-1)  # reshape labels to be a 1D tensor
            loss = criterion(raw_outputs.view(-1, output_size), labels)

            # backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Loss of the last batch of the epoch.
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

        # test
        model.eval()

        # Collect one record per token and build each frame once afterwards;
        # concatenating a one-row frame per token is quadratic in the token count.
        expected_records = []
        output_records = []
        with torch.no_grad():
            for test_index in range(len(testing_data)):
                verse, label, _ = testing_data[test_index]
                verse_info = test_verse_info[test_index]

                raw_output = model(verse.to(device).unsqueeze(0))
                _, predicted = torch.max(raw_output, 2)
                predicted = predicted[0].cpu().numpy()
                expected = label.numpy()

                # Only the verse's real tokens are scored; padded positions are ignored.
                for idx, token in enumerate(verse_info):
                    location = {
                        'chapter_number': token[0],
                        'verse_number': token[1],
                        'token_number': token[2]}
                    expected_records.append({**location, 'chunk_end': expected[idx]})
                    output_records.append({**location, 'chunk_end': predicted[idx]})

        expected_results_df = DataFrame(expected_records, columns=result_columns)
        output_results_df = DataFrame(output_records, columns=result_columns)

        # chunk-level evaluation
        expected_chunks = get_chunks(expected_results_df)
        output_chunks = get_chunks(output_results_df)
        print(f'Expected: {len(expected_chunks)} chunks')
        print(f'Output: {len(output_chunks)} chunks')

        evaluator = Evaluator()
        evaluator.compare(expected_chunks, output_chunks)
        print(f'Precision: {evaluator.precision}')
        print(f'Recall: {evaluator.recall}')
        print(f'F1 score: {evaluator.f1_score}')
        print()

In [None]:
train_and_test_transformer()

Train verse count: 2192
Test verse count: 244
Epoch [1/50], Loss: 0.0596
Expected: 768 chunks
Output: 639 chunks
Precision: 0.04381846635367762
Recall: 0.036458333333333336
F1 score: 0.03980099502487563

Epoch [2/50], Loss: 0.0342
Expected: 768 chunks
Output: 1722 chunks
Precision: 0.11149825783972125
Recall: 0.25
F1 score: 0.15421686746987953

Epoch [3/50], Loss: 0.0413
Expected: 768 chunks
Output: 1769 chunks
Precision: 0.11758055398530243
Recall: 0.2708333333333333
F1 score: 0.16397319668900273

Epoch [4/50], Loss: 0.0380
Expected: 768 chunks
Output: 1853 chunks
Precision: 0.10361575822989746
Recall: 0.25
F1 score: 0.14650896604349486

Epoch [5/50], Loss: 0.0226
Expected: 768 chunks
Output: 1757 chunks
Precision: 0.12122936824132043
Recall: 0.27734375
F1 score: 0.1687128712871287

Epoch [6/50], Loss: 0.0689
Expected: 768 chunks
Output: 1272 chunks
Precision: 0.23427672955974843
Recall: 0.3880208333333333
F1 score: 0.29215686274509806

Epoch [7/50], Loss: 0.0771
Expected: 768 chunks
