In [22]:
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import os
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [23]:
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Ensure `idx` is within the bounds of `self.encodings` and `self.labels`
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


    def __len__(self):
        return len(self.labels)

In [25]:
def read_json_file(file_path):
    try:
        # Attempt to read the file assuming it's line-delimited or an array of objects
        data = pd.read_json(file_path, lines=True)
    except ValueError:
        try:
            # For a single JSON object, read it into a Series and then convert to a DataFrame
            data = pd.read_json(file_path, typ='series')
            data = pd.DataFrame([data])  # Convert Series to DataFrame
        except ValueError as e:
            print(f"Failed to read {file_path}: {e}")
            data = pd.DataFrame()  # Return an empty DataFrame on failure
    return data

In [26]:
def read_json_files(directory):
    files = glob.glob(f"{directory}/*.json")
    data_frames = [read_json_file(file) for file in files]
    # Filter out empty DataFrames
    data_frames = [df for df in data_frames if not df.empty]
    if data_frames:
        # Concatenate all DataFrames
        return pd.concat(data_frames, ignore_index=True)
    else:
        print("No valid JSON data could be loaded.")
        return pd.DataFrame()

In [27]:
dataset_dir = os.getcwd()+"/labeled_dataset"
print(dataset_dir)
dataset = read_json_files(dataset_dir)

display(dataset.head())

/home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/5977.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/5958.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/5931.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/6048.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/5830.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_detection/labeled_dataset/5750.json: Expected object or value
Failed to read /home/yousef-tarek-st/LiteraryHub-ML/src/nlp/restricted_topic_d

Unnamed: 0,book_name,page_number,text_chunk,label,confidence
0,خماسية مدن الملح,811,يا سيدي الله يخليك المسألة من أولها إلى آخرها...,0,0.6
1,الفرق و المذاهب في الرسالات الثلاث,56,لتعاليم بولسء ويعدونها من الهرطقة واندثرت هذه ...,1,0.8
2,الحارس في حقل الشوفان,39,أمي الحبيية ماذا لا قدين لي يدكدة كنت أعيث فقط...,0,0.8
3,خماسية مدن الملح,902,أي نعم كيف لا أتذكر صارت خمسة وعشرين ألف دول...,0,0.8
4,خماسية مدن الملح,143,بدت الحدرة وهما يغادرانها أكثر حزبا وأكثر شيخو...,1,0.6


In [28]:
# Print statistics for the entire DataFrame
print("General Statistics for the dataset:")
print(dataset.describe(include='all'))  # 'include=all' to get statistics for all columns

# Print a blank line for better readability
print("\n")

# Since our 'Label' column is categorical, we specify include='all' to get its statistics
print("Statistics for the 'label' column:")
print(dataset['label'].describe())

General Statistics for the dataset:
               book_name  page_number  \
count               4891  4891.000000   
unique                24          NaN   
top     خماسية مدن الملح          NaN   
freq                1804          NaN   
mean                 NaN   489.038847   
std                  NaN   520.073097   
min                  NaN     2.000000   
25%                  NaN   115.000000   
50%                  NaN   272.000000   
75%                  NaN   639.500000   
max                  NaN  1945.000000   

                                               text_chunk        label  \
count                                                4891  4891.000000   
unique                                               4891          NaN   
top      يا سيدي الله يخليك المسألة من أولها إلى آخرها...          NaN   
freq                                                    1          NaN   
mean                                                  NaN     0.454508   
std                        

In [29]:
# Count the occurrences of 0s and 1s in the 'Label' column
label_counts = dataset['label'].value_counts()

print("Counts of 0s and 1s in the 'label' column:")
print(label_counts)

Counts of 0s and 1s in the 'label' column:
label
0    2668
1    2223
Name: count, dtype: int64


In [30]:
train_dataset, eval_dataset = train_test_split(dataset, test_size=0.2)
print("Train Dataset: ")
display(train_dataset.head())
print("Eval Dataset: ")
display(eval_dataset.head())

Train Dataset: 


Unnamed: 0,book_name,page_number,text_chunk,label,confidence
1053,خماسية مدن الملح,1501,أشعث أغبر دميم الثياب عافي الشعر قلت أي الطعام...,1,0.8
410,الحارس في حقل الشوفان,58,إلى أي مكان سوف تذهب أنت وفناتك هل حددت مكان ل...,1,0.8
396,خماسية مدن الملح,215,وحين يسمع يستعيد يطرب يسافر بعيداء ويروي الكثي...,0,0.6
1776,خماسية مدن الملح,956,لدقائق بدا نمر عاجرا عن فهم الكلمات التي سمعها...,0,0.6
614,خماسية مدن الملح,1186,ولم يتأخر السلطان ليدرك ما هو ممكن في هذه المر...,0,0.6


Eval Dataset: 


Unnamed: 0,book_name,page_number,text_chunk,label,confidence
1943,نظرية السياسة الخارجية,162,يا ليامة لشارحية احتجزث كنذا ماني قوارب أمريك...,1,0.8
1656,حكاية الجارية,147,أوما براسه في رزانة تستحيل معرفة هل كان يعني م...,1,0.6
2977,نقد العقل العربي,357,ال فستيل في لاسي بفسر جزنيا على الأقل الظاهر...,0,0.6
3827,بنات حواء الثلاث,243,انم تفرست في السججادة المثبتة على الأرضية فلاح...,0,1.0
2017,نظرية السياسة الخارجية,48,ظرية ليمة الارحة ونسجل أيضا أثالم نطلق أحكاما...,1,0.8


In [31]:
# Adjusted Tokenization Function to Work on Lists
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

# Tokenize the Entire Training and Evaluation Datasets at Once
tokenized_train = tokenize_function(train_dataset["text_chunk"].tolist())
tokenized_eval = tokenize_function(eval_dataset["text_chunk"].tolist())

# Prepare the Datasets
train_dataset = Dataset(tokenized_train, train_dataset["label"].tolist())
eval_dataset = Dataset(tokenized_eval, eval_dataset["label"].tolist())

In [32]:
# Prepare Data Loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=32)

In [33]:
# Optimizer and Learning Rate Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [None]:
# Initialize lists to store per-epoch metrics for both loss and accuracy
train_losses = []
eval_losses = []
train_accuracies = []
eval_accuracies = []

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    for batch in tqdm(train_loader, desc=f"Training (Epoch {epoch+1}/{num_epochs})"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Calculate accuracy
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == batch["labels"]).sum().item()
        total_predictions += preds.size(0)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    # Calculate average loss and accuracy over the epoch for training
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            total_eval_loss += outputs.loss.item()
            
            # Calculate accuracy
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == batch["labels"]).sum().item()
            total_predictions += preds.size(0)
    
    # Calculate average evaluation loss and accuracy over the epoch
    avg_eval_loss = total_eval_loss / len(eval_loader)
    eval_accuracy = correct_predictions / total_predictions
    eval_losses.append(avg_eval_loss)
    eval_accuracies.append(eval_accuracy)
    
    print(f"\nEpoch {epoch+1} finished. Training Loss: {avg_train_loss}. Training Accuracy: {train_accuracy}. Evaluation Loss: {avg_eval_loss}. Evaluation Accuracy: {eval_accuracy}\n")

Training (Epoch 1/10):   0%|          | 0/123 [00:00<?, ?it/s]


Epoch 1 finished. Training Loss: 0.7036830414601458. Training Accuracy: 0.522239263803681. Evaluation Loss: 0.6923362247405513. Evaluation Accuracy: 0.5280898876404494



Training (Epoch 2/10):   0%|          | 0/123 [00:00<?, ?it/s]

In [None]:
# Plot training and evaluation losses
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.plot(eval_losses, label='Evaluation Loss')
plt.title('Training and Evaluation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plot training and evaluation accuracies
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Training Accuracy')
plt.plot(eval_accuracies, label='Evaluation Accuracy')
plt.title('Training and Evaluation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
plt.savefig(os.getcwd()+'/plots/training_evaluation_plots.png')

In [None]:
model_save_path = os.getcwd() + "/AraBERT_fine_tuned_model"
model.save_pretrained(model_save_path)
tokenizer_save_path = os.getcwd() + "/AraBERT_fine_tuned_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)