# **Install Required Libraries**

In [None]:
%pip install transformers datasets evaluate torch accelerate pandas

# **Import Libraries**

In [None]:
import os
import json
import pandas as pd
import torch
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate
import matplotlib.pyplot as plt

# **Define File Paths and Hyperparameters**


In [None]:
# Input dataset (read later in chunks with pandas.read_json).
file_path = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/Dataset/correct_reviews_balanced.json"

# Root folder for everything this experiment writes.
Base = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Testing/Review_ScoreTest/DistilBERTPrediction"

# Saved model/tokenizer files and the resume tracker both live under <Base>/DistilBERT.
model_save_path = f"{Base}/DistilBERT/models"
chunk_tracker_file = f"{Base}/DistilBERT/chunk_tracker.json"

# Create the output folders up front so later saves cannot fail on a missing directory.
for directory in (model_save_path, os.path.dirname(chunk_tracker_file)):
    os.makedirs(directory, exist_ok=True)

# **Load Metrics**


In [None]:
# Metric scorers are loaded once and reused by every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into percentage metrics.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``; the
            predicted class is the argmax of ``logits`` along dimension 1.

    Returns:
        dict with ``"accuracy"`` and weighted ``"f1"``, both scaled by 100.
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=1).numpy()
    accuracy_result = accuracy_metric.compute(predictions=predicted_classes, references=labels)
    f1_result = f1_metric.compute(predictions=predicted_classes, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"] * 100,
        "f1": f1_result["f1"] * 100,
    }

# **Load Tokenizer & Model**

In [None]:
def _has_any_file(directory, filenames):
    """Return True if at least one of *filenames* exists inside *directory*."""
    return any(os.path.exists(os.path.join(directory, name)) for name in filenames)


# Reuse a previously saved tokenizer if one is present, otherwise download the
# base tokenizer and cache it next to the model.
# The slow DistilBertTokenizer saves its vocabulary as vocab.txt, so checking
# only for tokenizer.json (written by fast tokenizers) would never detect the
# local copy; tokenizer.json is still accepted for backward compatibility.
if _has_any_file(model_save_path, ("vocab.txt", "tokenizer.json")):
    tokenizer = DistilBertTokenizer.from_pretrained(model_save_path)
else:
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    tokenizer.save_pretrained(model_save_path)

# Resume from previously saved weights if present, otherwise start from the
# base checkpoint with a 5-class classification head.
# Recent transformers releases write weights as model.safetensors by default,
# older ones as pytorch_model.bin. Both must be recognised: a missed detection
# would fall into the else-branch, whose save_pretrained call overwrites the
# fine-tuned weights with the freshly initialised model.
if _has_any_file(model_save_path, ("model.safetensors", "pytorch_model.bin")):
    model = DistilBertForSequenceClassification.from_pretrained(model_save_path)
else:
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)
    model.save_pretrained(model_save_path)

# **Track Last Processed Chunk**

In [None]:
# Defaults for a fresh run: start at the first chunk of cycle 1.
last_chunk, cycle_count = 0, 1

# A tracker file left by an earlier run overrides the defaults so training
# resumes where it stopped; keys missing from the file keep their defaults.
if os.path.exists(chunk_tracker_file):
    with open(chunk_tracker_file, "r") as f:
        chunk_tracker = json.load(f)
    last_chunk = chunk_tracker.get("last_chunk", 0)
    cycle_count = chunk_tracker.get("cycle_count", 1)

# Stream the JSON Lines dataset in fixed-size pieces instead of loading it whole.
chunk_size = 1000  # rows per chunk
df_iter = pd.read_json(file_path, lines=True, chunksize=chunk_size)

# **Training Loop**

In [None]:
best_f1_score = 0.0  # Highest evaluation F1 (%) seen in this run
best_accuracy = 0.0  # Accuracy (%) of the chunk that produced the best F1
best_loss = float("inf")  # Evaluation loss of the chunk that produced the best F1
chunk_metrics = []  # One {"chunk", "loss", "f1", "accuracy"} dict per trained chunk

train_batch_size = 16  # Per-device batch size for both training and evaluation
epochs_per_chunk = 3  # Upper bound on epochs per chunk; early stopping may end sooner


def tokenize_function(batch):
    """Tokenize a batch of texts, truncating/padding every example to 512 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)


for i, df_chunk in enumerate(df_iter):
    if i < last_chunk:
        continue  # Skip chunks already processed in an earlier run

    print(f"\nTraining Cycle {cycle_count}, Chunk {i+1}")

    # Preprocess Data: keep the required columns and drop rows missing either one
    df_chunk = df_chunk[["text", "stars"]].dropna()

    # train_test_split raises when it cannot produce a non-empty train and eval
    # set, which happens for fewer than 2 rows (e.g. a short final chunk).
    # Record the chunk as handled and move on instead of crashing the run.
    if len(df_chunk) < 2:
        print(f"Skipping chunk {i+1}: only {len(df_chunk)} usable row(s) after dropping missing values")
        with open(chunk_tracker_file, "w") as f:
            json.dump({"last_chunk": i + 1, "cycle_count": cycle_count}, f)
        continue

    # Adjust labels from [1,5] to [0,4]
    df_chunk["stars"] = df_chunk["stars"].astype(int) - 1

    # Split into training and evaluation sets (80/20, fixed seed for repeatability)
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(
        df_chunk["text"].tolist(), df_chunk["stars"].tolist(), test_size=0.2, random_state=42
    )

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_dict({"text": train_texts, "labels": train_labels})
    eval_dataset = Dataset.from_dict({"text": eval_texts, "labels": eval_labels})

    # Tokenize the datasets and drop the raw text column, which the model does not consume
    train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
    eval_dataset = eval_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
    train_dataset.set_format("torch")
    eval_dataset.set_format("torch")

    # A new Trainer (and therefore a new LR schedule) is created for every chunk,
    # so warm-up must be sized to the chunk. A fixed 500 steps exceeds the total
    # optimizer steps of a 1000-row chunk and would keep the learning rate in its
    # warm-up ramp for the whole chunk. Use roughly 10% of the chunk's steps instead.
    # The estimate assumes one device and no gradient accumulation (none is configured here).
    steps_per_epoch = (len(train_dataset) + train_batch_size - 1) // train_batch_size
    total_steps = steps_per_epoch * epochs_per_chunk
    warmup_steps = max(1, total_steps // 10)

    # Define Training Arguments; the best epoch (by F1) is restored at the end of training
    training_args = TrainingArguments(
        output_dir=model_save_path,
        num_train_epochs=epochs_per_chunk,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=train_batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Keep logs beside the tracker file rather than at the filesystem/drive root ("/logs")
        logging_dir=f"{os.path.dirname(chunk_tracker_file)}/logs",
        logging_steps=10,
        load_best_model_at_end=True,
        save_total_limit=2,
        warmup_steps=warmup_steps,
        greater_is_better=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # Initialize Trainer with EarlyStoppingCallback (patience=2 epochs)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train on the current chunk
    trainer.train()

    # Evaluate the current chunk on its own held-out split
    eval_results = trainer.evaluate()
    eval_loss = eval_results.get("eval_loss", float("inf"))
    eval_f1 = eval_results.get("eval_f1", 0)
    eval_acc = eval_results.get("eval_accuracy", 0)

    print(f"Chunk {i+1} Evaluation Metrics: Loss: {eval_loss:.4f}, F1: {eval_f1:.2f}%, Accuracy: {eval_acc:.2f}%")
    chunk_metrics.append({
        "chunk": i + 1,
        "loss": eval_loss,
        "f1": eval_f1,
        "accuracy": eval_acc
    })

    # Save model if current chunk's F1 improves over the best so far
    if eval_f1 > best_f1_score:
        best_f1_score = eval_f1
        best_accuracy = eval_acc
        best_loss = eval_loss
        print(f" New Best Model Found F1: {best_f1_score:.2f}%, Accuracy: {best_accuracy:.2f}%, Loss: {best_loss:.4f}")
        model.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)

    # Update chunk tracker so that training can resume later if needed.
    # NOTE(review): the tracker advances for every chunk, but weights are written to
    # model_save_path only when F1 improves, so a resumed run may reload an older
    # snapshot while skipping the chunks trained after it - confirm this is intended.
    with open(chunk_tracker_file, "w") as f:
        json.dump({"last_chunk": i + 1, "cycle_count": cycle_count}, f)

    print(f"Finished chunk {i+1} of cycle {cycle_count}")

# **Visualization of Evaluation Metrics**

In [None]:
if not chunk_metrics:
    # Nothing was trained in this run, so there is nothing to plot.
    print("No chunk metrics to display.")
else:
    metrics_df = pd.DataFrame(chunk_metrics)
    chunk_numbers = metrics_df["chunk"]

    # Figure 1: F1 and accuracy (both percentages) on a shared axis.
    plt.figure(figsize=(10, 5))
    for column, series_label in (("f1", "F1 Score (%)"), ("accuracy", "Accuracy (%)")):
        plt.plot(chunk_numbers, metrics_df[column], marker="o", label=series_label)
    plt.xlabel("Chunk Number")
    plt.ylabel("Metric Value (%)")
    plt.title("Evaluation Metrics per Chunk")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Figure 2: evaluation loss, drawn separately because its scale differs.
    plt.figure(figsize=(10, 5))
    plt.plot(chunk_numbers, metrics_df["loss"], marker="o", color="red", label="Loss")
    plt.xlabel("Chunk Number")
    plt.ylabel("Loss")
    plt.title("Evaluation Loss per Chunk")
    plt.legend()
    plt.grid(True)
    plt.show()

# **Reset Chunk Tracker for Next Cycle**

In [None]:
# Rewind the tracker to the first chunk and advance the cycle counter so the
# next run starts a new pass over the dataset.
with open(chunk_tracker_file, "w") as f:
    next_cycle_state = {"last_chunk": 0, "cycle_count": cycle_count + 1}
    f.write(json.dumps(next_cycle_state))

# Summarise the best chunk-level results recorded during this run.
print("\n Training Complete! Best model saved.")
print("\n Final Best Model Metrics:")
for summary_line in (
    f" Best F1 Score: {best_f1_score:.2f}%",
    f" Best Accuracy: {best_accuracy:.2f}%",
    f" Best Loss: {best_loss:.4f}",
):
    print(summary_line)