In [None]:
#!/usr/bin/env python
"""
===============================================================================
Download and Convert CoNLL-2003 Dataset to Raw CoNLL Format
===============================================================================
This script downloads the CoNLL-2003 dataset from Hugging Face, converts it
into the raw CoNLL format, and saves the resulting files to disk. The steps
include:

  1. Directory Setup:
     - Ensure required directories exist for storing raw and processed data.

  2. Dataset Download:
     - Load the CoNLL-2003 dataset using the Hugging Face 'datasets' library.

  3. Data Conversion:
     - Convert the dataset's token and numeric NER tag format into the standard
       CoNLL format.
     - Map numeric tags to their corresponding string representations.

  4. File Saving:
     - Save the converted data into separate files for train, validation, and test splits.

  5. Next Steps:
     - Instructions on subsequent commands for data preprocessing, training,
       and starting an API.

===============================================================================
Author: Your Name
Date: 2025-03-20
===============================================================================
"""

# =============================================================================
# Standard Library Imports
# =============================================================================
import os  # For file and directory operations

# =============================================================================
# Third-Party Library Imports
# =============================================================================
from datasets import load_dataset  # For downloading datasets from Hugging Face
import spacy  # Optional: can be used later for further processing (not used directly here)
from spacy.tokens import DocBin  # Optional: for serializing Spacy Docs (not used here)

# =============================================================================
# Directory Setup
# =============================================================================
# Create directories to store raw CoNLL data and processed data if they don't exist.
os.makedirs("data/conll2003", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

# =============================================================================
# Dataset Download
# =============================================================================
print("Downloading CoNLL-2003 dataset from HuggingFace...")

# Load the CoNLL-2003 dataset using the 'datasets' library. This automatically
# downloads and caches the dataset.
dataset = load_dataset("conll2003")

print("Download complete!")

# =============================================================================
# Data Conversion: Save Raw Data in CoNLL Format
# =============================================================================
print("Converting to CoNLL format...")

# Numeric NER tag -> BIO string, specific to the CoNLL-2003 dataset. Built once
# here rather than once per token inside the loop. Tag values <= 0 are written
# as "O" and are therefore not listed.
tag_map = {
    1: "B-PER", 2: "I-PER",
    3: "B-ORG", 4: "I-ORG",
    5: "B-LOC", 6: "I-LOC",
    7: "B-MISC", 8: "I-MISC",
}

# Dataset split -> output file, following the traditional CoNLL-2003 naming:
# "eng.testa" holds the validation split and "eng.testb" the test split.
split_files = {
    "train": "data/conll2003/eng.train",
    "validation": "data/conll2003/eng.testa",
    "test": "data/conll2003/eng.testb",
}

for split, output_file in split_files.items():
    print(f"Processing {split} set -> {output_file}")

    # One "<token> <tag>" line per token; a blank line terminates each sentence.
    with open(output_file, "w", encoding="utf-8") as f:
        for example in dataset[split]:
            for token, tag in zip(example["tokens"], example["ner_tags"]):
                # An unknown positive tag raises KeyError instead of being
                # silently written out with a wrong label.
                tag_str = tag_map[tag] if tag > 0 else "O"
                f.write(f"{token} {tag_str}\n")
            f.write("\n")

print("Dataset downloaded and converted to CoNLL format successfully!")
print("Files created:")
for output_file in split_files.values():
    print(f"  - {output_file}")
print("\nNext steps:")
print("1. Run preprocessing: python -m src.preprocessing.data_loader")
print("2. Train model: python -m src.training.train_model")
print("3. Start API: uvicorn src.api.main:app --host 0.0.0.0 --port 8000")


In [None]:
#!/usr/bin/env python
"""
===============================================================================
CoNLL-2003 Preprocessing Pipeline
===============================================================================
This script implements a comprehensive preprocessing pipeline for the CoNLL-2003
dataset, typically used for Named Entity Recognition (NER) tasks. The pipeline
covers the following steps:

  1. Resource Setup:
     - Download required NLTK resources.
     - Verify and install the necessary SpaCy model.

  2. Data Ingestion:
     - Read and parse CoNLL-2003 formatted files into a structured format.

  3. Data Exploration:
     - Compute dataset statistics (e.g., sentence counts, word counts, entity
       distributions, sentence lengths).
     - Visualize entity distributions using bar charts.

  4. Text Preprocessing:
     - Clean and normalize text using both NLTK and SpaCy libraries.
     - Optionally remove stopwords and perform lemmatization.

  5. Data Conversion:
     - Convert parsed data into different formats: SpaCy training format, BIO
       format, and JSON format.
     - Prepare data for transformer-based models (e.g., tokenized texts and
       corresponding labels).

  6. Data Saving:
     - Persist processed data and label mappings to disk for downstream tasks.

===============================================================================
Author: Your Name
Date: 2025-03-20
===============================================================================
"""

# =============================================================================
# Standard Library Imports
# =============================================================================
import os
import sys
import json
import pickle
import subprocess
import re
from collections import Counter
from typing import List, Dict, Tuple, Optional

# =============================================================================
# Third-Party Imports
# =============================================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# =============================================================================
# NLTK Resource Downloads
# =============================================================================
# Download required NLTK resources for text tokenization, stopword removal,
# and lemmatization. The 'quiet=True' flag prevents excessive console output.
# Fetched in order: tokenizer data ('punkt', 'punkt_tab'), the stopword lists,
# and WordNet for the lemmatizer.
# NOTE(review): 'punkt_tab' is presumably what newer NLTK releases load for
# word_tokenize; confirm against the installed version before dropping either.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

# =============================================================================
# SpaCy Model Installation and Loading
# =============================================================================
def ensure_spacy_model(model_name: str = "en_core_web_sm") -> None:
    """
    Verify that the specified SpaCy model is installed. If not, install the model.

    The function attempts to load the model and prints a confirmation message if
    successful. If loading fails with OSError, the model is downloaded through
    the SpaCy CLI, run in a subprocess of the current interpreter.

    Args:
        model_name (str): The name of the SpaCy model to load. Defaults to "en_core_web_sm".

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
    """
    # Imported locally so this function does not depend on a file-level import.
    import importlib

    try:
        # Loading is used purely as an installation probe; the result is discarded.
        spacy.load(model_name)
    except OSError:
        print(f"SpaCy model '{model_name}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        # The package was installed by another process after this interpreter
        # started, so its import finders may hold stale directory listings.
        # Refresh them so a following spacy.load() can see the new package.
        importlib.invalidate_caches()
        print(f"SpaCy model '{model_name}' has been installed.")
    else:
        print(f"SpaCy model '{model_name}' is already installed.")

# Ensure the default SpaCy model is installed (installing it on first run),
# then load it once at module level. preprocess_spacy() reads this shared
# `nlp` pipeline instead of loading the model on every call.
ensure_spacy_model()
nlp = spacy.load("en_core_web_sm")

# =============================================================================
# Data Reading and Parsing Functions
# =============================================================================
def read_conll_file(file_path: str) -> List[List[Tuple[str, str]]]:
    """
    Read a CoNLL-style file and group its tokens into sentences.

    A sentence boundary is any line that is blank after stripping, or that
    starts with "-DOCSTART-" or "//". Every other line is split on whitespace;
    the first field is taken as the token and the last field as its tag, so
    files with extra columns between the two are accepted. Lines with fewer
    than two fields are dropped.

    Args:
        file_path (str): Full file path to the CoNLL file.

    Returns:
        List[List[Tuple[str, str]]]: Sentences in file order, each a list of
                                     (word, tag) tuples. Empty sentences are
                                     never emitted.
    """
    parsed: List[List[Tuple[str, str]]] = []
    pending: List[Tuple[str, str]] = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # Boundary lines close the sentence collected so far (if any).
            if not stripped or stripped.startswith(('-DOCSTART-', '//')):
                if pending:
                    parsed.append(pending)
                    pending = []
                continue

            fields = stripped.split()
            if len(fields) < 2:
                # A token without a tag column cannot be used; skip it.
                continue
            pending.append((fields[0], fields[-1]))

    # The file may end without a trailing blank line.
    if pending:
        parsed.append(pending)

    return parsed

# =============================================================================
# Dataset Exploration and Visualization Functions
# =============================================================================
def explore_dataset(sentences: List[List[Tuple[str, str]]]) -> Dict:
    """
    Compute and return a set of statistics describing the dataset.

    The returned statistics include:
      - Number of sentences.
      - Total word count.
      - Count of unique words (case-insensitive).
      - Entity counts by type (e.g., PER, LOC).
      - Average length of entities, in tokens, per entity type.
      - Average and maximum sentence lengths.

    Entities are tracked per sentence: an entity still open when its sentence
    ends is recorded at that point, and never continues into the next sentence.
    An 'I-' tag that does not continue the currently open entity (no open
    entity, or a different type) is ignored.

    Args:
        sentences (List[List[Tuple[str, str]]]): Structured sentences with (word, tag) tuples.

    Returns:
        Dict: Dictionary with the keys 'num_sentences', 'total_words',
              'num_unique_words', 'entity_counts', 'avg_entity_length',
              'avg_sentence_length' and 'max_sentence_length'. For empty input
              the counts are 0 and the two entity dictionaries are empty.
    """
    sentence_lengths = [len(sentence) for sentence in sentences]
    num_sentences = len(sentences)
    total_words = sum(sentence_lengths)

    # Lowercase before de-duplicating so "The" and "the" count once.
    unique_words = {word.lower() for sentence in sentences for word, _ in sentence}

    entity_counts = Counter()
    entity_lengths: Dict[str, List[int]] = {}

    for sentence in sentences:
        # Reset per sentence so an entity cannot leak across a boundary.
        current_entity = None
        current_entity_length = 0

        for _, tag in sentence:
            if tag.startswith('B-'):
                # A new entity starts; record the one it interrupts, if any.
                if current_entity is not None:
                    entity_lengths.setdefault(current_entity, []).append(current_entity_length)
                current_entity = tag[2:]
                current_entity_length = 1
                entity_counts[current_entity] += 1
            elif tag.startswith('I-'):
                if current_entity == tag[2:]:
                    current_entity_length += 1
            elif current_entity is not None:
                # Non-entity token closes the open entity.
                entity_lengths.setdefault(current_entity, []).append(current_entity_length)
                current_entity = None
                current_entity_length = 0

        # Record an entity that runs to the end of the sentence; without this
        # the final entity of the data would be missing from the averages.
        if current_entity is not None:
            entity_lengths.setdefault(current_entity, []).append(current_entity_length)

    avg_entity_length = {
        entity: (sum(lengths) / len(lengths) if lengths else 0)
        for entity, lengths in entity_lengths.items()
    }

    avg_sentence_length = total_words / num_sentences if sentence_lengths else 0
    max_sentence_length = max(sentence_lengths) if sentence_lengths else 0

    return {
        'num_sentences': num_sentences,
        'total_words': total_words,
        'num_unique_words': len(unique_words),
        'entity_counts': dict(entity_counts),
        'avg_entity_length': avg_entity_length,
        'avg_sentence_length': avg_sentence_length,
        'max_sentence_length': max_sentence_length
    }


def visualize_entity_distribution(stats: Dict, save_path: Optional[str] = None) -> None:
    """
    Draw a bar chart with one bar per entity type.

    Bar labels and heights come from ``stats['entity_counts']``. The chart is
    written to ``save_path`` when one is given and shown interactively
    otherwise; in both cases the figure is closed afterwards.

    Args:
        stats (Dict): Dataset statistics (must contain 'entity_counts').
        save_path (Optional[str]): File path for saving the plot. If None, plot is shown.
    """
    plt.figure(figsize=(12, 6))

    distribution = stats['entity_counts']
    labels = list(distribution.keys())
    heights = list(distribution.values())

    plt.bar(labels, heights)
    plt.xlabel('Entity Type')
    plt.ylabel('Count')
    plt.title('Distribution of Entity Types')
    plt.xticks(rotation=45)

    if not save_path:
        plt.show()
    else:
        # 'tight' keeps the rotated tick labels inside the saved image.
        plt.savefig(save_path, bbox_inches='tight')
    plt.close()

# =============================================================================
# Text Preprocessing Functions (Using NLTK and SpaCy)
# =============================================================================
def preprocess_text(text: str, remove_stopwords: bool = True, lemmatize: bool = True) -> str:
    """
    Normalize a text string with NLTK.

    The text is lowercased and split with ``nltk.word_tokenize``. English
    stopwords are then filtered out and the remaining tokens are passed through
    ``WordNetLemmatizer``; each of those two steps can be switched off.

    Args:
        text (str): Input text to be processed.
        remove_stopwords (bool): Flag indicating whether to remove stopwords. Defaults to True.
        lemmatize (bool): Flag indicating whether to perform lemmatization. Defaults to True.

    Returns:
        str: The preprocessed text as a single space-separated string.
    """
    words = nltk.word_tokenize(text.lower())

    if remove_stopwords:
        # Set membership keeps the per-token check cheap.
        blocked = set(stopwords.words('english'))
        words = [word for word in words if word not in blocked]

    if lemmatize:
        to_base_form = WordNetLemmatizer().lemmatize
        words = [to_base_form(word) for word in words]

    return ' '.join(words)


def preprocess_spacy(text: str, remove_stopwords: bool = True, lemmatize: bool = True) -> str:
    """
    Normalize a text string with the module-level SpaCy pipeline ``nlp``.

    The text is run through ``nlp``; tokens flagged ``is_stop`` are dropped
    when stopword removal is on, and each kept token contributes its lemma
    (or its surface text when lemmatization is off), lowercased.

    Args:
        text (str): Input text to be processed.
        remove_stopwords (bool): Flag indicating whether to remove stopwords. Defaults to True.
        lemmatize (bool): Flag indicating whether to use token lemmas. Defaults to True.

    Returns:
        str: Processed text as a single string.
    """
    kept_tokens = (
        tok for tok in nlp(text)
        if not (remove_stopwords and tok.is_stop)
    )
    # Lowercasing happens after lemma/text selection.
    normalized = [
        (tok.lemma_ if lemmatize else tok.text).lower()
        for tok in kept_tokens
    ]
    return ' '.join(normalized)

# =============================================================================
# Data Conversion Functions for Downstream Tasks
# =============================================================================
def convert_to_spacy_format(sentences: List[List[Tuple[str, str]]], output_file: str) -> List:
    """
    Convert parsed CoNLL data into a format suitable for SpaCy training.

    The output format is a list of tuples, where each tuple contains:
      - A full sentence as a string (tokens joined by single spaces).
      - A dictionary with a key 'entities' mapping to a list of tuples.
        Each tuple in 'entities' contains (character_start, character_end, entity_type),
        with character_end exclusive.

    An entity starts at a 'B-<type>' tag and extends over the directly
    following 'I-<type>' tags of the same type. 'I-' tags that do not follow a
    matching 'B-' are not turned into entities.

    The formatted data is saved as a pickle file.

    Args:
        sentences (List[List[Tuple[str, str]]]): Parsed sentences with (word, tag) tuples.
        output_file (str): File path where the output pickle file is saved.

    Returns:
        List: A list of training data formatted for SpaCy.
    """
    training_data = []

    for sentence in sentences:
        words = [word for word, _ in sentence]
        tags = [tag for _, tag in sentence]
        text = ' '.join(words)

        # Character offset at which each token starts inside `text`. Computed
        # once per sentence instead of re-joining the prefix for every entity.
        token_starts = []
        cursor = 0
        for word in words:
            token_starts.append(cursor)
            cursor += len(word) + 1  # +1 for the joining space

        entities = []
        i = 0
        while i < len(tags):
            if not tags[i].startswith('B-'):
                i += 1
                continue

            entity_type = tags[i][2:]
            continuation = f"I-{entity_type}"

            # `end` is one past the last token of the entity.
            end = i + 1
            while end < len(tags) and tags[end] == continuation:
                end += 1

            char_start = token_starts[i]
            char_end = token_starts[end - 1] + len(words[end - 1])
            entities.append((char_start, char_end, entity_type))
            i = end

        training_data.append((text, {'entities': entities}))

    # Save the formatted data in pickle format for efficient reloading.
    with open(output_file, 'wb') as f:
        pickle.dump(training_data, f)

    return training_data


def convert_to_bio_format(sentences: List[List[Tuple[str, str]]], output_file: str) -> None:
    """
    Write the dataset as a two-column BIO text file.

    Every token becomes one "<word> <tag>" line, and each sentence (including
    the last one) is followed by an empty line.

    Args:
        sentences (List[List[Tuple[str, str]]]): Parsed sentences with (word, tag) tuples.
        output_file (str): File path where the BIO formatted text file will be saved.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for token_tag_pairs in sentences:
            out.writelines(f"{word} {tag}\n" for word, tag in token_tag_pairs)
            out.write("\n")  # Sentence terminator.


def convert_to_json_format(sentences: List[List[Tuple[str, str]]], output_file: str) -> None:
    """
    Serialize the dataset as a JSON array, one object per sentence.

    Each object carries three keys:
      - 'text': The tokens joined by single spaces.
      - 'tokens': The list of tokens.
      - 'tags': The list of tags, parallel to 'tokens'.

    The file is written with a two-space indent.

    Args:
        sentences (List[List[Tuple[str, str]]]): Parsed sentences with (word, tag) tuples.
        output_file (str): File path where the JSON output will be saved.
    """
    def as_record(token_tag_pairs):
        # Split the pairs into two parallel columns.
        tokens = [pair[0] for pair in token_tag_pairs]
        labels = [pair[1] for pair in token_tag_pairs]
        return {'text': ' '.join(tokens), 'tokens': tokens, 'tags': labels}

    records = [as_record(sentence) for sentence in sentences]

    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(records, out, indent=2)


def preprocess_data_for_transformers(sentences: List[List[Tuple[str, str]]]) -> Tuple[List, List]:
    """
    Split (word, tag) sentences into two parallel lists of sequences.

    No subword tokenization happens here; the words are passed through as they
    appear in the input.

    Args:
        sentences (List[List[Tuple[str, str]]]): Parsed sentences with (word, tag) tuples.

    Returns:
        Tuple[List, List]: ``(tokenized_texts, tags_list)`` where
            ``tokenized_texts[i]`` is the word list of sentence ``i`` and
            ``tags_list[i]`` is its tag list.
    """
    tokenized_texts = [[word for word, _ in sentence] for sentence in sentences]
    tags_list = [[tag for _, tag in sentence] for sentence in sentences]
    return tokenized_texts, tags_list


def create_entity_labels_mapping(sentences: List[List[Tuple[str, str]]]) -> Tuple[Dict, Dict]:
    """
    Build the tag <-> integer ID lookup tables used for model training.

    IDs are assigned from 0 upwards in sorted order of the distinct tags found
    in ``sentences``, so the same set of tags always yields the same mapping.

    Args:
        sentences (List[List[Tuple[str, str]]]): Parsed sentences with (word, tag) tuples.

    Returns:
        Tuple[Dict, Dict]: A tuple containing:
            - tag_to_id: Mapping from entity tag to a unique integer ID.
            - id_to_tag: Reverse mapping from the integer ID to the entity tag.
    """
    ordered_tags = sorted({tag for sentence in sentences for _, tag in sentence})

    id_to_tag = dict(enumerate(ordered_tags))
    tag_to_id = {tag: idx for idx, tag in id_to_tag.items()}
    return tag_to_id, id_to_tag

# =============================================================================
# Main Preprocessing Pipeline Execution
# =============================================================================
def main(data_dir: str = "/content/data") -> None:
    """
    Execute the complete preprocessing pipeline for the CoNLL-2003 dataset.

    This includes:
      - Reading and parsing the dataset.
      - Computing and displaying dataset statistics.
      - Visualizing entity distributions.
      - Converting data to multiple formats (SpaCy, BIO, JSON).
      - Preparing and saving data for transformer-based models.

    Args:
        data_dir (str): Root data directory. The raw files are read from
            ``<data_dir>/conll2003/eng.{train,testa,testb}``; plots are written
            to ``<data_dir>/visualizations`` and converted data to
            ``<data_dir>/processed``. Defaults to "/content/data".
            NOTE(review): the default presumably targets a Colab runtime whose
            working directory is /content; pass another root when running
            elsewhere.
    """
    # Output split name -> raw CoNLL file name ("testa" is the dev split,
    # "testb" the test split). Insertion order fixes the processing order.
    split_files = {"train": "eng.train", "dev": "eng.testa", "test": "eng.testb"}
    conll_dir = os.path.join(data_dir, "conll2003")
    sentences_by_split = {
        split: read_conll_file(os.path.join(conll_dir, file_name))
        for split, file_name in split_files.items()
    }
    train_sentences = sentences_by_split["train"]

    # Compute statistics for the training dataset.
    train_stats = explore_dataset(train_sentences)
    print("Training Dataset Statistics:")
    print(f"  Number of sentences: {train_stats['num_sentences']}")
    print(f"  Total words: {train_stats['total_words']}")
    print(f"  Unique words: {train_stats['num_unique_words']}")
    print(f"  Entity counts: {train_stats['entity_counts']}")
    print(f"  Average sentence length: {train_stats['avg_sentence_length']}")

    # Visualize the entity distribution and save the plot to disk.
    vis_dir = os.path.join(data_dir, "visualizations")
    os.makedirs(vis_dir, exist_ok=True)
    visualize_entity_distribution(
        train_stats,
        save_path=os.path.join(vis_dir, "entity_distribution_train.png")
    )

    # One output directory per target format.
    processed_dir = os.path.join(data_dir, "processed")
    spacy_dir = os.path.join(processed_dir, "spacy")
    bio_dir = os.path.join(processed_dir, "bio")
    json_dir = os.path.join(processed_dir, "json")
    transformer_dir = os.path.join(processed_dir, "transformer")
    for directory in [spacy_dir, bio_dir, json_dir, transformer_dir]:
        os.makedirs(directory, exist_ok=True)

    print("Converting data to SpaCy format...")
    for split, sentences in sentences_by_split.items():
        convert_to_spacy_format(sentences, os.path.join(spacy_dir, f"{split}.pickle"))

    print("Converting data to BIO format...")
    for split, sentences in sentences_by_split.items():
        convert_to_bio_format(sentences, os.path.join(bio_dir, f"{split}.txt"))

    print("Converting data to JSON format...")
    for split, sentences in sentences_by_split.items():
        convert_to_json_format(sentences, os.path.join(json_dir, f"{split}.json"))

    print("Preparing data for transformer models...")
    transformer_data = {}
    for split, sentences in sentences_by_split.items():
        tokens, tags = preprocess_data_for_transformers(sentences)
        transformer_data[split] = {"texts": tokens, "tags": tags}

    # The label mapping is built over all splits so that a tag occurring only
    # in dev or test still receives an ID.
    all_sentences = [
        sentence
        for sentences in sentences_by_split.values()
        for sentence in sentences
    ]
    tag_to_id, id_to_tag = create_entity_labels_mapping(all_sentences)
    mapping_file = os.path.join(transformer_dir, "tag_mappings.json")
    with open(mapping_file, 'w', encoding='utf-8') as f:
        # JSON object keys are strings, so the integer IDs are converted explicitly.
        json.dump({
            "tag_to_id": tag_to_id,
            "id_to_tag": {str(k): v for k, v in id_to_tag.items()}
        }, f, indent=2)

    # Save the transformer-ready data as a pickle file.
    transformer_file = os.path.join(transformer_dir, "transformer_data.pickle")
    with open(transformer_file, 'wb') as f:
        pickle.dump(transformer_data, f)

    # Demonstrate both text preprocessors on the first training sentence.
    print("\nText Preprocessing Examples:")
    if train_sentences:
        sample_text = ' '.join([word for word, _ in train_sentences[0]])
        print(f"Original: {sample_text}")
        print(f"NLTK processed: {preprocess_text(sample_text)}")
        print(f"SpaCy processed: {preprocess_spacy(sample_text)}")

    print("\nPreprocessing completed successfully!")
    print(f"Processed data has been saved to the directory: {processed_dir}")

# =============================================================================
# Script Entry Point
# =============================================================================
# Run the pipeline only when this code is executed as the main program, so
# importing it for its functions does not trigger the full preprocessing run.
if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python
"""
===============================================================================
BERT-based NER Training Pipeline with Hyperparameter Tuning
===============================================================================
This script implements a complete pipeline for training and evaluating a BERT-
based Named Entity Recognition (NER) model. It performs the following steps:

  1. Data Loading:
     - Loads preprocessed transformer data and tag mappings from disk.

  2. Data Encoding:
     - Encodes tokenized texts and their labels using a BERT tokenizer.
     - Handles subword tokenization and label alignment.

  3. DataLoader Creation:
     - Constructs PyTorch DataLoaders for training, validation, and testing.

  4. Model Training:
     - Trains the model using gradient accumulation and optionally mixed precision.
     - Periodically evaluates the model on a validation set and logs metrics.

  5. Model Evaluation:
     - Evaluates the trained model by calculating token-level and entity-level metrics.

  6. Hyperparameter Tuning:
     - Conducts grid search over key hyperparameters (learning rate, batch size, etc.)
       to determine the optimal configuration.

  7. Visualization and Saving:
     - Visualizes training history.
     - Saves the trained model, tokenizer, tag mappings, and evaluation metrics.

===============================================================================
Author: Your Name
Date: 2025-03-20
===============================================================================
"""

# =============================================================================
# Standard Library Imports
# =============================================================================
import os          # File and directory operations
import json        # Reading and writing JSON files
import pickle      # Serializing and deserializing Python objects
import numpy as np # Numerical operations and array handling
from typing import Dict, List, Tuple  # Type annotations for clarity

# =============================================================================
# PyTorch and Related Imports
# =============================================================================
import torch
from torch.utils.data import DataLoader, TensorDataset  # Data loading utilities
from torch.optim import AdamW                           # Optimizer with weight decay
from torch.nn import CrossEntropyLoss                   # Loss function for classification

# =============================================================================
# Scikit-learn and Transformers Imports
# =============================================================================
from sklearn.metrics import precision_recall_fscore_support, accuracy_score  # Metrics for evaluation
from sklearn.model_selection import ParameterGrid  # Hyperparameter grid search
from transformers import (
    BertTokenizer,
    BertForTokenClassification,
    get_linear_schedule_with_warmup
)  # Pretrained BERT modules and scheduling

# =============================================================================
# Visualization and Progress Bar Imports
# =============================================================================
import matplotlib.pyplot as plt  # Plotting library for visualization
import seaborn as sns            # Advanced visualization (if needed)
from tqdm import tqdm            # Progress bar for loops


# =============================================================================
# Data Loading Function
# =============================================================================
def load_preprocessed_data(data_dir: str) -> Tuple[Dict, Dict]:
    """
    Load preprocessed transformer data and tag mappings from disk.

    Both files are read from the ``processed/transformer`` subdirectory of ``data_dir``:
      - ``transformer_data.pickle``: the pickled transformer-formatted data.
      - ``tag_mappings.json``: the tag mappings, decoded as UTF-8.

    Args:
        data_dir (str): Root directory containing the processed data.

    Returns:
        Tuple[Dict, Dict]: A tuple where:
            - The first element is the object stored in the pickle file.
            - The second element is the decoded content of the JSON file.

    Raises:
        FileNotFoundError: If either of the two files does not exist.
    """
    transformer_dir = os.path.join(data_dir, "processed", "transformer")

    # SECURITY: unpickling can execute arbitrary code embedded in the file, so
    # only load pickles that were produced by a trusted preprocessing run.
    with open(os.path.join(transformer_dir, "transformer_data.pickle"), "rb") as f:
        transformer_data = pickle.load(f)

    # JSON is UTF-8 by specification; being explicit keeps the result independent
    # of the platform's default locale encoding.
    with open(os.path.join(transformer_dir, "tag_mappings.json"), "r", encoding="utf-8") as f:
        tag_mappings = json.load(f)

    return transformer_data, tag_mappings


# =============================================================================
# Data Encoding Function
# =============================================================================
def encode_dataset(texts: List[List[str]], tags: List[List[str]], tokenizer,
                   tag_to_id: Dict, max_length: int = 128) -> Tuple:
    """
    Encode the dataset using the BERT tokenizer while aligning word labels with subwords.

    This function iterates over each sentence and its corresponding tag sequence.
    For each word:
      - It tokenizes the word into subword tokens.
      - The first subword receives the word's label ID.
      - Any subsequent subwords receive the ignore label -100.

    The special tokens ([CLS] and [SEP]) and all padding positions are labelled -100 as well.
    -100 is the value that the loss functions in this file are configured to ignore
    (``CrossEntropyLoss(ignore_index=-100)``) and that the evaluation code filters out, so
    only the first subword of every word is trained on and scored.

    After processing, the sequence is truncated to fit ``max_length`` (including the two
    special tokens) and padded to exactly ``max_length`` positions.

    Args:
        texts (List[List[str]]): List of tokenized sentences (each sentence is a list of words).
        tags (List[List[str]]): List of tag sequences (each is a list of tags for the sentence).
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``, ``pad_token_id``,
            ``cls_token``, ``sep_token`` and ``unk_token``.
        tag_to_id (Dict): Dictionary mapping tag strings to integer IDs.
        max_length (int): Fixed length of every encoded sequence (default is 128).

    Returns:
        Tuple: ``(input_ids, attention_masks, labels)`` as ``torch.long`` tensors with one row
        per encoded sentence and ``max_length`` columns.

    Raises:
        ValueError: If ``max_length`` leaves no room for the two special tokens.
        KeyError: If a tag is missing from ``tag_to_id``.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold the [CLS] and [SEP] tokens")

    # Label for every position that must not contribute to the loss or the metrics.
    # Using a real tag ID here (e.g. the ID of "O") would train and score the model on
    # subword continuations, special tokens and padding.
    ignore_label_id = -100
    pad_token_id = tokenizer.pad_token_id
    # Number of positions left for real tokens once [CLS] and [SEP] are accounted for.
    content_length = max_length - 2

    input_ids = []        # Encoded token IDs, one list per sentence.
    attention_masks = []  # 1 for real tokens, 0 for padding.
    labels = []           # Label IDs aligned with the token IDs.

    # Process each sentence and its corresponding tags.
    for sentence_tokens, sentence_tags in tqdm(zip(texts, tags), total=len(texts), desc="Encoding dataset"):
        bert_tokens = []  # Subword tokens of the sentence.
        bert_labels = []  # Labels aligned with the subword tokens.

        for word, tag in zip(sentence_tokens, sentence_tags):
            # The tokenizer can return no subwords at all for a word (presumably for input
            # it strips completely). Fall back to the unknown token so that tokens and
            # labels always stay the same length.
            word_tokens = tokenizer.tokenize(word) or [tokenizer.unk_token]
            bert_tokens.extend(word_tokens)
            # First subword carries the true label; the remaining ones are ignored.
            bert_labels.append(tag_to_id[tag])
            bert_labels.extend([ignore_label_id] * (len(word_tokens) - 1))

        # Truncate so that the sentence still fits after adding the special tokens.
        bert_tokens = bert_tokens[:content_length]
        bert_labels = bert_labels[:content_length]

        # Add special tokens: [CLS] at the beginning and [SEP] at the end.
        bert_tokens = [tokenizer.cls_token] + bert_tokens + [tokenizer.sep_token]
        bert_labels = [ignore_label_id] + bert_labels + [ignore_label_id]

        token_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        attention_mask = [1] * len(token_ids)

        # Pad all three sequences up to max_length.
        padding_length = max_length - len(token_ids)
        token_ids += [pad_token_id] * padding_length
        attention_mask += [0] * padding_length
        bert_labels += [ignore_label_id] * padding_length

        input_ids.append(token_ids)
        attention_masks.append(attention_mask)
        labels.append(bert_labels)

    # An explicit dtype and shape keep the result a 2-D integer tensor even when there
    # are no sentences (torch.tensor([]) would otherwise be a 1-D float tensor).
    return (
        torch.tensor(input_ids, dtype=torch.long).reshape(-1, max_length),
        torch.tensor(attention_masks, dtype=torch.long).reshape(-1, max_length),
        torch.tensor(labels, dtype=torch.long).reshape(-1, max_length),
    )


# =============================================================================
# DataLoader Creation Function
# =============================================================================
def create_data_loaders(train_inputs, train_masks, train_labels,
                        val_inputs=None, val_masks=None, val_labels=None,
                        test_inputs=None, test_masks=None, test_labels=None,
                        batch_size=16, num_workers=4) -> Dict:
    """
    Build the PyTorch DataLoaders for the available dataset splits.

    Each split's three tensors are wrapped in a TensorDataset and then in a
    DataLoader that uses the shared ``batch_size`` and ``num_workers`` settings
    with pinned memory enabled. Only the training loader shuffles its data.

    Args:
        train_inputs, train_masks, train_labels: Tensors for the training set.
        val_inputs, val_masks, val_labels: Tensors for the validation set (optional).
        test_inputs, test_masks, test_labels: Tensors for the test set (optional).
        batch_size (int): Number of samples per batch.
        num_workers (int): Number of subprocesses to use for data loading.

    Returns:
        Dict: Always contains the key 'train'; 'validation' and 'test' are present
        only when the corresponding inputs tensor was provided.
    """
    def build_loader(split_tensors, shuffle=False):
        # All splits share the same batching / worker / pinning configuration.
        return DataLoader(
            TensorDataset(*split_tensors),
            batch_size=batch_size,
            shuffle=shuffle,
            pin_memory=True,
            num_workers=num_workers,
        )

    # The training split is mandatory and is the only one that gets shuffled.
    loaders = {
        "train": build_loader((train_inputs, train_masks, train_labels), shuffle=True),
    }

    # Optional splits are added only when their inputs tensor is present.
    optional_splits = (
        ("validation", (val_inputs, val_masks, val_labels)),
        ("test", (test_inputs, test_masks, test_labels)),
    )
    for split_name, split_tensors in optional_splits:
        if split_tensors[0] is None:
            continue
        loaders[split_name] = build_loader(split_tensors)

    return loaders


# =============================================================================
# Model Training Function
# =============================================================================
def train_model(model, data_loaders, optimizer, scheduler, device,
                num_epochs=3, evaluation_steps=100, id_to_tag=None,
                gradient_accumulation_steps=1):
    """
    Train the BERT-based NER model and periodically evaluate it on validation data.

    Gradients are accumulated over ``gradient_accumulation_steps`` batches before each
    optimizer update, and clipped to a norm of 1.0. Mixed precision (autocast plus
    gradient scaling) is used only when ``device`` is a CUDA device and CUDA is available.

    If a 'validation' loader is present, the model is evaluated:
      - every ``evaluation_steps`` optimizer updates (results are appended to the history), and
      - once at the end of every epoch (results are only printed).

    Args:
        model: The BERT model for token classification; its forward pass must return an
            object with a ``loss`` attribute when called with ``labels``.
        data_loaders: Dictionary of DataLoaders; 'train' is required, 'validation' is optional.
        optimizer: Optimizer for updating model parameters.
        scheduler: Learning rate scheduler, stepped once per optimizer update.
        device: Device to use for training (CPU or GPU).
        num_epochs (int): Number of epochs to train.
        evaluation_steps (int): Number of optimizer updates between two step-level evaluations.
        id_to_tag: Mapping from label IDs to tag strings (for computing metrics).
        gradient_accumulation_steps (int): Number of batches over which to accumulate gradients.

    Returns:
        Tuple: The trained model and a history dictionary with the keys 'train_loss'
        (one entry per epoch) and 'val_loss' / 'val_f1' / 'val_accuracy' (one entry per
        step-level evaluation).
    """
    target_device = torch.device(device)
    # Base the mixed-precision decision on the device that is actually used, not merely
    # on whether a GPU exists on the machine.
    use_amp = torch.cuda.is_available() and target_device.type == "cuda"

    if use_amp:
        torch.backends.cudnn.benchmark = True  # Enable cuDNN autotuner for faster performance.
        print(f"Using GPU: {torch.cuda.get_device_name(target_device)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(target_device).total_memory / 1e9:.2f} GB")

    model.to(device)  # Move the model to the specified device.
    model.train()     # Set the model to training mode.

    # The gradient scaler is only needed for mixed precision training.
    scaler = torch.cuda.amp.GradScaler() if use_amp else None
    has_validation = "validation" in data_loaders

    # Training history (losses and metrics).
    history = {
        "train_loss": [],
        "val_loss": [],
        "val_f1": [],
        "val_accuracy": []
    }

    global_step = 0  # Number of optimizer updates performed so far.

    for epoch in range(num_epochs):
        total_train_loss = 0  # Sum of the (unscaled) batch losses of this epoch.
        progress_bar = tqdm(data_loaders["train"], desc=f"Epoch {epoch+1}/{num_epochs}")

        # NOTE(review): the accumulation window restarts with ``step`` at every epoch, so if
        # the number of batches is not a multiple of gradient_accumulation_steps the trailing
        # gradients of an epoch are carried into the first update of the next one (and the
        # ones of the last epoch are never applied).
        for step, batch in enumerate(progress_bar):
            # Transfer the batch to the device (GPU or CPU).
            batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]

            if use_amp:
                with torch.cuda.amp.autocast():
                    outputs = model(
                        input_ids=batch_inputs,
                        attention_mask=batch_masks,
                        labels=batch_labels
                    )
                    # Divide so that the accumulated gradient is an average over the window.
                    loss = outputs.loss / gradient_accumulation_steps
                scaler.scale(loss).backward()
            else:
                outputs = model(
                    input_ids=batch_inputs,
                    attention_mask=batch_masks,
                    labels=batch_labels
                )
                loss = outputs.loss / gradient_accumulation_steps
                loss.backward()

            # Report the loss on its original scale (undo the accumulation division).
            batch_loss = loss.item() * gradient_accumulation_steps
            total_train_loss += batch_loss
            progress_bar.set_postfix({"loss": batch_loss})

            # Keep accumulating until the window is complete.
            if (step + 1) % gradient_accumulation_steps != 0:
                continue

            if use_amp:
                scaler.unscale_(optimizer)  # Unscale gradients before clipping.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            scheduler.step()   # Update the learning rate.
            model.zero_grad()  # Start the next accumulation window from zero.
            global_step += 1

            # Evaluate only right after an optimizer update. Checking on every batch would
            # re-run the evaluation for each batch of the following accumulation window,
            # because global_step does not change in between.
            if has_validation and global_step % evaluation_steps == 0:
                val_metrics = evaluate_model(model, data_loaders["validation"], device, id_to_tag)
                history["val_loss"].append(val_metrics["loss"])
                history["val_f1"].append(val_metrics["f1"])
                history["val_accuracy"].append(val_metrics["accuracy"])
                print(f"\nStep {global_step}: Validation Loss: {val_metrics['loss']:.4f}, "
                      f"F1: {val_metrics['f1']:.4f}, Accuracy: {val_metrics['accuracy']:.4f}")
                model.train()  # Return model to training mode after evaluation.

        # Average training loss over the batches of this epoch.
        avg_train_loss = total_train_loss / len(data_loaders["train"])
        history["train_loss"].append(avg_train_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Average training loss: {avg_train_loss:.4f}")

        # At the end of each epoch, perform a full validation evaluation (printed only).
        if has_validation:
            val_metrics = evaluate_model(model, data_loaders["validation"], device, id_to_tag)
            print(f"Validation - Loss: {val_metrics['loss']:.4f}, "
                  f"F1: {val_metrics['f1']:.4f}, Accuracy: {val_metrics['accuracy']:.4f}")
            # evaluate_model leaves the model in eval mode; switch back before the next epoch.
            model.train()

    return model, history


# =============================================================================
# Model Evaluation Function
# =============================================================================
def evaluate_model(model, data_loader, device, id_to_tag=None):
    """
    Evaluate the BERT NER model on a given dataset and compute performance metrics.

    The loss is the mean of the per-batch cross-entropy losses, computed over positions
    whose attention mask is 1 and whose label is not -100. Metrics are computed over the
    same positions.

    NOTE: all precision/recall/F1 values are micro-averaged over single-label token
    predictions, which (per the scikit-learn documentation) makes them identical to
    accuracy. The "entity_*" metrics are token-level scores restricted to tokens whose
    true tag is not "O"; they are not span-level entity scores.

    Args:
        model: BERT model for token classification; must expose ``config.num_labels`` and
            return an object with a ``logits`` attribute.
        data_loader: DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
        device: Device on which evaluation is performed.
        id_to_tag: Optional mapping from label IDs (int or numeric string) to tag strings.

    Returns:
        Dict: Always contains 'loss'. If at least one token was scored it also contains
        'precision', 'recall', 'f1' and 'accuracy', plus 'entity_precision',
        'entity_recall' and 'entity_f1' when ``id_to_tag`` is provided.
    """
    model.eval()  # Set model to evaluation mode.
    loss_fn = CrossEntropyLoss(ignore_index=-100)  # Positions labelled -100 are skipped.
    # Use mixed precision only when evaluation really runs on a CUDA device.
    use_amp = torch.cuda.is_available() and torch.device(device).type == "cuda"

    total_loss = 0
    true_labels = []       # True label IDs of all scored tokens.
    predicted_labels = []  # Predicted label IDs of all scored tokens.

    # Disable gradient computation during evaluation.
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]

            if use_amp:
                with torch.cuda.amp.autocast():
                    outputs = model(
                        input_ids=batch_inputs,
                        attention_mask=batch_masks
                    )
            else:
                outputs = model(
                    input_ids=batch_inputs,
                    attention_mask=batch_masks
                )
            logits = outputs.logits

            # Relabel padded positions (attention mask 0) as -100 so the loss ignores them.
            inactive = batch_masks.reshape(-1) != 1
            loss_labels = batch_labels.reshape(-1).masked_fill(inactive, -100)
            # Compute the loss in float32 so that half-precision logits produced under
            # autocast do not reduce its accuracy.
            loss = loss_fn(logits.float().reshape(-1, model.config.num_labels), loss_labels)
            total_loss += loss.item()

            # Predicted label = index of the largest logit.
            batch_preds = torch.argmax(logits, dim=2)

            # Keep only real tokens that carry a label; boolean indexing preserves the
            # row-major (sentence, position) order.
            scored = (batch_labels != -100) & (batch_masks == 1)
            true_labels.extend(batch_labels[scored].tolist())
            predicted_labels.extend(batch_preds[scored].tolist())

    # Average loss per batch.
    metrics = {"loss": total_loss / len(data_loader)}

    # Without any scored token there is nothing further to compute.
    if not true_labels:
        return metrics

    if id_to_tag:
        # Keys may be strings (e.g. after a JSON round trip); normalise them to int.
        tag_lookup = {int(k): v for k, v in id_to_tag.items()}
        true_tags = [tag_lookup.get(label, "O") for label in true_labels]
        pred_tags = [tag_lookup.get(label, "O") for label in predicted_labels]

        # Restrict to tokens whose true tag is an entity tag (anything but "O").
        entity_pairs = [(t, p) for t, p in zip(true_tags, pred_tags) if t != "O"]
        entity_true = [t for t, _ in entity_pairs]
        entity_pred = [p for _, p in entity_pairs]

        entity_precision, entity_recall, entity_f1, _ = precision_recall_fscore_support(
            entity_true, entity_pred, average='micro', zero_division=0
        )
        metrics["entity_precision"] = entity_precision
        metrics["entity_recall"] = entity_recall
        metrics["entity_f1"] = entity_f1

    # Token-level metrics across all scored tokens (including "O").
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='micro', zero_division=0
    )
    accuracy = accuracy_score(true_labels, predicted_labels)
    metrics.update({
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    })

    return metrics


# =============================================================================
# Hyperparameter Tuning Function
# =============================================================================
def hyperparameter_tuning(train_inputs, train_masks, train_labels,
                          val_inputs, val_masks, val_labels,
                          tag_to_id, id_to_tag, device, num_epochs=3,
                          num_workers=4, param_grid=None,
                          model_name="bert-base-cased"):
    """
    Perform hyperparameter tuning over a grid of hyperparameters.

    Every combination in the grid is used to train a freshly loaded model for
    ``num_epochs`` epochs, which is then evaluated on the validation set. The
    configuration with the highest validation 'f1' is returned; on ties the first
    one encountered wins.

    Args:
        train_inputs, train_masks, train_labels: Training tensors.
        val_inputs, val_masks, val_labels: Validation tensors.
        tag_to_id: Mapping from tag strings to integer IDs.
        id_to_tag: Mapping from integer IDs to tag strings.
        device: Device to use for training (CPU or GPU).
        num_epochs (int): Number of epochs for tuning (fewer epochs for quick search).
        num_workers (int): Number of DataLoader workers.
        param_grid (dict): Optional grid in ``ParameterGrid`` format. It must provide the
            keys 'learning_rate', 'batch_size', 'weight_decay' and
            'gradient_accumulation_steps'. Defaults to the built-in grid below.
        model_name (str): Name or path of the pretrained checkpoint to fine-tune.

    Returns:
        Tuple: The best hyperparameter configuration and its corresponding evaluation
        metrics, or ``(None, None)`` if the grid is empty.
    """
    if param_grid is None:
        # Default search space.
        param_grid = {
            'learning_rate': [2e-5, 3e-5, 5e-5],
            'batch_size': [16, 32, 64],
            'weight_decay': [0.0, 0.01],
            'gradient_accumulation_steps': [1, 2, 4]
        }

    # Start below any reachable score so that a configuration is selected even when
    # every F1 score is 0.
    best_f1 = float("-inf")
    best_params = None
    best_metrics = None

    # Iterate over each combination in the hyperparameter grid.
    for params in ParameterGrid(param_grid):
        print(f"\n=== Testing parameters: {params} ===\n")
        accumulation_steps = params['gradient_accumulation_steps']

        # Create DataLoaders using the current batch size.
        loaders = create_data_loaders(
            train_inputs, train_masks, train_labels,
            val_inputs, val_masks, val_labels,
            batch_size=params['batch_size'],
            num_workers=num_workers
        )

        # Initialize a new model for each hyperparameter configuration.
        model = BertForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(tag_to_id)
        )

        # Setup the optimizer with the current learning rate and weight decay.
        optimizer = AdamW(
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay']
        )

        # Number of optimizer updates; the first 10% are used for warmup.
        total_steps = len(loaders['train']) * num_epochs // accumulation_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )

        # Train the model with the current hyperparameters. train_model already validates
        # after every epoch; evaluation_steps only sets the interval, counted in optimizer
        # updates, of its additional step-level evaluations.
        model, _ = train_model(
            model, loaders, optimizer, scheduler,
            device, num_epochs=num_epochs,
            evaluation_steps=len(loaders['train']),
            id_to_tag=id_to_tag,
            gradient_accumulation_steps=accumulation_steps
        )

        # Evaluate the model on the validation set. 'f1' is absent when no token could
        # be scored; treat that as the worst possible score instead of failing.
        metrics = evaluate_model(model, loaders['validation'], device, id_to_tag)
        f1 = metrics.get('f1', 0.0)
        if f1 > best_f1:
            best_f1 = f1
            best_params = params
            best_metrics = metrics

        print(f"F1 score: {f1:.4f}\n")

        # Release this configuration's model and optimizer state before the next one is
        # loaded, so that two models are never held in memory at the same time.
        del model, optimizer, scheduler
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return best_params, best_metrics


# =============================================================================
# Training History Visualization Function
# =============================================================================
def visualize_training_history(history, save_path=None):
    """
    Plot the recorded training history as a two-panel figure.

    Panels:
      - Left: training loss, plus validation loss when it has been recorded.
      - Right: validation F1 score and accuracy, each only when recorded.

    The figure is written to ``save_path`` when one is given and shown on screen
    otherwise; in both cases it is closed afterwards.

    Args:
        history (dict): Recorded training history. 'train_loss' is required;
            'val_loss', 'val_f1' and 'val_accuracy' are optional.
        save_path (str): Optional file path to save the generated plot.
    """
    def recorded(key):
        # Return the series stored under ``key``, or None when the key is absent.
        return history[key] if key in history else None

    def plot_if_recorded(key, label):
        # Draw a series only when it exists and is non-empty.
        series = recorded(key)
        if series:
            plt.plot(series, label=label, marker="o")

    plt.figure(figsize=(15, 5))

    # Left panel: loss curves.
    plt.subplot(1, 2, 1)
    plt.plot(history["train_loss"], label="Training Loss", marker="o")
    plot_if_recorded("val_loss", "Validation Loss")
    plt.title("Loss During Training")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # Right panel: evaluation metrics.
    plt.subplot(1, 2, 2)
    for metric_key, metric_label in (("val_f1", "F1 Score"), ("val_accuracy", "Accuracy")):
        plot_if_recorded(metric_key, metric_label)
    plt.title("Evaluation Metrics During Training")
    plt.xlabel("Evaluation Step")
    plt.ylabel("Score")
    plt.legend()

    plt.tight_layout()

    # Display the figure unless a target file was requested.
    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


# =============================================================================
# Main Function: Pipeline Execution
# =============================================================================
def main(data_dir="/content/data"):
    """
    Execute the complete training and evaluation pipeline for the BERT-based NER model.

    This includes:
      - Device and DataLoader worker configuration.
      - Loading preprocessed data and tag mappings.
      - Encoding datasets using the BERT tokenizer.
      - Hyperparameter tuning on a random subset (at most 5000 examples) of
        the training data. This step always runs.
      - Final model training with the best hyperparameters.
      - Visualization of training history.
      - Evaluation on the test set.
      - Saving the trained model and related artifacts.

    Args:
        data_dir (str): Directory the preprocessed data is loaded from. The
            "visualizations" and "models" output directories are created
            beneath it. Defaults to "/content/data".
    """
    # Configure the device to use (prefer GPU if available).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # If using GPU, clear cache and set device properties.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # reset_max_memory_allocated() is deprecated; it already delegates to
        # reset_peak_memory_stats(), so call the supported function directly.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.set_device(0)
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

    # Determine the number of DataLoader worker processes: cap at 8, and fall
    # back to 4 when the CPU count cannot be determined (cpu_count() is None).
    cpu_count = os.cpu_count()
    num_workers = min(cpu_count, 8) if cpu_count else 4
    print(f"Using {num_workers} dataloader workers")

    # Load preprocessed transformer data and tag mappings.
    transformer_data, tag_mappings = load_preprocessed_data(data_dir)
    tag_to_id = tag_mappings["tag_to_id"]
    id_to_tag = tag_mappings["id_to_tag"]

    # Extract text and tag sequences for training, validation, and testing.
    train_texts = transformer_data["train"]["texts"]
    train_tags = transformer_data["train"]["tags"]
    dev_texts = transformer_data["dev"]["texts"]
    dev_tags = transformer_data["dev"]["tags"]
    test_texts = transformer_data["test"]["texts"]
    test_tags = transformer_data["test"]["tags"]

    print(f"Training set: {len(train_texts)} examples")
    print(f"Validation set: {len(dev_texts)} examples")
    print(f"Test set: {len(test_texts)} examples")

    # Name the checkpoint once so the tokenizer and the model always agree.
    pretrained_name = "bert-base-cased"

    # Initialize the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    max_length = 128  # Maximum sequence length for BERT input.

    # Encode the datasets into fixed-length tensors.
    print("Encoding datasets...")
    train_inputs, train_masks, train_labels = encode_dataset(
        train_texts, train_tags, tokenizer, tag_to_id, max_length
    )
    dev_inputs, dev_masks, dev_labels = encode_dataset(
        dev_texts, dev_tags, tokenizer, tag_to_id, max_length
    )
    test_inputs, test_masks, test_labels = encode_dataset(
        test_texts, test_tags, tokenizer, tag_to_id, max_length
    )
    print("Datasets encoded successfully!")

    # --------------------------------------------------------------------------
    # Hyperparameter Tuning (always runs, on a subset of the training data)
    # --------------------------------------------------------------------------
    print("\nStarting hyperparameter tuning...")
    # Use a subset of the training data for quick tuning. The subset is drawn
    # without replacement and is not seeded here, so it differs between runs
    # unless NumPy's global seed is set elsewhere.
    subset_size = min(5000, len(train_inputs))
    subset_indices = np.random.choice(len(train_inputs), subset_size, replace=False)

    tuning_train_inputs = train_inputs[subset_indices]
    tuning_train_masks = train_masks[subset_indices]
    tuning_train_labels = train_labels[subset_indices]

    # Run hyperparameter tuning over the defined grid; only the winning
    # parameter set is needed, the associated metrics are discarded.
    best_params, _ = hyperparameter_tuning(
        tuning_train_inputs, tuning_train_masks, tuning_train_labels,
        dev_inputs, dev_masks, dev_labels,
        tag_to_id, id_to_tag, device, num_epochs=2,  # Use fewer epochs for faster tuning.
        num_workers=num_workers
    )
    print(f"\nBest hyperparameters: {best_params}")

    # --------------------------------------------------------------------------
    # Final Model Training with Best Hyperparameters
    # --------------------------------------------------------------------------
    batch_size = best_params["batch_size"]
    data_loaders = create_data_loaders(
        train_inputs, train_masks, train_labels,
        dev_inputs, dev_masks, dev_labels,
        test_inputs, test_masks, test_labels,
        batch_size=batch_size,
        num_workers=num_workers
    )

    print("\nInitializing model...")
    model = BertForTokenClassification.from_pretrained(
        pretrained_name,
        num_labels=len(tag_to_id)
    )

    learning_rate = best_params["learning_rate"]
    weight_decay = best_params["weight_decay"]
    gradient_accumulation_steps = best_params["gradient_accumulation_steps"]

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    num_epochs = 5  # Set the number of epochs for final training.
    # Batches per epoch times epochs, divided by the accumulation factor.
    # NOTE(review): assumes train_model steps the scheduler once per
    # accumulated optimizer update — confirm against train_model.
    total_steps = len(data_loaders["train"]) * num_epochs // gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # Warm up over the first 10% of steps.
        num_training_steps=total_steps
    )

    print("\nTraining model...")
    model, history = train_model(
        model, data_loaders, optimizer, scheduler,
        device, num_epochs=num_epochs, evaluation_steps=100,
        id_to_tag=id_to_tag,
        gradient_accumulation_steps=gradient_accumulation_steps
    )

    # Visualize training history and save the plot.
    print("Visualizing training history...")
    vis_dir = os.path.join(data_dir, "visualizations")
    os.makedirs(vis_dir, exist_ok=True)
    visualize_training_history(
        history,
        save_path=os.path.join(vis_dir, "bert_training_history.png")
    )

    # --------------------------------------------------------------------------
    # Evaluate the Final Model on the Test Set
    # --------------------------------------------------------------------------
    print("\nEvaluating on test set...")
    test_metrics = evaluate_model(model, data_loaders["test"], device, id_to_tag)
    print("\nTest Set Metrics:")
    print(f"Loss: {test_metrics['loss']:.4f}")
    print(f"Precision: {test_metrics['precision']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"F1 Score: {test_metrics['f1']:.4f}")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    # Entity-level metrics are printed only when the evaluator reported them.
    if "entity_f1" in test_metrics:
        print("\nEntity-Level Metrics:")
        print(f"Precision: {test_metrics['entity_precision']:.4f}")
        print(f"Recall: {test_metrics['entity_recall']:.4f}")
        print(f"F1 Score: {test_metrics['entity_f1']:.4f}")

    # --------------------------------------------------------------------------
    # Save the Trained Model and Related Artifacts
    # --------------------------------------------------------------------------
    print("\nSaving model...")
    output_dir = os.path.join(data_dir, "models")
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, "bert_ner_model")
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    # Save tag mappings to JSON.
    with open(os.path.join(model_path, "tag_mappings.json"), "w", encoding="utf-8") as f:
        json.dump(tag_mappings, f, indent=2)

    # Save test metrics for later reference. Every value is coerced to a
    # built-in float so that it is JSON serializable.
    with open(os.path.join(model_path, "test_metrics.json"), "w", encoding="utf-8") as f:
        json.dump({k: float(v) for k, v in test_metrics.items()}, f, indent=2)

    print(f"\nModel saved at {model_path}")
    print("\nDone!")


# =============================================================================
# Script Entry Point
# =============================================================================
# Run the pipeline only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()
