In [1]:
################################################################################
# CELL 1: SETUP, INSTALLS, AND IMPORTS
################################################################################

# Mount Google Drive so paths under /content/drive (corpus folder, saved models)
# used by later cells are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Colab shell escape: installs the third-party packages imported below.
# NOTE(review): no versions are pinned, so a later re-run may resolve to
# different releases than the ones this notebook was executed with.
!pip install chardet transformers sentencepiece nltk

import os
import re
import chardet
import nltk
import torch
import numpy as np
from nltk import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Download the NLTK 'punkt' sentence tokenizer data used by sent_tokenize.
# (The following cell additionally downloads 'punkt_tab'.)
nltk.download('punkt')

# Shared configuration for both stages
MODEL_NAME = "bert-base-uncased"   # any Hugging Face checkpoint name works here
MAX_LEN = 128

# Stage 1 (boundary detection): tag inventory and its two lookup tables
BOUNDARY_LABELS = ["O", "B-CHUNK", "I-CHUNK"]
id2label_boundary = dict(enumerate(BOUNDARY_LABELS))
label2id_boundary = {lab: i for i, lab in id2label_boundary.items()}

# Stage 2 (multi-label classification): schema letters and their vector positions
SCHEMA_LABELS = ["P", "F", "L", "B", "C"]
SCHEMA2ID = dict(zip(SCHEMA_LABELS, range(len(SCHEMA_LABELS))))


Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Fetch the 'punkt_tab' resource in addition to 'punkt'
# (presumably required by sent_tokenize in the installed NLTK release — TODO confirm).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
################################################################################
# CELL 2: PARSING LOGIC FOR INLINE TAGS -> CHUNKS
################################################################################

import re
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize

# Schema label inventory plus both lookup directions (same values as in Cell 1)
SCHEMA_LABELS = ["P", "F", "L", "B", "C"]
SCHEMA_ID2LABEL = dict(enumerate(SCHEMA_LABELS))
SCHEMA2ID = {name: pos for pos, name in SCHEMA_ID2LABEL.items()}

def parse_clusters_in_sentence(sentence):
    """
    Cut a sentence at every run of adjacent <...> tags.

    Each run of tags is attached to the text that precedes it; text after the
    last run gets an empty tag list. Text pieces that are empty after
    stripping whitespace are dropped together with their tags.

    Returns a list of dicts of the form
      {"chunk_text": <stripped text>, "raw_tags": [<tag contents without brackets>, ...]}
    in sentence order.
    """
    # The capture group makes re.split keep the tag runs, so the result
    # alternates text, tags, text, ..., text.
    pieces = re.split(r'((?:<[^>]+>)+)', sentence)
    texts = pieces[0::2]
    tag_runs = pieces[1::2] + ['']  # the trailing text has no tag run

    result = []
    for text, run in zip(texts, tag_runs):
        stripped = text.strip()
        if not stripped:
            continue
        result.append({
            'chunk_text': stripped,
            'raw_tags': re.findall(r'<([^>]+)>', run),
        })
    return result

def collapse_raw_tags(raw_tags):
    """
    Reduce raw inline tags to the set of core schema letters {P, F, L, B, C}.

    A tag is treated as a schema tag only when, after stripping surrounding
    whitespace, it is a single schema letter (case-insensitive) optionally
    followed by non-letter markers such as "+" (e.g. "P", "f+", "L++").
    Anything else ("ms", "Spec", "forward", "back", "UP", ...) is discarded.
    Looking only at the first character would wrongly turn words such as
    "forward" into F or "back" into B.

    Args:
        raw_tags: iterable of tag strings without the angle brackets.

    Returns:
        A set of upper-case schema letters; empty if no tag qualifies.
    """
    core_set = set()
    for rt in raw_tags:
        if not rt:
            continue
        # One schema letter, then only non-letters: rejects multi-letter words.
        match = re.fullmatch(r'([PFLBC])[^A-Za-z]*', rt.strip(), flags=re.IGNORECASE)
        if match:
            core_set.add(match.group(1).upper())
    return core_set

def parse_text_into_stage_data(raw_text, tokenizer, verbose=True):
    """
    Build Stage 1 and Stage 2 training examples from inline-tagged text.

    Steps, per sentence returned by sent_tokenize:
      1) Split the sentence into chunks with parse_clusters_in_sentence.
      2) Strip all <...> tags to get the clean sentence and tokenize it with
         character offsets (no special tokens).
      3) Locate each chunk in the clean sentence, left to right. Chunks whose
         tags collapse to at least one schema letter get "B-CHUNK" on their
         first token and "I-CHUNK" on the rest, and produce one Stage 2
         example. Chunks without schema tags leave their tokens as "O".

    Args:
        raw_text: text containing inline <...> tags.
        tokenizer: callable tokenizer that accepts return_offsets_mapping=True
            and provides convert_ids_to_tokens.
        verbose: when True (default) print the per-sentence and per-chunk
            diagnostics; pass False to process large corpora quietly.

    Returns:
        stage1_data: [{"tokens": [...], "labels": ["B-CHUNK", "I-CHUNK", "O", ...]}]
        stage2_data: [{"chunk": "...", "labels": [0/1, ...]}] with one position
            per entry of SCHEMA_LABELS.
    """
    stage1_data = []
    stage2_data = []

    def log(message):
        # Diagnostics are optional so big runs do not flood the cell output.
        if verbose:
            print(message)

    for sent_idx, sent in enumerate(sent_tokenize(raw_text), 1):
        # Identify chunk boundaries
        sent_chunks = parse_clusters_in_sentence(sent)
        log(f"\nProcessing Sentence {sent_idx}: {sent}")
        log(f"Identified Chunks: {sent_chunks}")

        # Reconstruct the sentence without tags
        clean_sentence = re.sub(r'<[^>]+>', '', sent).strip()
        log(f"Clean Sentence: {clean_sentence}")

        # Tokenize with offsets so character spans can be mapped onto tokens
        encoding = tokenizer(clean_sentence, return_offsets_mapping=True, add_special_tokens=False)
        tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
        offset_mappings = encoding['offset_mapping']
        log(f"Tokens: {tokens}")
        log(f"Offset Mappings: {offset_mappings}")

        label_sequence = ["O"] * len(tokens)

        # Character position from which the next chunk is searched.
        search_from = 0

        for ch_idx, ch in enumerate(sent_chunks, 1):
            chunk_text = ch["chunk_text"]
            raw_tag_set = collapse_raw_tags(ch["raw_tags"])

            # Locate every chunk, labelled or not, and move the cursor past it.
            # Otherwise a later labelled chunk whose text also occurs inside an
            # earlier unlabelled chunk would be matched at the earlier position.
            start_char = clean_sentence.find(chunk_text, search_from)
            end_char = start_char + len(chunk_text)
            if start_char != -1:
                search_from = end_char

            if not raw_tag_set:
                # No schema labels: tokens of this chunk stay "O"
                log(f"Chunk {ch_idx}: '{chunk_text}' - No Labels Assigned")
                continue

            if start_char == -1:
                print(f"Warning: Chunk '{chunk_text}' not found in clean_sentence.")
                continue

            log(f"Chunk {ch_idx}: '{chunk_text}' - Start: {start_char}, End: {end_char}")

            # Label tokens lying entirely inside [start_char, end_char)
            first_token = True
            for i, (token_start, token_end) in enumerate(offset_mappings):
                if token_start >= start_char and token_end <= end_char:
                    if label_sequence[i] == "O":  # never overwrite an earlier label
                        if first_token:
                            label_sequence[i] = "B-CHUNK"
                            first_token = False
                        else:
                            label_sequence[i] = "I-CHUNK"

            # Stage 2: multi-hot vector over SCHEMA_LABELS
            label_vec = [0] * len(SCHEMA_LABELS)
            for t in raw_tag_set:
                if t in SCHEMA2ID:
                    label_vec[SCHEMA2ID[t]] = 1
            log(f"Raw Tags: {ch['raw_tags']}")
            log(f"Assigned Labels: {label_vec}")

            stage2_data.append({
                "chunk": chunk_text,
                "labels": label_vec
            })

        stage1_data.append({
            "tokens": tokens,
            "labels": label_sequence
        })

    return stage1_data, stage2_data


In [4]:
####CELL 2.1: TESTING THE CHUNKING PROCEDURE####
# Smoke test: run the parser on one hand-tagged sentence and print both stages.
MODEL_NAME = "bert-base-uncased"  # keep identical to the checkpoint named in Cell 1
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Schema inventory, re-declared so this cell can run on its own
SCHEMA_LABELS = ["P", "F", "L", "B", "C"]
SCHEMA_ID2LABEL = dict(enumerate(SCHEMA_LABELS))
SCHEMA2ID = {s: i for i, s in SCHEMA_ID2LABEL.items()}

# One tagged sentence, assembled from four fragments
sample_text = "".join((
    "Today another American president faces rising<ms><P><F><Spec><UP> fuel prices, ",
    "spurred<ms><F+><P++><L+> by a challenge mostly out of his control, an invasion<s>",
    "<F++><P++><L++><C+> of Ukraine by Russia, a top oil and gas producer intent to use its ",
    "energy supplies as a weapon when necessary.",
))

stage1_data, stage2_data = parse_text_into_stage_data(sample_text, tokenizer)

# Stage 2: every labelled chunk with the schema letters switched on in its vector
print("\nStage 2 Data:")
for idx, item in enumerate(stage2_data, 1):
    labels = [name for name, flag in zip(SCHEMA_LABELS, item["labels"]) if flag]
    print(f"Chunk {idx}: {item['chunk']}")
    print(f"Assigned Labels: {labels}\n")

# Stage 1: token / boundary-tag pairs, sentence by sentence
print("Stage 1 Data:")
for idx, item in enumerate(stage1_data, 1):
    print(f"Sentence {idx}:")
    for token, label in zip(item["tokens"], item["labels"]):
        print(f"{token}: {label}")
    print()








The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Processing Sentence 1: Today another American president faces rising<ms><P><F><Spec><UP> fuel prices, spurred<ms><F+><P++><L+> by a challenge mostly out of his control, an invasion<s><F++><P++><L++><C+> of Ukraine by Russia, a top oil and gas producer intent to use its energy supplies as a weapon when necessary.
Identified Chunks: [{'chunk_text': 'Today another American president faces rising', 'raw_tags': ['ms', 'P', 'F', 'Spec', 'UP']}, {'chunk_text': 'fuel prices, spurred', 'raw_tags': ['ms', 'F+', 'P++', 'L+']}, {'chunk_text': 'by a challenge mostly out of his control, an invasion', 'raw_tags': ['s', 'F++', 'P++', 'L++', 'C+']}, {'chunk_text': 'of Ukraine by Russia, a top oil and gas producer intent to use its energy supplies as a weapon when necessary.', 'raw_tags': []}]
Clean Sentence: Today another American president faces rising fuel prices, spurred by a challenge mostly out of his control, an invasion of Ukraine by Russia, a top oil and gas producer intent to use its energy s

In [5]:
################################################################################
# CELL 3: READ ALL TXT FILES, DETECT ENCODING, PARSE -> STAGE1 & STAGE2
################################################################################

import os
import chardet
from transformers import AutoTokenizer

# Tokenizer must be the same checkpoint the parsing logic is used with elsewhere.
MODEL_NAME = "bert-base-uncased"  # Replace with your specific model if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

FOLDER_PATH = "/content/drive/MyDrive/02 ENGLISH CORPUS The Daily Telegraph"

all_stage1 = []
all_stage2 = []

# os.listdir returns names in arbitrary order. Sorting keeps the example order
# stable between runs, which the seeded train/validation split relies on.
txt_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith('.txt'))
print(f"Found {len(txt_files)} text files.")

for filename in txt_files:
    full_path = os.path.join(FOLDER_PATH, filename)

    # 1) Detect encoding from the whole file. Sniffing only a short prefix can
    #    report a narrower encoding (e.g. plain ASCII) when the first non-ASCII
    #    character appears later, and those characters would then be replaced.
    with open(full_path, 'rb') as f:
        raw_data = f.read()
    detected = chardet.detect(raw_data)
    encoding = detected['encoding']
    if not encoding:
        encoding = 'utf-8'  # Fallback encoding
        print(f"Encoding not detected for {filename}. Using fallback encoding 'utf-8'.")

    # 2) Read file with detected encoding; undecodable bytes become U+FFFD
    try:
        with open(full_path, 'r', encoding=encoding, errors='replace') as f:
            file_text = f.read()
    except Exception as e:
        # Also covers an encoding name Python does not know (LookupError).
        print(f"Error reading {filename} with encoding {encoding}: {e}")
        continue  # Skip to the next file in case of an error

    # 3) Parse text -> stage1, stage2 and accumulate across files
    stage1_data, stage2_data = parse_text_into_stage_data(file_text, tokenizer)
    all_stage1.extend(stage1_data)
    all_stage2.extend(stage2_data)

    print(f"Processed file: {filename}")

print(f"\nTotal Stage1 examples: {len(all_stage1)}")
print(f"Total Stage2 examples: {len(all_stage2)}")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Offset Mappings: [(0, 6), (7, 10), (11, 16), (17, 20), (21, 22), (23, 26), (27, 36), (37, 41), (42, 46), (47, 51), (52, 55), (56, 66), (67, 72), (73, 77), (78, 81), (82, 84), (85, 91), (91, 92), (92, 93), (94, 101), (101, 102), (103, 106), (107, 112), (112, 113), (113, 115), (116, 123), (123, 125), (125, 127), (128, 131), (132, 139), (140, 142), (143, 148), (149, 156), (157, 161), (162, 163), (163, 165), (165, 167), (168, 170), (171, 185), (185, 186), (187, 193), (194, 196), (197, 199), (200, 203), (204, 206), (207, 213), (214, 218), (219, 226), (226, 227)]
Chunk 1: 'Nissan has plans for a big expansion that will turn the Sunderland plant into one of Europe's largest, and start-up Britishvolt has pledged to build another near Blyth in Northumberland, though it is yet to secure full funding.' - No Labels Assigned

Processing Sentence 23: That is the extent of our capabilities.
Identified Chunks: [{'chunk_text': 'That is th

In [6]:
################################################################################
# CELL 3.1: TESTING AND VERIFICATION OF PARSED DATA
################################################################################

import random

# Function to display Stage1 and Stage2 data side by side
def display_sample(stage1_example, stage2_examples, num_samples=3):
    """
    Print `num_samples` randomly chosen Stage 1 examples (token: label per line).

    After each one, up to three randomly drawn Stage 2 chunks are printed with
    their schema letters. The Stage 2 chunks are sampled independently of the
    Stage 1 example: the two lists are not linked by index, so the chunks shown
    do not necessarily come from the sentence shown.
    """
    for _ in range(num_samples):
        idx = random.randint(0, len(stage1_example) - 1)
        picked = stage1_example[idx]

        print(f"\n--- Example {idx + 1} ---")
        print("Tokens and Labels:")
        for token, label in zip(picked["tokens"], picked["labels"]):
            print(f"{token}: {label}")

        print("\nAssociated Stage2 Chunks and Labels:")
        how_many = min(3, len(stage2_examples))
        drawn = random.sample(stage2_examples, how_many)
        for chunk_idx, chunk in enumerate(drawn, 1):
            # Translate the multi-hot vector back into schema letters
            labels = [SCHEMA_ID2LABEL[pos] for pos, flag in enumerate(chunk["labels"]) if flag]
            print(f"Chunk {chunk_idx}: {chunk['chunk']}")
            print(f"Assigned Labels: {labels}")

# Function to display label distribution
def display_label_distribution(stage1_example, stage2_examples, id2label=None):
    """
    Print how often each label occurs in the Stage 1 and Stage 2 data.

    Stage 1: counts every boundary tag string over all examples.
    Stage 2: counts, per schema label, how many chunks have that position set
    in their multi-hot vector. Tallying the raw vector entries instead would
    only report how many 0s and 1s exist, not which labels are frequent.

    Args:
        stage1_example: list of {"tokens": [...], "labels": [...]} dicts.
        stage2_examples: list of {"chunk": str, "labels": [0/1, ...]} dicts.
        id2label: optional mapping from vector position to label name;
            defaults to the module-level SCHEMA_ID2LABEL.
    """
    from collections import Counter

    if id2label is None:
        id2label = SCHEMA_ID2LABEL

    # Count Stage1 labels
    label_counts_stage1 = Counter()
    for example in stage1_example:
        label_counts_stage1.update(example["labels"])
    print("\nStage1 Label Distribution:")
    for label, count in label_counts_stage1.items():
        print(f"{label}: {count}")

    # Count Stage2 labels by name (only positions that are switched on)
    stage2_label_counts = Counter()
    for chunk in stage2_examples:
        for position, flag in enumerate(chunk["labels"]):
            if flag:
                stage2_label_counts[id2label.get(position, "Unknown")] += 1
    print("\nStage2 Label Distribution:")
    for label, count in stage2_label_counts.items():
        print(f"{label}: {count}")

# Spot-check the parsed corpus: print three random Stage 1 examples
# (all_stage1 / all_stage2 are the corpus-wide lists built in Cell 3).
print("Displaying random samples from Stage1 and Stage2 data:")
display_sample(all_stage1, all_stage2, num_samples=3)

# Print label frequencies for both stages over the whole corpus
display_label_distribution(all_stage1, all_stage2)


Displaying random samples from Stage1 and Stage2 data:

--- Example 1737 ---
Tokens and Labels:
he: B-CHUNK
tells: I-CHUNK
me: I-CHUNK
that: I-CHUNK
ai: I-CHUNK
##to: I-CHUNK
en: I-CHUNK
##cr: I-CHUNK
##yp: I-CHUNK
##ts: I-CHUNK
the: I-CHUNK
arm: I-CHUNK
controller: I-CHUNK
chip: I-CHUNK
,: I-CHUNK
and: I-CHUNK
then: I-CHUNK
as: I-CHUNK
a: I-CHUNK
second: I-CHUNK
stage: I-CHUNK
,: I-CHUNK
creates: I-CHUNK
a: I-CHUNK
custom: I-CHUNK
controller: I-CHUNK
chip: I-CHUNK
,: I-CHUNK
called: I-CHUNK
an: I-CHUNK
as: I-CHUNK
##ic: I-CHUNK
,: I-CHUNK
putting: I-CHUNK
the: I-CHUNK
logic: I-CHUNK
behind: I-CHUNK
a: I-CHUNK
second: I-CHUNK
wall: I-CHUNK
of: I-CHUNK
protection: I-CHUNK
,: I-CHUNK
which: I-CHUNK
is: I-CHUNK
something: I-CHUNK
that: I-CHUNK
cannot: I-CHUNK
be: I-CHUNK
reverse: I-CHUNK
engineered: O
.: O

Associated Stage2 Chunks and Labels:
Chunk 1: Last year, the Treasury raked in a record £6.1bn
Assigned Labels: ['P', 'F', 'C']
Chunk 2: George Davey, financial planner at Charles Stan

In [None]:
################################################################################
# CELL 4: BUILD DATASETS FOR STAGE 1 & 2
################################################################################

import os
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Schema labels and lookups (same values as in the earlier cells)
SCHEMA_LABELS = ["P", "F", "L", "B", "C"]
SCHEMA_ID2LABEL = dict(enumerate(SCHEMA_LABELS))
SCHEMA2ID = {s: i for i, s in SCHEMA_ID2LABEL.items()}

# Boundary tag <-> id lookups for Stage 1
label2id_boundary = {"O": 0, "B-CHUNK": 1, "I-CHUNK": 2}
id2label_boundary = {idx: tag for tag, idx in label2id_boundary.items()}

# Fixed sequence length used when encoding both stages
MAX_LEN = 128  # Adjust based on your specific requirements

# Checkpoint name; one tokenizer instance serves both tasks
MODEL_NAME = "bert-base-uncased"  # Replace with your specific model name if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 90/10 train/validation split, seeded; each stage is split independently
boundary_train_data, boundary_val_data = train_test_split(all_stage1, test_size=0.1, random_state=42)
schema_train_data, schema_val_data = train_test_split(all_stage2, test_size=0.1, random_state=42)

print(f"Boundary Train: {len(boundary_train_data)}, Boundary Val: {len(boundary_val_data)}")
print(f"Schema Train: {len(schema_train_data)}, Schema Val: {len(schema_val_data)}")

############################
# Stage 1 boundary dataset #
############################

def encode_boundary_example(example, tokenizer, max_length=MAX_LEN):
    """
    Encode one Stage 1 example into fixed-length model inputs with aligned labels.

    `example["tokens"]` are the WordPiece strings produced by
    `tokenizer.convert_ids_to_tokens` during parsing, with one boundary label
    per piece. They are mapped straight back to vocabulary ids instead of
    being passed through the tokenizer again as "words": re-tokenizing would
    treat every piece as a new word (a continuation piece such as "##to" would
    be split up again), so the model would not see the original sub-word
    sequence and only the first fragment of each piece would keep a label.

    Layout of the result: [CLS] pieces... [SEP] [PAD]... — this assumes a
    BERT-style tokenizer that defines cls_token_id, sep_token_id and
    pad_token_id.

    Args:
        example: {"tokens": [...], "labels": [...]} with equal-length lists.
        tokenizer: tokenizer providing convert_tokens_to_ids and the special
            token ids named above.
        max_length: total sequence length including the two special tokens.

    Returns:
        dict with "input_ids", "token_type_ids", "attention_mask" and "labels",
        each a list of length `max_length`. Special and padding positions carry
        the label -100; boundary tags missing from label2id_boundary map to 0.
    """
    tokens = example["tokens"]
    labels = example["labels"]

    # Two positions are reserved for [CLS] and [SEP]; longer inputs are truncated.
    budget = max(max_length - 2, 0)
    piece_ids = tokenizer.convert_tokens_to_ids(list(tokens[:budget]))
    piece_labels = [label2id_boundary.get(lab, 0) for lab in labels[:budget]]

    input_ids = [tokenizer.cls_token_id] + list(piece_ids) + [tokenizer.sep_token_id]
    label_ids = [-100] + piece_labels + [-100]
    attention_mask = [1] * len(input_ids)

    # Right-pad everything to the fixed length
    pad_len = max_length - len(input_ids)
    if pad_len > 0:
        input_ids += [tokenizer.pad_token_id] * pad_len
        label_ids += [-100] * pad_len
        attention_mask += [0] * pad_len

    return {
        "input_ids": input_ids,
        "token_type_ids": [0] * len(input_ids),
        "attention_mask": attention_mask,
        "labels": label_ids,
    }

class BoundaryDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-encoded Stage 1 examples.

    `encodings` is a sequence of mappings (field name -> list of values);
    indexing returns the same fields converted to tensors.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        record = self.encodings[idx]
        tensors = {}
        for field, values in record.items():
            tensors[field] = torch.tensor(values)
        return tensors

# Encode both Stage 1 splits with the same tokenizer and length
boundary_train_enc, boundary_val_enc = (
    [encode_boundary_example(example, tokenizer, MAX_LEN) for example in split]
    for split in (boundary_train_data, boundary_val_data)
)

# Wrap the encoded splits for the Trainer
boundary_train_dataset, boundary_val_dataset = map(
    BoundaryDataset, (boundary_train_enc, boundary_val_enc)
)

############################
# Stage 2 schema dataset   #
############################

def encode_schema_example(example, tokenizer, max_length=MAX_LEN):
    """
    Tokenize one Stage 2 chunk for multi-label classification.

    The chunk text is truncated / padded to `max_length`, and the example's
    multi-hot label vector is attached unchanged under "labels".
    Returns the tokenizer's encoding object with that extra key.
    """
    chunk_text, multi_hot = example["chunk"], example["labels"]

    encoded = tokenizer(
        chunk_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = multi_hot  # already one 0/1 entry per schema label
    return encoded

class SchemaDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-encoded Stage 2 examples.

    Indexing converts each field to a tensor: "labels" becomes float
    (multi-hot targets), every other field becomes long.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        return {
            field: torch.tensor(
                values,
                dtype=torch.float if field == "labels" else torch.long,
            )
            for field, values in self.encodings[idx].items()
        }

# Encode both Stage 2 splits with the same tokenizer and length
schema_train_enc, schema_val_enc = (
    [encode_schema_example(example, tokenizer, MAX_LEN) for example in split]
    for split in (schema_train_data, schema_val_data)
)

# Wrap the encoded splits for the Trainer
schema_train_dataset, schema_val_dataset = map(
    SchemaDataset, (schema_train_enc, schema_val_enc)
)

# Sanity check: dataset sizes should equal the split sizes printed above
print(f"Boundary Train Dataset Size: {len(boundary_train_dataset)}")
print(f"Boundary Val Dataset Size: {len(boundary_val_dataset)}")
print(f"Schema Train Dataset Size: {len(schema_train_dataset)}")
print(f"Schema Val Dataset Size: {len(schema_val_dataset)}")


Boundary Train: 1974, Boundary Val: 220
Schema Train: 1434, Schema Val: 160
Boundary Train Dataset Size: 1974
Boundary Val Dataset Size: 220
Schema Train Dataset Size: 1434
Schema Val Dataset Size: 160


In [None]:
################################################################################
# CELL 4.1: TESTING AND VERIFICATION OF BUILD DATASETS FOR STAGE 1 & 2
################################################################################

import random
from collections import Counter

# Ensure SCHEMA_ID2LABEL is defined as per Cell 4
# SCHEMA_ID2LABEL = {0: 'P', 1: 'F', 2: 'L', 3: 'B', 4: 'C'}

# Function to display Stage1 and Stage2 data side by side
def display_sample(stage1_examples, stage2_examples, num_samples=3):
    """
    Print `num_samples` random Stage 1 examples, each followed by one random
    Stage 2 chunk (when any exist).

    The Stage 2 chunk is drawn independently from the whole chunk list, so it
    is not necessarily taken from the Stage 1 sentence printed above it.
    """
    print("\n--- Displaying Random Samples ---")
    for _ in range(num_samples):
        # Stage 1: one random sentence, printed token by token
        idx_stage1 = random.randint(0, len(stage1_examples) - 1)
        chosen = stage1_examples[idx_stage1]
        print(f"\n--- Stage1 Example {idx_stage1 + 1} ---")
        print("Tokens and Labels:")
        for token, label_str in zip(chosen["tokens"], chosen["labels"]):
            print(f"{token}: {label_str}")

        if not stage2_examples:
            continue

        # Stage 2: one random chunk with its schema letters
        idx_stage2 = random.randint(0, len(stage2_examples) - 1)
        stage2 = stage2_examples[idx_stage2]
        schema_labels = [SCHEMA_ID2LABEL[pos] for pos, flag in enumerate(stage2["labels"]) if flag]
        print("\n--- Associated Stage2 Chunk ---")
        print(f"Chunk Text: {stage2['chunk']}")
        print(f"Assigned Schema Labels: {schema_labels}")

# Function to display label distribution
def display_label_distribution(stage1_examples, stage2_examples):
    """
    Print label frequencies for both stages.

    Stage 1 counts every boundary tag across all examples. Stage 2 counts, per
    schema label name, the chunks whose multi-hot vector has that position set;
    positions missing from SCHEMA_ID2LABEL are reported as "Unknown".
    """
    print("\n--- Label Distribution ---")

    # Stage 1: flat tally over all tag sequences
    stage1_tally = Counter(
        tag for example in stage1_examples for tag in example["labels"]
    )
    print("\nStage1 Label Distribution:")
    for tag, count in stage1_tally.items():
        print(f"{tag}: {count}")

    # Stage 2: tally label names for positions that are switched on
    stage2_tally = Counter(
        SCHEMA_ID2LABEL.get(pos, "Unknown")
        for chunk in stage2_examples
        for pos, flag in enumerate(chunk["labels"])
        if flag
    )
    print("\nStage2 Label Distribution:")
    for label_name, count in stage2_tally.items():
        print(f"{label_name}: {count}")

# Spot-check the training split: three random Stage 1 examples, each with one
# random Stage 2 chunk (raw, un-encoded data from the Cell 4 split).
print("Displaying random samples from Stage1 and Stage2 data:")
display_sample(boundary_train_data, schema_train_data, num_samples=3)

# Label frequencies over the training split only
display_label_distribution(boundary_train_data, schema_train_data)

Displaying random samples from Stage1 and Stage2 data:

--- Displaying Random Samples ---

--- Stage1 Example 1336 ---
Tokens and Labels:
so: O
it: O
does: O
,: O
but: O
it: O
': O
s: O
not: O
what: O
the: O
eu: O
likes: O
.: O

--- Associated Stage2 Chunk ---
Chunk Text: and they can be blown off course by highly unpredictable external events
Assigned Schema Labels: ['P', 'F', 'L']

--- Stage1 Example 872 ---
Tokens and Labels:
finally: B-CHUNK
,: I-CHUNK
in: I-CHUNK
september: I-CHUNK
2014: I-CHUNK
,: I-CHUNK
the: I-CHUNK
water: I-CHUNK
tank: I-CHUNK
in: I-CHUNK
the: I-CHUNK
loft: I-CHUNK
imp: I-CHUNK
##lo: I-CHUNK
##ded: I-CHUNK
,: B-CHUNK
and: I-CHUNK
the: I-CHUNK
ceiling: I-CHUNK
fell: I-CHUNK
in: I-CHUNK
.: O

--- Associated Stage2 Chunk ---
Chunk Text: Ellie Henderson, of the group, said millions of families would struggle
Assigned Schema Labels: ['P', 'F', 'B', 'C']

--- Stage1 Example 989 ---
Tokens and Labels:
but: B-CHUNK
it: I-CHUNK
remains: I-CHUNK
a: I-CHUNK
worth: I-CHUN

In [None]:
################################################################################
# CELL 5 (REVISED): TRAIN + SAVE STAGE 1 (BOUNDARY DETECTION) WITH TOKEN-LEVEL F1
################################################################################

!pip install seqeval  # install seqeval for token-level metrics

import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# Boundary tag inventory for Stage 1
BOUNDARY_LABELS = ["O", "B-CHUNK", "I-CHUNK"]

# Lookups in both directions, derived from the list order
id2label_boundary = dict(enumerate(BOUNDARY_LABELS))
label2id_boundary = {label: idx for idx, label in id2label_boundary.items()}

def align_predictions(predictions, label_ids):
    """
    Turn per-token scores and gold ids into parallel lists of tag strings.

    The predicted class is the argmax over the last axis of `predictions`.
    Positions whose gold id is -100 are dropped from both outputs; ids not
    present in id2label_boundary are rendered as "Unknown".

    Returns (predicted_tags, gold_tags), each a list with one list of strings
    per sequence in the batch.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape  # argmax result is expected to be 2-D

    out_pred_list, out_label_list = [], []
    for row in range(batch_size):
        # Keep only positions that carry a real gold label
        kept = [(p, l) for p, l in zip(preds[row], label_ids[row]) if l != -100]
        out_pred_list.append([id2label_boundary.get(p, "Unknown") for p, _ in kept])
        out_label_list.append([id2label_boundary.get(l, "Unknown") for _, l in kept])

    return out_pred_list, out_label_list

def compute_metrics_token(eval_pred):
    """
    Metric callback for the Stage 1 Trainer.

    Unpacks (predictions, label_ids), converts them to tag-string sequences
    with align_predictions, and returns seqeval precision, recall and F1
    under the keys "precision", "recall" and "f1".
    """
    scores, gold_ids = eval_pred
    pred_labels, true_labels = align_predictions(scores, gold_ids)

    return {
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }

# Initialize the model for token classification; the label maps are stored in
# the model config so saved checkpoints carry the tag names.
model_bc = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(BOUNDARY_LABELS),
    label2id=label2id_boundary,
    id2label=id2label_boundary
)

# Define training arguments.
# `eval_strategy` is the argument name the Stage 2 cell of this notebook uses;
# the older `evaluation_strategy` spelling is deprecated in recent transformers
# releases. `save_steps` was dropped because it only applies to step-based
# saving and checkpoints here are written once per epoch.
training_args_bc = TrainingArguments(
    output_dir="./boundary_out",          # local checkpoint directory
    eval_strategy="epoch",
    save_strategy="epoch",                # must match eval_strategy for load_best_model_at_end
    num_train_epochs=4,                   # adjust as needed
    logging_dir="./logs_boundary",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="f1",           # pick F1 as the best model metric
    greater_is_better=True,
)

# Initialize the Trainer
trainer_bc = Trainer(
    model=model_bc,
    args=training_args_bc,
    train_dataset=boundary_train_dataset,
    eval_dataset=boundary_val_dataset,
    tokenizer=tokenizer,                   # saved alongside the model checkpoints
    compute_metrics=compute_metrics_token,
)

# 1. Train the model
trainer_bc.train()

# 2. Evaluate the model (the best checkpoint is loaded at the end of training)
metrics = trainer_bc.evaluate()
print("Evaluation metrics:", metrics)

# 3. Save the final model
# NOTE(review): this writes to the local runtime directory, while the inference
# cell loads the boundary model from
# "/content/drive/MyDrive/schema_classification/models/FinalBoundary" —
# confirm the model is copied there, or align the two paths.
trainer_bc.save_model("./boundary_finalSCHEMASGODHELPME")


In [None]:
################################################################################
# CELL 6: TRAIN + SAVE STAGE 2 (SCHEMA CLASSIFICATION) WITH MULTI-LABEL F1
################################################################################

# Install scikit-learn if not already installed
!pip install -U scikit-learn

import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
import torch
import wandb

# Weights & Biases setup. This assumes the runtime is already authenticated;
# if it is not, uncomment the next line and follow the authentication steps.
# wandb.login()

# Start a run; the run name matches `run_name` in the TrainingArguments below.
wandb.init(project="schema_classification_colab", name="Stage2_Schema_Classification_Run")

# Define compute metrics for multi-label classification
def compute_metrics_multilabel(eval_pred):
    """
    Metric callback for the Stage 2 Trainer (multi-label).

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain a
    0/1 prediction per label. Precision, recall and F1 are micro-averaged over
    all label decisions; "accuracy" is sklearn's accuracy_score on the full
    label matrices.
    """
    logits, labels = eval_pred

    # Independent per-label probabilities -> hard 0/1 decisions
    probabilities = torch.sigmoid(torch.tensor(logits))
    preds = (probabilities >= 0.5).int().numpy()
    labels = labels.astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='micro', zero_division=0
    )

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy_score(labels, preds),
    }

# Initialize the model for Stage 2 (Schema Classification): one output per schema label.
model_schema = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(SCHEMA_LABELS),
    # Configures the model for multi-label training; the outputs are still raw
    # logits (compute_metrics_multilabel applies the sigmoid itself).
    problem_type="multi_label_classification",
)

# Define training arguments for Stage 2
training_args_schema = TrainingArguments(
    output_dir="/content/drive/MyDrive/schema_classification/models/schema_out",  # Directory to save checkpoints (on Drive)
    eval_strategy="epoch",                                           # Evaluate at the end of each epoch
    save_strategy="epoch",                                           # Save model at the end of each epoch
    num_train_epochs=10,                                             # Upper bound; early stopping may end training sooner
    per_device_train_batch_size=16,                                  # Training batch size (adjust based on GPU memory)
    per_device_eval_batch_size=16,                                   # Evaluation batch size (adjust based on GPU memory)
    logging_dir="/content/drive/MyDrive/schema_classification/logs/logs_schema",  # Directory for logs
    # NOTE(review): with the 1434 training chunks reported by Cell 4, batch size 16
    # and 2 accumulation steps, 10 epochs amount to roughly 450 optimizer steps, so
    # a 500-step interval may never log the training loss — confirm this is intended.
    logging_steps=500,
    save_total_limit=2,                                              # Limit the total number of checkpoints
    load_best_model_at_end=True,                                     # Load the best model at the end of training
    metric_for_best_model="f1",                                      # Metric to determine the best model
    greater_is_better=True,                                          # Indicates that higher F1 is better
    disable_tqdm=True,                                               # Disable progress bars to prevent IOPub data rate exceeded
    gradient_accumulation_steps=2,                                   # Accumulate gradients over 2 steps
    fp16=True,                                                       # Mixed precision training
    report_to=["wandb"],                                             # Report metrics to Weights & Biases
    run_name="Stage2_Schema_Classification_Run"                      # Same name as passed to wandb.init above
)

# Initialize the Trainer for Stage 2
trainer_schema = Trainer(
    model=model_schema,
    args=training_args_schema,
    train_dataset=schema_train_dataset,                   # Training dataset for Stage 2
    eval_dataset=schema_val_dataset,                     # Validation dataset for Stage 2
    compute_metrics=compute_metrics_multilabel,           # Supplies the "f1" used for best-model selection
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop after 3 evaluations without improvement
    # Removed WandbCallback() to prevent duplicate callbacks
)

# Start training Stage 2 Model
trainer_schema.train()

# Evaluate the Stage 2 Model on Validation Set
metrics_schema = trainer_schema.evaluate()
print("Stage 2 Evaluation Metrics:", metrics_schema)

# Save the final Stage 2 model to Google Drive (the inference cell loads it from this path)
trainer_schema.save_model("/content/drive/MyDrive/schema_classification/models/SegmentationModelFinalSCHEMASGODHELPME")

print("Stage 2 Model training and saving completed successfully!")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.2901412546634674, 'eval_precision': 0.98125, 'eval_recall': 0.7584541062801933, 'eval_f1': 0.8555858310626703, 'eval_accuracy': 0.41875, 'eval_runtime': 74.2976, 'eval_samples_per_second': 2.154, 'eval_steps_per_second': 0.135, 'epoch': 1.0}
{'eval_loss': 0.24522633850574493, 'eval_precision': 0.9152119700748129, 'eval_recall': 0.8864734299516909, 'eval_f1': 0.9006134969325154, 'eval_accuracy': 0.59375, 'eval_runtime': 74.9965, 'eval_samples_per_second': 2.133, 'eval_steps_per_second': 0.133, 'epoch': 2.0}
{'eval_loss': 0.23460260033607483, 'eval_precision': 0.9321608040201005, 'eval_recall': 0.8961352657004831, 'eval_f1': 0.9137931034482759, 'eval_accuracy': 0.6625, 'eval_runtime': 74.1808, 'eval_samples_per_second': 2.157, 'eval_steps_per_second': 0.135, 'epoch': 3.0}
{'eval_loss': 0.24091163277626038, 'eval_precision': 0.9148418491484185, 'eval_recall': 0.9082125603864735, 'eval_f1': 0.9115151515151515, 'eval_accuracy': 0.65, 'eval_runtime': 72.8238, 'eval_samples_pe

In [7]:
################################################################################
# CELL 7: INFERENCE PIPELINE FOR CHUNKING AND SCHEMA CLASSIFICATION
################################################################################

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
import os
import nltk

# Fetch the NLTK sentence-tokenizer data that sent_tokenize() relies on.
# Both packages are requested: this notebook needed 'punkt_tab' on top of
# 'punkt' (a dedicated cell near the top downloads it), so asking for it here
# keeps the inference cell runnable on its own. Packages that are already
# present are reported as up-to-date rather than downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')

# Shared configuration — these values mirror the training cells and must stay identical to them
MODEL_NAME = "bert-base-uncased"   # base checkpoint name; the tokenizer is loaded from it
MAX_LEN = 128                      # inputs are truncated to this many tokens

# Stage 2 schema labels, with lookups in both directions (index <-> label)
SCHEMA_LABELS = ["P", "F", "L", "B", "C"]
SCHEMA_ID2LABEL = dict(enumerate(SCHEMA_LABELS))
SCHEMA2ID = {schema: idx for idx, schema in SCHEMA_ID2LABEL.items()}

# Stage 1 BIO boundary tags, with lookups in both directions (index <-> tag)
BOUNDARY_LABELS = ["O", "B-CHUNK", "I-CHUNK"]
id2label_boundary = dict(enumerate(BOUNDARY_LABELS))
label2id_boundary = {tag: idx for idx, tag in id2label_boundary.items()}

# Google Drive directories holding the two fine-tuned checkpoints (verify before running)
BOUNDARY_MODEL_PATH = "/content/drive/MyDrive/schema_classification/models/FinalBoundary"
SCHEMA_MODEL_PATH = "/content/drive/MyDrive/schema_classification/models/SegmentationModelFinalSCHEMASGODHELPME"

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer of the base checkpoint; both stages share it
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Stage 1 (boundary detection): load, move to the target device, switch to eval mode
model_boundary = AutoModelForTokenClassification.from_pretrained(
    BOUNDARY_MODEL_PATH,
    num_labels=len(BOUNDARY_LABELS),
    label2id=label2id_boundary,
    id2label=id2label_boundary,
).to(device).eval()

# Stage 2 (multi-label schema classification): same treatment
model_schema = AutoModelForSequenceClassification.from_pretrained(
    SCHEMA_MODEL_PATH,
    num_labels=len(SCHEMA_LABELS),
    problem_type="multi_label_classification",
).to(device).eval()

import re
from nltk.tokenize import sent_tokenize

def _merge_wordpieces(tokens, labels):
    """Collapse WordPiece tokens into whole words, keeping one label per word.

    A token that starts with "##" continues the previous word. Each merged word
    carries the label predicted for its FIRST piece; the pairing is positional,
    so words split into several pieces and words that occur more than once in
    the sentence both get the label that belongs to them.

    Args:
        tokens (List[str]): Tokens of one encoded sentence, special tokens included.
        labels (List[str]): One predicted boundary label per token.

    Returns:
        Tuple[List[str], List[str]]: The merged words and their labels (same length).
    """
    words = []
    word_labels = []
    for token, label in zip(tokens, labels):
        is_piece = token.startswith("##") and len(token) > 2
        if is_piece and words:
            words[-1] += token[2:]
        else:
            # A continuation piece with nothing before it starts its own word.
            words.append(token[2:] if is_piece else token)
            word_labels.append(label)
    return words, word_labels


def _group_bio_chunks(words, word_labels, special_tokens):
    """Group labelled words of one sentence into chunks following the BIO tags.

    "B-CHUNK" closes any open chunk and starts a new one, "I-CHUNK" extends the
    open chunk (or starts one if none is open), and any other label closes the
    open chunk and drops the word. Special tokens are skipped without closing
    the open chunk.

    Args:
        words (List[str]): Merged words of one sentence.
        word_labels (List[str]): Boundary label of each word.
        special_tokens (Set[str]): Tokenizer special tokens to leave out.

    Returns:
        List[str]: The chunks, each a space-joined run of words.
    """
    chunks = []
    current_chunk = []
    for word, label in zip(words, word_labels):
        if word in special_tokens:
            continue
        if label == "B-CHUNK" and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
        if label in ("B-CHUNK", "I-CHUNK"):
            current_chunk.append(word)
        elif current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def chunk_text(text):
    """
    Splits the input text into chunks using the Stage 1 Boundary Detection model.

    The text is split into sentences and every sentence is tagged on its own,
    so a chunk never spans two sentences. Sentences are truncated to MAX_LEN
    tokens. Chunks are rebuilt from the tokenizer's tokens joined with single
    spaces, so their casing and punctuation spacing follow the tokenizer rather
    than the original text; words tagged "O" are not part of any chunk.

    Args:
        text (str): The input text to be chunked.

    Returns:
        List[str]: A list of text chunks identified by the model, in reading order.
    """
    # Special tokens (e.g. the sequence start/end markers) never belong to a chunk
    special_tokens = set(tokenizer.all_special_tokens)
    chunks = []

    for sentence in sent_tokenize(text):
        # Tokenize the sentence (special tokens are added by default)
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=MAX_LEN, is_split_into_words=False)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # One label id per token: the arg-max over the label dimension
        with torch.no_grad():
            logits = model_boundary(**inputs).logits
        label_ids = torch.argmax(logits, dim=2)[0].tolist()
        token_labels = [id2label_boundary.get(label_id, "O") for label_id in label_ids]

        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Labels are carried along by position while the pieces are merged.
        # Looking a merged word up by its text instead would miss every word
        # split into several pieces and confuse repeated words.
        words, word_labels = _merge_wordpieces(tokens, token_labels)
        chunks.extend(_group_bio_chunks(words, word_labels, special_tokens))

    return chunks

def classify_chunk(chunk, threshold=0.5):
    """
    Classifies a text chunk into one or more schemas using the Stage 2 Schema Classification model.

    Every schema is scored independently (sigmoid per label), so a chunk can
    receive several schemas or none at all. The chunk is truncated to MAX_LEN
    tokens before scoring.

    Args:
        chunk (str): The text chunk to classify.
        threshold (float): Minimum probability, within [0, 1], for a schema to
            be reported. Defaults to 0.5; raise it to favour precision, lower
            it to favour recall.

    Returns:
        List[str]: The schemas whose probability is at least `threshold`, in
        SCHEMA_LABELS order (possibly empty).

    Raises:
        ValueError: If `threshold` lies outside [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")

    # Tokenize the chunk and move the tensors to the model's device
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Score all schemas at once without tracking gradients
    with torch.no_grad():
        logits = model_schema(**inputs).logits

    # The batch holds a single chunk, hence row 0
    probs = torch.sigmoid(logits)
    selected = (probs >= threshold)[0].tolist()

    # Map the selected positions back to schema labels
    return [SCHEMA_ID2LABEL[i] for i, keep in enumerate(selected) if keep]

def process_text(text):
    """
    Runs the two-stage pipeline on a text: chunk it, then label every chunk.

    Args:
        text (str): The input text to process.

    Returns:
        List[Dict]: One dictionary per chunk, in chunk order, with the chunk
        text under "chunk" and its list of schema labels under "schemas".
    """
    # Stage 1 yields the chunks; Stage 2 is applied to each of them in turn
    return [
        {"chunk": piece, "schemas": classify_chunk(piece)}
        for piece in chunk_text(text)
    ]

# Demo input: four short sentences. The newlines right after the opening and
# before the closing triple quotes are part of the string.
input_text = """
Brent crude's rise above that milestone. The market reacts to the new data. Oil prices stabilize after the spike. Investors are cautious about the downturn.
"""

# Run the full pipeline: chunk the text, then classify every chunk
results = process_text(input_text)

# Print one numbered entry per chunk (numbering starts at 1) followed by its
# schemas as a comma-separated list; the trailing "\n" leaves a blank line
# between entries. A chunk without any schema prints an empty list.
for idx, res in enumerate(results, 1):
    print(f"Chunk {idx}: {res['chunk']}")
    print(f"Identified Schemas: {', '.join(res['schemas'])}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chunk 1: brent crude ' s rise above that milestone
Identified Schemas: P, F, L

Chunk 2: oil prices stabilize
Identified Schemas: P, F

Chunk 3: after the spike
Identified Schemas: P, F

Chunk 4: investors are cautious about the
Identified Schemas: P, F, L

