#### Pre-Process Data

In [15]:
import os
import re
from cassis import * 
path = "./curation"
#example path = "./curation/elections/file.json"
files = []
file_names= []
for sub_folder in os.listdir(path):
    if sub_folder.startswith('input'):
        for file in os.listdir(os.path.join(path, sub_folder)):
            if file.endswith('.json'):
                with open(os.path.join(path, sub_folder,file), 'rb') as f:
                    cas = load_cas_from_json(f)
                    files.append(cas)
                file_names.append(sub_folder)

len(files)

1308

In [4]:
f_train_split = open("train_split.txt")
f_test_split = open("test_split.txt")

train_split = f_train_split.read().split('\n')[:-1]
test_split = f_test_split.read().split('\n')[:-1]

print(train_split)

#get train and test files
train_files = []
test_files = []
train_file_names = []
test_file_names = []
for i, file_name in enumerate(file_names):
    if file_name in train_split:
        train_files.append(files[i])
        train_file_names.append(file_name)
    elif file_name in test_split:
        test_files.append(files[i])
        test_file_names.append(file_name)

['input_part026.txt', 'input_part950.txt', 'input_part368.txt', 'input_part_1161.txt', 'input_part011.txt', 'input_part029.txt', 'input_part541.txt', 'input_part297.txt', 'input_part670.txt', 'input_part956.txt', 'input_part_1118.txt', 'input_part482.txt', 'input_part946.txt', 'input_part503.txt', 'input_part449.txt', 'input_part_1195.txt', 'input_part567.txt', 'input_part_1083.txt', 'input_part402.txt', 'input_part195.txt', 'input_part686.txt', 'input_part591.txt', 'input_part624.txt', 'input_part911.txt', 'input_part815.txt', 'input_part037.txt', 'input_part792.txt', 'input_part_1039.txt', 'input_part_1196.txt', 'input_part536.txt', 'input_part584.txt', 'input_part980.txt', 'input_part_1164.txt', 'input_part360.txt', 'input_part_1211.txt', 'input_part408.txt', 'input_part676.txt', 'input_part512.txt', 'input_part473.txt', 'input_part602.txt', 'input_part_1000.txt', 'input_part058.txt', 'input_part310.txt', 'input_part975.txt', 'input_part_1324.txt', 'input_part103.txt', 'input_part29

In [5]:
len(train_file_names),len(test_file_names) , len(train_files), len(test_files)

(881, 209, 881, 209)

In [6]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def cas_to_dataset(cas, label2id): 
    text = cas.sofa_string 
    entities = [(e.begin, e.end, e.label) for e in cas.select("custom.Span") if e.label and (e.label == "Claim" or e.label == "Non-claim")] # sort entities by begin 
    entities = sorted(entities, key=lambda x: x[0]) # initialize tokenization with offsets 
    
    encoding = tokenizer(text, truncation=True, max_length=512, padding="max_length", return_offsets_mapping=True) 
    labels = ["O"] * len(encoding["input_ids"]) 
    
    for start, end, label in entities:
        for i, (tok_start, tok_end) in enumerate(encoding["offset_mapping"]):
            if tok_start >= end or tok_end <= start:
                continue
            if tok_start == start:
                labels[i] = f"B-{label}"
            elif tok_start < end:
                labels[i] = f"I-{label}"
                
    # convert labels to ids
    label_ids = []
    for i, l in enumerate(labels):
        token = tokenizer.convert_ids_to_tokens(encoding["input_ids"][i])
        if token.startswith("##"):
            label_ids.append(-100)  # ignore subword in loss
        else:
            label_ids.append(label2id[l])
                
    return [{ "input_ids": encoding["input_ids"], "attention_mask": encoding["attention_mask"], "labels": label_ids }]

In [None]:
def cas_to_dataset_stride(cas, label2id, doc_id, max_length=512, stride=128):
    text = cas.sofa_string
    entities = [
        (e.begin, e.end, e.label)
        for e in cas.select("custom.Span")
        if e.label and (e.label == "Claim" or e.label == "Non-claim")
    ]
    entities = sorted(entities, key=lambda x: x[0])

    # Tokenize with stride
    encoding = tokenizer(
        text,
        max_length=max_length,
        stride=stride,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True
    )

    dataset = []

    for chunk_idx in range(len(encoding["input_ids"])):
        offsets = encoding["offset_mapping"][chunk_idx]
        input_ids = encoding["input_ids"][chunk_idx]
        attention_mask = encoding["attention_mask"][chunk_idx]

        labels = ["O"] * len(input_ids)

        # Assign labels for entities
        for start, end, label in entities:
            for i, (tok_start, tok_end) in enumerate(offsets):
                if tok_start >= end or tok_end <= start:
                    continue
                if tok_start == start:
                    labels[i] = f"B-{label}"
                elif tok_start < end:
                    labels[i] = f"I-{label}"

        # Convert labels to ids
        label_ids = []
        for i, l in enumerate(labels):
            token = tokenizer.convert_ids_to_tokens(int(input_ids[i]))
            if token.startswith("##") or input_ids[i] in tokenizer.all_special_ids:
                label_ids.append(-100)  # ignore subwords & specials
            else:
                label_ids.append(label2id[l])

        dataset.append({
            "doc_id": doc_id,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label_ids,
            "offset_mapping": offsets
        })

    return dataset


In [None]:

def cas_to_dataset_sent(cas, label2id, doc_id):
    text = cas.sofa_string
    entities = [(e.begin, e.end, e.label) for e in cas.select("custom.Span") if e.label and (e.label == "Claim" or e.label == "Non-claim")]
    entities = sorted(entities, key=lambda x: x[0])

    sentences = text.split('\n')
    dataset = []
    
    offset = 0
    for sent in sentences:
        if not sent.strip():  # skip empty lines
            offset += len(sent) + 1  # +1 for newline
            continue

        sent_start = offset
        sent_end = offset + len(sent)


        ents = []
        for (e_start, e_end, label) in entities:
            if e_end <= sent_start or e_start >= sent_end:
                continue
            ents.append((e_start - sent_start, e_end - sent_start, label))


        encoding = tokenizer(sent, truncation=True, max_length=512, padding="max_length", return_offsets_mapping=True)
        labels = ["O"] * len(encoding["input_ids"])

        for start, end, label in ents:
            for i, (tok_start, tok_end) in enumerate(encoding["offset_mapping"]):
                if tok_start >= end or tok_end <= start:
                    continue
                if tok_start == start:
                    labels[i] = f"B-{label}"
                elif tok_start < end:
                    labels[i] = f"I-{label}"
                    
        label_ids = []
        for i, l in enumerate(labels):
            token = tokenizer.convert_ids_to_tokens(encoding["input_ids"][i])
            if token.startswith("##"):
                label_ids.append(-100)  # ignore subword in loss
            else:
                label_ids.append(label2id[l])
                
        global_offsets = [
            (sent_start + tok_start, sent_start + tok_end)
            for (tok_start, tok_end) in encoding["offset_mapping"]
        ]

        dataset.append({
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": label_ids,
            "offset_mapping": global_offsets,
            "doc_id": doc_id,
        })
    
        offset = sent_end + 1  # +1 for the newline

    return dataset

In [None]:
from datasets import Dataset

#label2id = {'O': 0, 'B-Claim': 1, 'I-Claim': 2, 'B-Claim object': 3, 'I-Claim object': 4,
# 'B-Claim span': 5, 'I-Claim span': 6, 'B-Claimer': 7, 'I-Claimer': 8,
# 'B-Non-claim': 9, 'I-Non-claim': 10, 'B-Time': 11, 'I-Time': 12}

label2id = {'O': 0, 'B-Claim': 1, 'I-Claim': 2,
 'B-Non-claim': 3, 'I-Non-claim': 4}

id2label = {i: label for i, label in enumerate(label2id.keys())}

def generateDataset(files, file_names):
    data_list = []
    for file, filename in zip(files, file_names):
        data_list = data_list + cas_to_dataset_sent(file, label2id, filename)
    return Dataset.from_list(data_list)

train_dataset = generateDataset(train_files, train_file_names)
test_dataset = generateDataset(test_files, test_file_names)
print(len(train_dataset), len(test_dataset))

train_dataset

14846 3609


Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'offset_mapping', 'doc_id'],
    num_rows: 14846
})

### Train the model

In [18]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [19]:
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
batch_size = 16

In [20]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id),

                                                        id2label=id2label, label2id=label2id)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:



output_model_name = "my_model_sent_2"
args = TrainingArguments(
    output_model_name,
    report_to="none",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16, #16
    per_device_eval_batch_size=16, #16
    num_train_epochs=100,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,   
    metric_for_best_model="f1",      
    greater_is_better=True,           
    save_total_limit=1
    #push_to_hub=True,
)

In [22]:
import evaluate
metric = evaluate.load("seqeval")

import numpy as np

def compute_metrics(p):
    print(p)
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    #return results
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0438,0.016537,0.295361,0.659487,0.407995,0.99279
2,0.0132,0.01611,0.370853,0.642051,0.470146,0.993468
3,0.0076,0.019715,0.426737,0.749744,0.543899,0.993124
4,0.0036,0.02527,0.478842,0.661538,0.555556,0.993508
5,0.0022,0.024953,0.481086,0.6,0.534003,0.993429
6,0.0011,0.034274,0.532999,0.654359,0.587477,0.993567
7,0.0011,0.034048,0.527304,0.633846,0.575687,0.99336
8,0.0006,0.035291,0.497724,0.672821,0.572176,0.993188
9,0.0006,0.031808,0.520833,0.692308,0.594452,0.993116
10,0.0006,0.045828,0.542178,0.725128,0.620448,0.993339


<transformers.trainer_utils.EvalPrediction object at 0x7fe5907514d0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe7322a93d0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70ec8ab10>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70f271cd0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70ee9ebd0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70fc1d2d0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70d5db3d0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe733032910>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70eec2850>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70d77bdd0>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70e5d2850>
<transformers.trainer_utils.EvalPrediction object at 0x7fe70e41ab90>
<transformers.trainer_utils.EvalPrediction object at 0x7fe5659e8710>
<transformers.trainer_utils.EvalPrediction object at 0x7fe7101d1a50>


KeyboardInterrupt: 

In [26]:
trainer.save_model("my_model_sent_2")

### Evaluate the model (stride)

In [19]:
from transformers import AutoModelForTokenClassification
#load model from "my_model_/checkpoint-epoch-xx"
model_path = "my_model_stride"
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [None]:
from collections import defaultdict

from collections import defaultdict

def evaluate_with_tolerance(y_true_spans, y_pred_spans, tolerance=5):

    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    for doc_true, doc_pred in zip(y_true_spans, y_pred_spans):
        matched_gold = set()

        # Count true positives and false positives
        for p_start, p_end, p_label in doc_pred:
            found_match = False
            for i, (g_start, g_end, g_label) in enumerate(doc_true):
                if i in matched_gold:
                    continue
                if g_label == p_label and abs(p_start - g_start) <= tolerance and abs(p_end - g_end) <= tolerance:
                    stats[p_label]["tp"] += 1
                    matched_gold.add(i)
                    found_match = True
                    break
            if not found_match:
                stats[p_label]["fp"] += 1

        # Count false negatives
        for i, (g_start, g_end, g_label) in enumerate(doc_true):
            if i not in matched_gold:
                stats[g_label]["fn"] += 1

    # Compute per-class precision, recall, F1
    results = {}
    total_tp, total_fp, total_fn = 0, 0, 0
    for label, s in stats.items():
        tp, fp, fn = s["tp"], s["fp"], s["fn"]
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        results[label] = {"precision": precision, "recall": recall, "f1": f1}
        total_tp += tp
        total_fp += fp
        total_fn += fn

    # Micro-averaged metrics
    micro_precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
    micro_recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if micro_precision + micro_recall > 0 else 0
    results["micro"] = {"precision": micro_precision, "recall": micro_recall, "f1": micro_f1}

    return results




def tokens_to_spans(pred_labels, offset_mapping):
    spans = []
    current_label = None
    start_char = None
    prev_end_char = None

    for (tok_start, tok_end), label in zip(offset_mapping, pred_labels):
        # Skip special tokens or padding
        if tok_start is None or tok_end is None or tok_start == tok_end:
            continue

        # Subword token: extend current span if inside one
        if label == -100:
            if current_label is not None:
                prev_end_char = tok_end
            continue

        if label.startswith("B-"):
            if current_label:
                spans.append((start_char, prev_end_char, current_label))
            current_label = label[2:]
            start_char = tok_start
        elif label.startswith("I-") and current_label == label[2:]:
            prev_end_char = tok_end
        else:  # O or mismatch
            if current_label:
                spans.append((start_char, prev_end_char, current_label))
                current_label = None

        prev_end_char = tok_end

    if current_label:
        spans.append((start_char, prev_end_char, current_label))

    return spans


In [None]:
from collections import defaultdict
import evaluate
metric = evaluate.load("seqeval")
from collections import defaultdict
import torch
import torch.nn.functional as F




def merge_stride_predictions(predictions, label_map):

    doc_tokens = defaultdict(dict)  # doc_id -> {token_start: (best_conf, pred_label, true_label)}
    doc_offsets = defaultdict(list) # doc_id -> list of (start_char, end_char)
    for sample in predictions:
        doc_id = sample["doc_id"]
        offsets = sample["offset_mapping"]
        logits_list = sample["logits"]
        labels = sample["labels"]

        for (start, end), logits, t in zip(offsets, logits_list, labels):
            #if t == -100:
            #    continue  # skip subword/special tokens

            probs = F.softmax(torch.tensor(logits), dim=-1)
            best_conf, best_idx = torch.max(probs, dim=-1)
            pred_label = label_map[int(best_idx)]
            #true_label = label_map[int(t)]

            if t == -100:
                true_label = -100  # keep so subwords can extend span later
            else:
                true_label = label_map[int(t)]

            # if token already seen, keep the most confident prediction
            if start in doc_tokens[doc_id]:
                #if best_conf > doc_tokens[doc_id][start][0]:
                #    doc_tokens[doc_id][start] = (best_conf, pred_label, true_label)
                pass
            else:
                doc_tokens[doc_id][start] = (best_conf, pred_label, true_label)
            
            # keep track of document-level offsets
            if start not in [s for s, e in doc_offsets[doc_id]]:
                doc_offsets[doc_id].append((start, end))

    y_true_per_doc = []
    y_pred_per_doc = []

    for doc_id in doc_tokens:
        token_starts = sorted(doc_tokens[doc_id].keys())
        doc_true_labels = []
        doc_pred_labels = []

        for start in token_starts:
            _, pred_label, true_label = doc_tokens[doc_id][start]
            doc_pred_labels.append(pred_label)
            doc_true_labels.append(true_label)

        y_true_per_doc.append(doc_true_labels)
        y_pred_per_doc.append(doc_pred_labels)

        # sort offsets for the document
        doc_offsets[doc_id] = sorted(doc_offsets[doc_id], key=lambda x: x[0])
    return y_true_per_doc, y_pred_per_doc, doc_offsets



pred_spans_ner = []
gold_spans_ner = []


def compute_metrics_stride(eval_preds):
    global pred_spans_ner
    global gold_spans_ner
    logits, labels = eval_preds  # logits shape: (batch_size, seq_len, num_labels)
    
    predictions = []
    for i in range(len(logits)):
        predictions.append({
            "doc_id": test_dataset[i]["doc_id"],
            "offset_mapping": test_dataset[i]["offset_mapping"],
            "logits": logits[i],   # store raw logits, not argmax
            "labels": labels[i]
        })

    # merge overlapping chunk predictions using most confident token
    y_true, y_pred, doc_offsets  = merge_stride_predictions(predictions, model.config.id2label)
    all_pred_spans = []
    all_gold_spans = []

    for i, doc_id in enumerate(doc_offsets.keys()):
        pred_spans = tokens_to_spans(y_pred[i], doc_offsets[doc_id])
        gold_spans = tokens_to_spans(y_true[i], doc_offsets[doc_id])
        all_pred_spans.append(pred_spans)
        all_gold_spans.append(gold_spans)
    pred_spans_ner = all_pred_spans
    gold_spans_ner = all_gold_spans
    return evaluate_with_tolerance(all_gold_spans, all_pred_spans, tolerance=0)
    

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_stride
)

In [None]:
trainer.evaluate()

### Evaluate the model (sentence)

In [None]:
from collections import defaultdict
import evaluate
metric = evaluate.load("seqeval")
from collections import defaultdict
import torch
import torch.nn.functional as F


pred_spans_ner = []
gold_spans_ner = []
def compute_metrics_sent(eval_preds):
    global pred_spans_ner
    global gold_spans_ner
    logits, labels = eval_preds  # logits shape: (batch_size, seq_len, num_labels)
    
    predictions = []
    for i in range(len(logits)):
        predictions.append({
            "doc_id": test_dataset[i]["doc_id"],
            "logits": logits[i],   # store raw logits, not argmax
            "labels": labels[i],
            "offset_mapping": test_dataset[i]["offset_mapping"],
            
        })

    # get y_true and y_pred without merging
    y_true = []
    y_pred = []
    for sample in predictions:
        
        logits_list = sample["logits"]
        labels = sample["labels"]
        doc_id = sample["doc_id"]
        doc_true_labels = []
        doc_pred_labels = []

        for logits, t in zip(logits_list, labels):
            if t == -100:
                doc_true_labels.append(-100)
                doc_pred_labels.append(-100)
                continue  # skip subword/special tokens

            probs = F.softmax(torch.tensor(logits), dim=-1)
            best_conf, best_idx = torch.max(probs, dim=-1)
            pred_label = model.config.id2label[int(best_idx)]
            true_label = model.config.id2label[int(t)]

            doc_pred_labels.append(pred_label)
            doc_true_labels.append(true_label)

        y_true.append(doc_true_labels)
        y_pred.append(doc_pred_labels)

    all_pred_spans = []
    all_gold_spans = []

    pred_spans_per_doc = defaultdict(list)
    gold_spans_per_doc = defaultdict(list)
    
    for i, sample in enumerate(predictions):
        doc_id = sample["doc_id"]
        offsets = sample["offset_mapping"]
        pred_spans = tokens_to_spans(y_pred[i], offsets)
        gold_spans = tokens_to_spans(y_true[i], offsets)
        pred_spans_per_doc[doc_id].extend(pred_spans)
        gold_spans_per_doc[doc_id].extend(gold_spans)

    # convert to list of lists per document
    all_pred_spans = [sorted(pred_spans_per_doc[doc_id]) for doc_id in pred_spans_per_doc.keys()]
    all_gold_spans = [sorted(gold_spans_per_doc[doc_id]) for doc_id in gold_spans_per_doc.keys()]

    pred_spans_ner = all_pred_spans
    gold_spans_ner = all_gold_spans

    return evaluate_with_tolerance(all_gold_spans, all_pred_spans, tolerance=0)
    

In [95]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_sent
)

  trainer = Trainer(


In [87]:
trainer.evaluate()

{'eval_loss': 0.04148254171013832,
 'eval_model_preparation_time': 0.0056,
 'eval_Non-claim': {'precision': 0.6334586466165414,
  'recall': 0.764172335600907,
  'f1': 0.6927029804727647},
 'eval_Claim': {'precision': 0.375,
  'recall': 0.25806451612903225,
  'f1': 0.3057324840764331},
 'eval_micro': {'precision': 0.6187943262411347,
  'recall': 0.7158974358974359,
  'f1': 0.6638135996195911},
 'eval_runtime': 165.5313,
 'eval_samples_per_second': 21.803,
 'eval_steps_per_second': 1.365}