In [None]:
#Randomly sample 10,000 rows
import pandas as pd

# Load data
df = pd.read_csv("D:/Python_WC/Final_project/Multi-Task_News_Intelligence_System/Data/news.tsv", sep="\t")

# Randomly sample 10,000 rows
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

In [None]:
df.to_csv("D:/Python_WC/Final_project/Multi-Task_News_Intelligence_System/NER/Balanced_10000_records.csv", index=False)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install spacy
!python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
PATH ="/content/drive/MyDrive/DSIPYNB/NER/Balanced_10000_records.csv"
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,News ID,Category,Topic,Headline,News body,Title entity,Entity content
0,N107100,entertainment,entertainment-celebrity,Stars who came out,Find out how these celebs revealed their sexua...,{},{}
1,N104494,foodanddrink,recipes,19 Ice Cream Pies You'll Want to Make All Summ...,Everyone will want a second slice! Cherry and ...,{},{}
2,N37185,finance,financenews,Mixed-used development will bring variety to d...,Seven of 26 expected vendors have already sign...,{},{}
3,N54647,news,newspolitics,Paul Manafort Seemed Headed to Rikers. Then th...,[What you need to know to start the day: Get N...,{'Justice Department': 'United States Departme...,{'United States Department of Justice': {'type...
4,N112983,sports,mma,Sean Shelby's Shoes: What's next for Junior Do...,(ALSO SEE: Sean Shelby's Shoes: What's next fo...,"{'Junior Dos Santos': 'Junior dos Santos', 'UF...","{'Junior dos Santos': {'type': 'item', 'id': '..."


In [None]:
import re

def clean_text_only_chars(text):
    # Remove everything except alphabets and spaces
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# GPU load

import spacy

# MUST CALL BEFORE LOADING MODEL
spacy.require_gpu()

nlp = spacy.load("en_core_web_trf")

# OPTIONAL SPEED BOOST
nlp.disable_pipes("parser", "lemmatizer", "attribute_ruler")

print("Loaded spaCy Transformer Model on GPU 🚀")


Loaded spaCy Transformer Model on GPU 🚀


In [None]:
df["Headline"] = df["Headline"].astype(str).apply(clean_text_only_chars)
df["News body"] = df["News body"].astype(str).apply(clean_text_only_chars)
df["Title entity"] = df["Title entity"].astype(str)

In [None]:
import ast

def generate_bio_using_spacy_and_title(doc, title_value):
    tokens = [tok.text for tok in doc]
    tags = ["O"] * len(tokens)

    # 1️⃣ spaCy labeling
    for ent in doc.ents:
        tags[ent.start] = "B-" + ent.label_
        for i in range(ent.start + 1, ent.end):
            tags[i] = "I-" + ent.label_

    # 2️⃣ Title entity fallback
    try:
        ent_dict = ast.literal_eval(title_value)
    except:
        ent_dict = {}

    for surface, expanded in ent_dict.items():
        surface_clean = surface.replace("'s", "").strip()
        stoks = surface_clean.split()
        n = len(stoks)

        for i in range(len(tokens)-n+1):
            # If spaCy already labeled → skip
            if tokens[i:i+n] == stoks and tags[i] == "O":
                tags[i] = "B-MISC"
                for j in range(i+1, i+n):
                    tags[j] = "I-MISC"

    return tokens, tags

In [None]:
from tqdm import tqdm
import numpy as np
import gc
import torch

texts = (df["Headline"] + ". " + df["News body"]).tolist()

sentences = []
labels = []

BATCH_SIZE = 16  # 🔥 LOWER Batch = LOWER GPU USAGE

print("Starting processing 10K rows...")

docs = nlp.pipe(texts, batch_size=BATCH_SIZE)

for doc, (_, row) in tqdm(zip(docs, df.iterrows()), total=len(df)):
    tokens, tags = generate_bio_using_spacy_and_title(doc, row["Title entity"])
    sentences.append(tokens)
    labels.append(tags)

# FREE GPU CACHE
torch.cuda.empty_cache()
gc.collect()

np.save("/content/drive/MyDrive/DSIPYNB/NER/ner_sentences.npy", np.array(sentences, dtype=object), allow_pickle=True)
np.save("/content/drive/MyDrive/DSIPYNB/NER/ner_labels.npy", np.array(labels, dtype=object), allow_pickle=True)

print("\n🎉 DONE — Successfully processed 10K rows!")

Starting processing 10K rows...


  dlpack_tensor = xp_tensor.toDlpack()  # type: ignore
100%|██████████| 10000/10000 [14:00<00:00, 11.90it/s]



🎉 DONE — Successfully processed 10K rows!


In [None]:
sentences

In [None]:
import numpy as np

sentences = np.load("/content/drive/MyDrive/DSIPYNB/NER/ner_sentences.npy", allow_pickle=True)
labels = np.load("/content/drive/MyDrive/DSIPYNB/NER/ner_labels.npy", allow_pickle=True)

print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

10000 10000
['Stars', 'who', 'came', 'out', '.', 'Find', 'out', 'how', 'these', 'celebs', 'revealed', 'their', 'sexual', 'and', 'gender', 'identities', 'to', 'the', 'world', 'Love', 'is', 'love', 'Wonderwall', 'com', 'is', 'taking', 'a', 'look', 'at', 'all', 'the', 'ways', 'the', 'stars', 'have', 'told', 'the', 'world', 'about', 'their', 'sexual', 'and', 'gender', 'identities', 'starting', 'with', 'rapper', 'Lil', 'Nas', 'X', 'Fresh', 'off', 'his', 'massive', 'hit', 'Old', 'Town', 'Road', 'Lil', 'Nas', 'X', 'came', 'out', 'as', 'gay', 'at', 'the', 'end', 'of', 'Pride', 'month', 'on', 'June', 'While', 'posting', 'a', 'link', 'to', 'his', 'new', 'song', 'C', 'osure', 'the', 'rapper', 'tweeted', 'Some', 'of', 'y', 'all', 'already', 'know', 'some', 'of', 'y', 'all', 'don', 't', 'care', 'some', 'of', 'y', 'all', 'not', 'going', 'to', 'f', 'with', 'me', 'no', 'more', 'But', 'before', 'this', 'month', 'ends', 'I', 'want', 'y', 'all', 'to', 'listen', 'closely', 'to', 'c', 'osure', 'He', 'added

In [None]:
tag_list = sorted(list({tag for seq in labels for tag in seq}))
tag2id = {tag: i for i, tag in enumerate(tag_list)}
id2tag = {i: tag for tag, i in tag2id.items()}

print(tag2id)

{'B-CARDINAL': 0, 'B-DATE': 1, 'B-EVENT': 2, 'B-FAC': 3, 'B-GPE': 4, 'B-LANGUAGE': 5, 'B-LAW': 6, 'B-LOC': 7, 'B-MISC': 8, 'B-MONEY': 9, 'B-NORP': 10, 'B-ORDINAL': 11, 'B-ORG': 12, 'B-PERCENT': 13, 'B-PERSON': 14, 'B-PRODUCT': 15, 'B-QUANTITY': 16, 'B-TIME': 17, 'B-WORK_OF_ART': 18, 'I-CARDINAL': 19, 'I-DATE': 20, 'I-EVENT': 21, 'I-FAC': 22, 'I-GPE': 23, 'I-LAW': 24, 'I-LOC': 25, 'I-MISC': 26, 'I-MONEY': 27, 'I-NORP': 28, 'I-ORG': 29, 'I-PERCENT': 30, 'I-PERSON': 31, 'I-PRODUCT': 32, 'I-QUANTITY': 33, 'I-TIME': 34, 'I-WORK_OF_ART': 35, 'O': 36}


In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

encodings = tokenizer(
    sentences.tolist(),
    is_split_into_words=True,
    padding=True,
    truncation=True,
    return_offsets_mapping=True
)

aligned_labels = []

for i in range(len(sentences)):
    word_ids = encodings.word_ids(batch_index=i)
    sample_labels = labels[i]
    prev_word = None
    aligned = []

    for w in word_ids:
        if w is None:
            aligned.append(-100)  # ignore in loss
        else:
            aligned.append(tag2id[sample_labels[w]])

    aligned_labels.append(aligned)

encodings.pop("offset_mapping")

In [None]:

from torch.utils.data import Dataset


class NerDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {k: torch.tensor(v) for k, v in encodings.items()}
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# ✅ CREATE THE DATASET HERE
dataset = NerDataset(encodings, aligned_labels)


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

train_idx, val_idx = train_test_split(
    np.arange(len(aligned_labels)),
    test_size=0.1,
    random_state=42
)

train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset   = torch.utils.data.Subset(dataset, val_idx)

In [None]:
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained(
        "bert-base-cased",
        num_labels=len(tag2id),
        id2label=id2tag,
        label2id=tag2id
    )

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: bert-base-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized beca

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_ner_output",
    save_strategy="epoch",
    eval_strategy="epoch",  # valid dataset only
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    report_to="none"
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # only to monitor val loss
)


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.289114,0.172528
2,0.148883,0.155903


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=1126, training_loss=0.21899853802913036, metrics={'train_runtime': 2129.8655, 'train_samples_per_second': 8.451, 'train_steps_per_second': 0.529, 'total_flos': 4704829913088000.0, 'train_loss': 0.21899853802913036, 'epoch': 2.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/DSIPYNB/NER/bert_ner_model")
tokenizer.save_pretrained("/content/drive/MyDrive/DSIPYNB/NER/bert_ner_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/drive/MyDrive/DSIPYNB/NER/bert_ner_model/tokenizer_config.json',
 '/content/drive/MyDrive/DSIPYNB/NER/bert_ner_model/tokenizer.json')

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=546ffef0d77697f800c9d627789a046683fa5581c4224d4f72d05228d1161712
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# EVAL
import torch
import numpy as np
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import pandas as pd
from transformers import BertForTokenClassification

# ---------------------------------------
# 1. Load trained BERT model
# ---------------------------------------
MODEL_PATH = "/content/drive/MyDrive/DSIPYNB/NER/bert_ner_model"  # change if needed
model = BertForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()

print("Loaded Success")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

Loaded Success


In [None]:
# ---------------------------------------
# 2. Create validation dataset again
# ---------------------------------------
X_val = torch.tensor([encodings['input_ids'][i] for i in val_idx])
mask_val = torch.tensor([encodings['attention_mask'][i] for i in val_idx])
Y_val = [aligned_labels[i] for i in val_idx]

print("Validation items:", len(Y_val))

Validation items: 1000


In [None]:
# ---------------------------------------
# 1. Move model + tensors to GPU
# ---------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

X_val = X_val.to(device)
mask_val = mask_val.to(device)

# ---------------------------------------
# 2. Build DataLoader (MUCH faster)
# ---------------------------------------
from torch.utils.data import DataLoader, TensorDataset

val_ds = TensorDataset(X_val, mask_val)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False)  # try 128 or 256

# ---------------------------------------
# 3. Fast Inference Loop
# ---------------------------------------
all_preds = []

model.eval()
with torch.inference_mode():   # faster than no_grad
    for batch_ids, batch_mask in val_loader:
        outputs = model(batch_ids, attention_mask=batch_mask)
        preds = outputs.logits.argmax(-1)
        all_preds.extend(preds.cpu().tolist())

# all_preds now contains predictions for all validation samples
print("Inference complete. Total predictions:", len(all_preds))


Inference complete. Total predictions: 1000


In [None]:
# ---------------------------------------
# 4. Convert predictions to tag text
# ---------------------------------------
y_true = []
y_pred = []

for true_seq, pred_seq in zip(Y_val, all_preds):
    for t, p in zip(true_seq, pred_seq):
        if t != -100:
            y_true.append(id2tag[t])
            y_pred.append(id2tag[p])


In [None]:
# ---------------------------------------
# 5. Calculate Metrics
# ---------------------------------------
precision = precision_score([y_true], [y_pred])
recall = recall_score([y_true], [y_pred])
f1 = f1_score([y_true], [y_pred])

print("\nClassification Report:")
print(classification_report([y_true], [y_pred]))

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    CARDINAL       0.85      0.93      0.89      1947
        DATE       0.77      0.86      0.82      4258
       EVENT       0.60      0.73      0.66       797
         FAC       0.64      0.74      0.68      1250
         GPE       0.83      0.85      0.84      5367
    LANGUAGE       0.00      0.00      0.00        13
         LAW       0.68      0.65      0.67        46
         LOC       0.60      0.64      0.62       571
        MISC       0.41      0.18      0.25      1264
       MONEY       0.49      0.40      0.44        62
        NORP       0.88      0.82      0.85       849
     ORDINAL       0.89      0.96      0.93       975
         ORG       0.78      0.82      0.80     11501
     PERCENT       0.48      0.71      0.58        21
      PERSON       0.86      0.89      0.87     13394
     PRODUCT       0.56      0.53      0.54      1267
    QUANTITY       0.48      0.57      0.52       178
        TIME       0.54    

In [None]:
# ---------------------------------------
# 6. Load previous model result CSV
# ---------------------------------------
csv_path = "/content/drive/MyDrive/DSIPYNB/NER/model_comparison_NER.csv"
df_prev = pd.read_csv(csv_path)

In [None]:
# ---------------------------------------
# 7. Append new row
# ---------------------------------------
new_row = {
    "Model": "Transformer",
    "Embedding": "BERT-base",
    "Precision": round(precision, 4),
    "Recall": round(recall, 4),
    "F1_Score": round(f1, 4)
}

df_prev = pd.concat([df_prev, pd.DataFrame([new_row])], ignore_index=True)

# ---------------------------------------
# 8. Save updated file
# ---------------------------------------
df_prev.to_csv(csv_path, index=False)
print("\nUpdated saved to:", csv_path)

print("\nFinal Table:")
print(df_prev)


Updated saved to: /content/drive/MyDrive/DSIPYNB/NER/model_comparison_NER.csv

Final Table:
         Model  Embedding  Precision    Recall  F1 Score  F1_Score
0       BiLSTM      GloVe   0.922337  0.010013  0.012809       NaN
1  Transformer  BERT-base   0.786200  0.817500       NaN    0.8015


In [None]:
import pandas as pd

# Load your results table
results = pd.read_csv(csv_path)

# Use the correct F1 column
f1_col = "F1_Score"

# Ensure F1 column is numeric
results[f1_col] = pd.to_numeric(results[f1_col], errors="coerce")

# Find the best model based on F1 score
best_model = results.loc[results[f1_col].idxmax()]

print("=== Model Comparison for NER ===")
print(results)

print("\n=== Best Model Based on F1 Score ===")
print(best_model)

print(
    f"\nConclusion: The best NER model is '{best_model['Model']}' "
    f"with embedding '{best_model['Embedding']}' "
    f"achieving F1 = {best_model[f1_col]:.4f}."
)



=== Model Comparison for NER ===
         Model  Embedding  Precision    Recall  F1 Score  F1_Score
0       BiLSTM      GloVe   0.922337  0.010013  0.012809       NaN
1  Transformer  BERT-base   0.786200  0.817500       NaN    0.8015

=== Best Model Based on F1 Score ===
Model        Transformer
Embedding      BERT-base
Precision         0.7862
Recall            0.8175
F1 Score             NaN
F1_Score          0.8015
Name: 1, dtype: object

Conclusion: The best NER model is 'Transformer' with embedding 'BERT-base' achieving F1 = 0.8015.


In [3]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

model_path = "/content/drive/MyDrive/NER/bert_ner_model"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)
model.eval()

# LOAD REAL TAG MAP FROM MODEL
id2tag = model.config.id2label

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

In [4]:
def extract_entities(tokens, tags):
    entities = {}
    current_entity_tokens = []
    current_type = None

    for token, tag in zip(tokens, tags):

        if tag.startswith("B-"):
            # save previous entity
            if current_type:
                entity_text = " ".join(current_entity_tokens)
                entities.setdefault(current_type, []).append(entity_text)

            # start new entity
            current_type = tag.split("-")[1]
            current_entity_tokens = [token]

        elif tag.startswith("I-") and current_type:
            current_entity_tokens.append(token)

        else:
            # save previous entity if exists
            if current_type:
                entity_text = " ".join(current_entity_tokens)
                entities.setdefault(current_type, []).append(entity_text)

            current_type = None
            current_entity_tokens = []

    # catch last entity
    if current_type:
        entity_text = " ".join(current_entity_tokens)
        entities.setdefault(current_type, []).append(entity_text)

    ## FIX: remove single-letter splits like ['V','ira','t'] → 'Virat'
    for ent_type, ent_list in entities.items():
        fixed_list = []
        for ent in ent_list:
            # remove spaces inside a broken name
            if len(ent.split()) > 1 and len(ent.replace(" ", "")) <= 15:
                ent = ent.replace(" ", "")
            fixed_list.append(ent)
        entities[ent_type] = fixed_list

    return entities


In [5]:
def predict_entities(input_text):
    encoding = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    with torch.no_grad():
        output = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"]
        )
        logits = output.logits
        predictions = torch.argmax(logits, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
    tags = [id2tag[p] for p in predictions]

    # Remove special tokens [CLS] and [SEP]
    clean_tokens = []
    clean_tags = []

    for token, tag in zip(tokens, tags):
        if token not in ["[CLS]", "[SEP]"]:
            clean_tokens.append(token.replace("##", ""))  # remove BPE splits
            clean_tags.append(tag)

    entities = extract_entities(clean_tokens, clean_tags)
    return clean_tokens, clean_tags, entities


In [6]:
sample_text = "Oliver Gavin says he finds it baffling that Corvette stablemate Marcel Fassler was judged to blame for the crash that eliminated the car from the Le Mans Hours Fassler in the car he shared with Gavin and Tommy Milner suffered a heavy crash at the Porsche Curves just beyond one quarter distance after the Swiss driver tagged the Dempsey Proton Porsche of Japanese gentleman driver Satoshi Hoshino The three time Le Mans winner was taken to the medical centre and then a local hospital for a CT scan but escape the impact with nothing worse than bruising While the stewards assessed Fassler a euro fine and six penalty points on his licence Gavin said he held Hoshino entirely to blame for the incident pointing out the bronze rated driver had changed his line at the last second What happened with Marcel is just crazy Gavin told Motorsport com For us to get turned into the wall by an Am driver who is clearly out of his depth looks terrified he d been off numerous times already When you are competing in multi class racing and you re making split decisions you re looking for cues you re picking up all the body language from the car in front And he Hoshino gave absolutely every single indication he was keeping out of the way He was km h slower he let the previous two cars go by on the inside and then he just does something random and pull right down on the racing line takes us clean out of the race That sort of thing is something that needs to be looked at again He added The guy driving our car Fassler has won Le Mans three times so he certainly knows what he s doing He knows the deal and how to get it done So for the fine to be given to him and the penalty points it s just I m battling to understand how that s right Fassler s crash left the car of Antonio Garcia and Mike Rockenfeller and Jan Magnussen carrying Corvette s hopes and heading into the morning hours the trio were locked in a close fight with the leading AF Corse Ferrari However when Magnussen pitted under the safety car in hour he was held at the end of the pitlane giving the Ferrari crew a one minute lead it could nurse to the finish Shortly after Magnussen spun at the Porsche Curves while trying to claw back the lost ground hitting the barriers an incident for which the Dane accepted the blame When the safety car split us the win was gone for sure but we could have still had second Magnussen told Motorsport com Then I didn t keep enough heat in the tyres So when I clipped the kerb at the Porsche Curves I spun and hit the wall and damaged the suspension Horrible feeling I can t even begin how to describe how I felt for the guys all that hard work to come away from nothing They deserved a lot better Gavin said watching the challenge of the crew fall apart in the final hours was painful for him to watch It was very hard to see what happened to the sister car with the safety car and then Jan s accident he said They deserved the victory today they were fast all race and led for long periods That s just very tough but it s the nature of the race It chooses you and it chose the Ferrari car today"

tokens, labels, entities = predict_entities(sample_text)

print("\nToken Predictions:")
for t, l in zip(tokens, labels):
    print(f"{t:12} --> {l}")

print("\nExtracted Entities:")
print(entities)



Token Predictions:
Oliver       --> B-PERSON
Gavin        --> I-PERSON
says         --> O
he           --> O
finds        --> O
it           --> O
b            --> O
af           --> O
f            --> O
ling         --> O
that         --> O
Co           --> B-PRODUCT
rvette       --> B-PRODUCT
stable       --> O
mate         --> O
Marcel       --> B-PERSON
F            --> I-PERSON
ass          --> I-PERSON
ler          --> I-PERSON
was          --> O
judged       --> O
to           --> O
blame        --> O
for          --> O
the          --> O
crash        --> O
that         --> O
eliminated   --> O
the          --> O
car          --> O
from         --> O
the          --> B-EVENT
Le           --> I-EVENT
Mans         --> I-EVENT
Hours        --> I-EVENT
F            --> B-PERSON
ass          --> B-PERSON
ler          --> B-PERSON
in           --> O
the          --> O
car          --> O
he           --> O
shared       --> O
with         --> O
Gavin        --> B-PERSON
and          --