In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when one is visible, otherwise fall back to the CPU, and make
# that choice the default placement for every tensor/module created afterwards.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.set_default_device(device)

In [18]:

# Install the Hugging Face `datasets` package quietly (-q) via an IPython shell escape.
!pip install datasets -q

import datasets

# Load the DialogRE dataset; `dialogre` maps split names to datasets
# (the generation log shows train, test and validation splits).
dialogre = datasets.load_dataset(
    "dataset-org/dialog_re",
    download_mode="force_redownload",  # asks for a fresh download instead of reusing a cached copy
    trust_remote_code=True,  # allows the dataset repository's own loading script to run
)

dialog_re.py:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/726k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1073 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/357 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/358 [00:00<?, ? examples/s]

In [19]:
# Download the GloVe 6B archive and save it as glove.6B.zip in the working directory.
!wget -O glove.6B.zip https://nlp.stanford.edu/data/glove.6B.zip
# Convert the archive into a blank English spaCy pipeline with vectors, written to ./glove_vectors.
!python -m spacy init vectors en glove.6B.zip glove_vectors

import spacy

# Load the pipeline created above; later cells use `nlp` for tokenization,
# per-token `.vector` lookups and its shared vocabulary.
nlp = spacy.load("glove_vectors")

--2025-03-04 13:26:31--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-04 13:26:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-03-04 13:29:11 (5.15 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

[38;5;4mℹ Creating blank nlp object for language 'en'[0m
400000it [00:05, 73916.59it/s]
[38;5;2m✔ Successfully converted 400000 vectors[0m
[38;5;2m✔ Saved nlp object with vectors to ou

In [20]:
# Phrase matcher used to locate entity mentions inside tokenized dialogs.
matcher = spacy.matcher.PhraseMatcher(nlp.vocab)

# Collect each distinct entity string once. The same argument recurs across
# relation triples, dialogs and splits, and registering an identical pattern
# under the same key again adds no new matches -- it only repeats tokenization.
entity_names = {
    entity
    for split in dialogre.values()
    for example in split
    for entity in example['relation_data']['x'] + example['relation_data']['y']
}

# Register every entity under its own surface string, so a match id can be
# mapped back to the entity name through nlp.vocab.strings.
for entity in sorted(entity_names):
    matcher.add(entity, [nlp.make_doc(entity)])

In [21]:
# Bidirectional LSTM over 50-d token vectors: each direction emits 50 features,
# so every token embedding (and every averaged entity embedding) is 100-d.
encoder = nn.LSTM(50, 50, bidirectional=True)
# Scores 37 relation labels from a concatenated (x, y) entity pair: 2 * 100 = 200 inputs.
predictor = nn.Linear(200, 37)

# NOTE: this cell runs a single forward/backward pass on the first training
# example (see the `break` at the end) and takes no optimizer step, so it only
# checks that the pipeline is wired correctly; it does not train the model.
for example in dialogre["train"]:
    dialog = nlp("\n".join(example["dialog"]))
    # torch.from_numpy always yields CPU tensors, independent of the default
    # device, so move the stacked matrix to where the encoder's weights live.
    dialog_vectors = torch.stack([
        torch.from_numpy(token.vector)
        for token in dialog
    ]).to(device)
    embeddings, _ = encoder(dialog_vectors)

    # One list of mention embeddings per entity that takes part in a relation.
    entity_embeddings = {
        entity: []
        for entity in example['relation_data']['x'] + example['relation_data']['y']
    }

    for match_id, start, end in matcher(dialog):
        entity = nlp.vocab.strings[match_id]
        if entity in entity_embeddings:
            # A mention is represented by the mean of its token embeddings.
            entity_embedding = embeddings[start:end].mean(dim=0)
            entity_embeddings[entity].append(entity_embedding)

    for entity in entity_embeddings:
        if entity_embeddings[entity]:
            # Average over all mentions of the entity in this dialog.
            entity_embeddings[entity] = torch.stack(entity_embeddings[entity]).mean(dim=0)
        else:
            # The matcher found no mention of this entity: torch.stack would
            # raise on an empty list, so use a zero vector instead (the same
            # fallback the evaluation code applies).
            entity_embeddings[entity] = embeddings.new_zeros(embeddings.shape[-1])

    loss = 0
    for x, y, rid in zip(
        example['relation_data']['x'],
        example['relation_data']['y'],
        example['relation_data']['rid']
    ):
        logits = predictor(torch.cat((entity_embeddings[x], entity_embeddings[y])))
        # Multi-hot target: relation ids are 1-based, logits are 0-based.
        truth = torch.zeros_like(logits)
        truth[torch.tensor(rid) - 1] = 1
        loss += F.binary_cross_entropy_with_logits(logits, truth)

    # `loss` is still the int 0 when the example carries no relation triples;
    # only a tensor has a graph to back-propagate through.
    if torch.is_tensor(loss):
        loss.backward()
    break

In [26]:
'''
The micro F1 score computes precision and recall over all predictions pooled
together, so every individual prediction counts equally.
The macro F1 score computes the F1 score for each relationship type separately
and then averages them, so rare relationships count just as much as common
ones. This helps show whether the model performs well across all relationship
types, not just the most frequent ones.
'''

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import f1_score

# Function to evaluate the model performance
def evaluate_model(model, data_loader, device):
    """Score relation predictions on a dataset with micro and macro F1.

    For every example the dialog is tokenized with the notebook-level ``nlp``,
    encoded with the notebook-level ``encoder``, and entity mentions are located
    with the notebook-level ``matcher``. Each (x, y) entity pair is then scored
    by ``model`` and thresholded at 0.5 to obtain multi-label predictions.

    Args:
        model: Module mapping a concatenated pair of entity embeddings to one
            logit per relation label. It is switched to eval mode and left
            in that mode.
        data_loader: Iterable of examples providing ``"dialog"`` (list of
            strings) and ``"relation_data"`` with parallel ``"x"``, ``"y"`` and
            ``"rid"`` lists, where ``rid`` holds 1-based relation ids.
        device: Device on which the dialog token vectors are created.

    Returns:
        Tuple ``(micro_f1, macro_f1)``; both values are also printed.
    """
    model.eval()  # Set model to evaluation mode
    # Number of relation labels: taken from the model when it exposes it (as
    # nn.Linear does), otherwise the 37 labels this notebook is built around.
    num_labels = getattr(model, "out_features", 37)
    all_preds = []  # One boolean row per evaluated entity pair
    all_labels = []  # Matching multi-hot ground-truth rows

    with torch.no_grad():  # No gradient calculation needed during evaluation
        for example in data_loader:
            relation_data = example['relation_data']
            dialog = nlp("\n".join(example["dialog"]))  # Combine dialog lines into one text

            # Build one NumPy matrix first; creating a tensor from a list of
            # arrays is much slower.
            dialog_vectors = np.array([token.vector for token in dialog])
            dialog_vectors = torch.tensor(dialog_vectors, dtype=torch.float32, device=device)

            # Contextual token embeddings from the encoder
            embeddings, _ = encoder(dialog_vectors)

            # Mention embeddings collected per entity that takes part in a relation
            mentions_by_entity = {
                entity: [] for entity in relation_data['x'] + relation_data['y']
            }
            for match_id, start, end in matcher(dialog):
                entity = nlp.vocab.strings[match_id]
                if entity in mentions_by_entity:
                    # A mention is the mean of its token embeddings
                    mentions_by_entity[entity].append(embeddings[start:end].mean(dim=0))

            # One vector per entity: the average over its mentions, or a zero
            # vector of the encoder's output width when nothing matched.
            entity_embeddings = {
                entity: (
                    torch.stack(mentions).mean(dim=0)
                    if mentions
                    else embeddings.new_zeros(embeddings.shape[-1])
                )
                for entity, mentions in mentions_by_entity.items()
            }

            # Compute predictions for each relation triple
            for x, y, rid in zip(relation_data['x'], relation_data['y'], relation_data['rid']):
                # Score with the model that was passed in, not a notebook global
                logits = model(torch.cat((entity_embeddings[x], entity_embeddings[y])))
                preds = torch.sigmoid(logits).cpu().numpy() > 0.5  # Logits -> binary predictions
                truth = np.zeros_like(logits.cpu().numpy())  # Multi-hot ground truth

                # Relation ids are 1-based; drop ids outside the label range so
                # indexing cannot go out of bounds.
                label_indices = [r - 1 for r in rid if 0 < r <= num_labels]
                if label_indices:
                    truth[label_indices] = 1

                all_preds.append(preds)
                all_labels.append(truth)

    # Shape both collections as (num_pairs, num_labels) indicator matrices
    all_preds = np.array(all_preds).reshape(-1, num_labels)
    all_labels = np.array(all_labels).reshape(-1, num_labels)

    # zero_division=0 yields the same scores as the default "warn" setting but
    # without a warning for every label that has no true or predicted samples.
    micro_f1 = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    # Print evaluation results
    print(f"Micro F1 Score: {micro_f1:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")

    return micro_f1, macro_f1

# Example usage: pick the GPU when one is available (CPU otherwise) and score
# the predictor on the held-out test split.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
micro_f1, macro_f1 = evaluate_model(
    model=predictor,
    data_loader=dialogre["test"],
    device=device,
)


Micro F1 Score: 0.0585
Macro F1 Score: 0.0419
