In [19]:
import ast
import re

from sklearn.metrics import precision_recall_fscore_support

In [20]:
def read_model_output(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text as a single string.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Path to the text file that holds the model's raw prediction output
file_path = 'predictions_output2.txt'

# Read the whole prediction file into a single string
model_output_text = read_model_output(file_path)

In [21]:
def parse_conll(file_path):
    """Read a whitespace-separated CoNLL-style file into a ``{token: label}`` dict.

    Each non-blank line is split on whitespace. The first column is taken as
    the token and the last column as its label, so files that carry extra
    columns between the two are accepted. Lines with fewer than two columns
    have no label and are skipped rather than raising ``ValueError`` during
    unpacking.

    Note: the result is keyed by token text, so a token that occurs several
    times in the file keeps only the label of its last occurrence.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        Dict mapping each token string to its label string.
    """
    token_to_label = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            columns = line.split()
            # Blank lines give [] and label-less lines give one column; skip both.
            if len(columns) < 2:
                continue
            token_to_label[columns[0]] = columns[-1]
    return token_to_label

def parse_predictions(model_output_text):
    """Collect token -> label predictions from the model's text output.

    The text is split on the marker ``'Text:'``. Within each resulting
    segment the first line (the text itself) is ignored, and every following
    line that starts with ``{`` is parsed as a Python dict literal and merged
    into the result. When the same key appears more than once, the later
    entry overwrites the earlier one.

    Lines are parsed with ``ast.literal_eval`` rather than ``eval`` so that
    content read from a file can never execute code; only plain literals
    (strings, numbers, dicts, ...) are accepted.

    Args:
        model_output_text: Full contents of the prediction output file.

    Returns:
        Dict with the merged contents of every parsed prediction line.

    Raises:
        ValueError, SyntaxError: If a line starting with ``{`` is not a valid
            Python literal.
    """
    predictions = {}
    # Everything before the first 'Text:' marker is not a prediction segment.
    segments = model_output_text.strip().split('Text:')
    for segment in segments[1:]:
        lines = segment.strip().split('\n')
        # lines[0] is the text that follows 'Text:'; predictions come after it.
        for line in lines[1:]:
            candidate = line.strip()
            if candidate.startswith('{'):
                predictions.update(ast.literal_eval(candidate))
    return predictions


In [22]:
# Load the gold labels and the model predictions; both are dicts keyed by token text
gold_labels = parse_conll('test_d.conll')
predicted_labels = parse_predictions(model_output_text)

# Score only tokens that have both a gold and a predicted label. Building both
# lists in one pass guarantees y_true[i] and y_pred[i] refer to the same token.
y_true, y_pred = [], []
for token, gold_label in gold_labels.items():
    if token in predicted_labels:
        y_true.append(gold_label)
        y_pred.append(predicted_labels[token])

# Check which gold labels were never predicted.
# A separate name is used for the set of predicted label values so that
# `predicted_labels` keeps meaning the token -> label dict.
all_labels = set(y_true)
predicted_label_set = set(y_pred)
unpredicted_labels = all_labels.difference(predicted_label_set)

if unpredicted_labels:
    print(f"Warning: No predictions were made for the following labels: {unpredicted_labels}")

# Overall scores: per-label metrics averaged with weights proportional to label support.
# zero_division=0 reports 0 (instead of warning) for labels with no predicted/true samples.
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)


print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

labels = sorted(set(y_true + y_pred))  # Combine and sort labels to ensure consistent order

# Without `average`, the call returns one value per entry of `labels`, in that order
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, labels=labels, zero_division=0)

# Print metrics for each label
for label, p, r, f in zip(labels, precision, recall, f1):
    print(f"Label: {label} - Precision: {p:.4f}, Recall: {r:.4f}, F1-Score: {f:.4f}")

Precision: 0.8476520145643108
Recall: 0.8621574186580262
F1 Score: 0.8532131870749726
Label: ! - Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Label: , - Precision: 0.6521, Recall: 0.5040, F1-Score: 0.5686
Label: - - Precision: 0.3333, Recall: 0.2247, F1-Score: 0.2685
Label: . - Precision: 0.6768, Recall: 0.6282, F1-Score: 0.6516
Label: ... - Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Label: : - Precision: 0.4348, Recall: 0.2128, F1-Score: 0.2857
Label: ; - Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Label: ? - Precision: 0.3810, Recall: 0.3200, F1-Score: 0.3478
Label: B - Precision: 0.9105, Recall: 0.9548, F1-Score: 0.9321
