In [2]:
from collections import defaultdict

def calculate_scores(output_file, ground_truth_file, classes):
    # Read the contents of the files
    with open(output_file, 'r') as f:
        output_data = f.read().strip().split('\n')

    with open(ground_truth_file, 'r') as f:
        ground_truth_data = f.read().strip().split('\n')

    # Initialize dictionaries to store TP, FP, and FN for each class
    tp = defaultdict(int)
    fp = defaultdict(int)
    fn = defaultdict(int)

    # Initialize variables to calculate accuracy
    total_correct = 0
    total_tokens = 0

    # Iterate through each token in both lists simultaneously
    for output_line, ground_truth_line in zip(output_data, ground_truth_data):
        output_tokens = output_line.strip().split()
        ground_truth_tokens = ground_truth_line.strip().split()

        total_tokens += len(ground_truth_tokens)

        for output_token, ground_truth_token in zip(output_tokens, ground_truth_tokens):
            if output_token == ground_truth_token:
                tp[output_token] += 1
                total_correct += 1
            else:
                fp[output_token] += 1
                fn[ground_truth_token] += 1

    # Calculate precision, recall, and F1-score for each class
    precision = {}
    recall = {}
    f1_score = {}
    macro_f1_score = 0.0

    for class_label in classes:
        precision[class_label] = tp[class_label] / (tp[class_label] + fp[class_label]) if (tp[class_label] + fp[class_label]) > 0 else 0
        recall[class_label] = tp[class_label] / (tp[class_label] + fn[class_label]) if (tp[class_label] + fn[class_label]) > 0 else 0
        f1_score[class_label] = 2 * (precision[class_label] * recall[class_label]) / (precision[class_label] + recall[class_label]) if (precision[class_label] + recall[class_label]) > 0 else 0
        macro_f1_score += f1_score[class_label]

    macro_f1_score /= len(classes)
    accuracy = total_correct / total_tokens

    return precision, recall, f1_score, macro_f1_score, accuracy

# Define the classes
classes = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']

# Calculate scores
precision, recall, f1_score, macro_f1_score, accuracy = calculate_scores('/kaggle/input/data-chatgpt/output_chatgpt.txt', '/kaggle/input/data-chatgpt/groundTruth.txt', classes)

# Print results
for class_label in classes:
    print(f"Class: {class_label}")
    print(f"Precision: {precision[class_label]}")
    print(f"Recall: {recall[class_label]}")
    print(f"F1-score: {f1_score[class_label]}")
    print()

print(f"Macro F1-score: {macro_f1_score}")
print(f"Accuracy: {accuracy}")


Class: O
Precision: 0.9336823734729494
Recall: 0.9870848708487084
F1-score: 0.9596412556053812

Class: B-PER
Precision: 0.7222222222222222
Recall: 0.7222222222222222
F1-score: 0.7222222222222222

Class: I-PER
Precision: 0.5882352941176471
Recall: 0.625
F1-score: 0.6060606060606061

Class: B-LOC
Precision: 0.8571428571428571
Recall: 0.8571428571428571
F1-score: 0.8571428571428571

Class: I-LOC
Precision: 0
Recall: 0
F1-score: 0

Class: B-ORG
Precision: 1.0
Recall: 0.8888888888888888
F1-score: 0.9411764705882353

Class: I-ORG
Precision: 0.8571428571428571
Recall: 1.0
F1-score: 0.923076923076923

Class: B-MISC
Precision: 0.25
Recall: 0.045454545454545456
F1-score: 0.07692307692307693

Class: I-MISC
Precision: 0.5
Recall: 0.15
F1-score: 0.23076923076923075

Macro F1-score: 0.5907791824876147
Accuracy: 0.9088098918083463


In [3]:
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Function to read the file and extract NER tags
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        data = []
        for line in lines:
            line = eval(line.strip())  # Convert string representation to list of tuples
            tags = [tag[1] for tag in line]
            data.append(tags)
        return data

# Function to calculate precision, recall, F1 score, and accuracy
def calculate_metrics(ground_truth, output):
    metrics = defaultdict(lambda: defaultdict(float))
    accuracy = 0.0
    total_samples = 0

    for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']:
        y_true = [tag == label for tags in ground_truth for tag in tags]
        y_pred = [tag == label for tags in output for tag in tags]
        precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
        metrics[label]['precision'] = precision
        metrics[label]['recall'] = recall
        metrics[label]['f1_score'] = f1_score
    
    # Calculate accuracy
    y_true_flat = [tag for tags in ground_truth for tag in tags]
    y_pred_flat = [tag for tags in output for tag in tags]
    accuracy = accuracy_score(y_true_flat, y_pred_flat)

    return metrics, accuracy

# Function to print the metrics
def print_metrics(metrics, accuracy):
    for label, values in metrics.items():
        print(f"Metrics for {label}:")
        print(f"Precision: {values['precision']:.4f}")
        print(f"Recall: {values['recall']:.4f}")
        print(f"F1 Score: {values['f1_score']:.4f}")
        print()
    print(f"Accuracy: {accuracy:.4f}")

# Paths to input files
ground_truth_file = '/kaggle/input/data-bert/gt_BERT.txt'
output_file = '/kaggle/input/data-bert/indicBERT_pred.txt'

# Read data from files
ground_truth_data = read_file(ground_truth_file)
output_data = read_file(output_file)

# Calculate metrics
metrics, accuracy = calculate_metrics(ground_truth_data, output_data)

# Print individual metrics
print_metrics(metrics, accuracy)

# Calculate and print macro F1 score
macro_f1 = sum(values['f1_score'] for values in metrics.values()) / len(metrics)
print(f"Macro F1 Score: {macro_f1:.4f}")


Metrics for O:
Precision: 0.9026
Recall: 1.0000
F1 Score: 0.9488

Metrics for B-PER:
Precision: 0.5882
Recall: 0.6250
F1 Score: 0.6061

Metrics for I-PER:
Precision: 0.9231
Recall: 0.7500
F1 Score: 0.8276

Metrics for B-LOC:
Precision: 0.9091
Recall: 0.7692
F1 Score: 0.8333

Metrics for I-LOC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for B-ORG:
Precision: 0.7143
Recall: 0.5556
F1 Score: 0.6250

Metrics for I-ORG:
Precision: 0.5000
Recall: 0.3333
F1 Score: 0.4000

Metrics for B-MISC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for I-MISC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Accuracy: 0.8893
Macro F1 Score: 0.4712


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Function to read the file and extract NER tags
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        data = []
        for line in lines:
            line = eval(line.strip())  # Convert string representation to list of tuples
            tags = [tag[1] for tag in line]
            data.append(tags)
        return data

# Function to calculate precision, recall, F1 score, and accuracy
def calculate_metrics(ground_truth, output):
    metrics = defaultdict(lambda: defaultdict(float))
    accuracy = 0.0
    total_samples = 0

    for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']:
        y_true = [tag == label for tags in ground_truth for tag in tags]
        y_pred = [tag == label for tags in output for tag in tags]
        precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
        metrics[label]['precision'] = precision
        metrics[label]['recall'] = recall
        metrics[label]['f1_score'] = f1_score
    
    # Calculate accuracy
    y_true_flat = [tag for tags in ground_truth for tag in tags]
    y_pred_flat = [tag for tags in output for tag in tags]
    accuracy = accuracy_score(y_true_flat, y_pred_flat)

    return metrics, accuracy

# Function to print the metrics
def print_metrics(metrics, accuracy):
    for label, values in metrics.items():
        print(f"Metrics for {label}:")
        print(f"Precision: {values['precision']:.4f}")
        print(f"Recall: {values['recall']:.4f}")
        print(f"F1 Score: {values['f1_score']:.4f}")
        print()
    print(f"Accuracy: {accuracy:.4f}")

# Paths to input files
ground_truth_file = '/kaggle/input/data-ner/gt_NER.txt'
output_file = '/kaggle/input/data-ner/indicNER_pred.txt'

# Read data from files
ground_truth_data = read_file(ground_truth_file)
output_data = read_file(output_file)

# Calculate metrics
metrics, accuracy = calculate_metrics(ground_truth_data, output_data)

# Print individual metrics
print_metrics(metrics, accuracy)

# Calculate and print macro F1 score
macro_f1 = sum(values['f1_score'] for values in metrics.values()) / len(metrics)
print(f"Macro F1 Score: {macro_f1:.4f}")


Metrics for O:
Precision: 0.9241
Recall: 0.9878
F1 Score: 0.9549

Metrics for B-PER:
Precision: 0.7000
Recall: 0.7778
F1 Score: 0.7368

Metrics for I-PER:
Precision: 0.7000
Recall: 1.0000
F1 Score: 0.8235

Metrics for B-LOC:
Precision: 0.7333
Recall: 0.6111
F1 Score: 0.6667

Metrics for I-LOC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for B-ORG:
Precision: 0.5455
Recall: 0.6000
F1 Score: 0.5714

Metrics for I-ORG:
Precision: 1.0000
Recall: 0.5000
F1 Score: 0.6667

Metrics for B-MISC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for I-MISC:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Accuracy: 0.8978
Macro F1 Score: 0.4911


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
