## Span Identification baseline

In [1]:
import pandas as pd
import os
import random
import ast
import csv

In [2]:
# Function for reading a tab-separated span label file
def open_file(file):
    """
    Reads a tab-separated label file into a dictionary of spans.

    Each row is expected to hold an id in the first column and integer
    start/end offsets in the second and third columns. Rows with fewer than
    three columns (including blank lines) are skipped; columns after the
    third are ignored.

    Args:
        file: Path of the tab-separated file to read.

    Returns:
        dict mapping each id (str) to a list of (start, end) integer tuples,
        in the order the rows appear in the file.

    Raises:
        ValueError: If a start or end field cannot be converted to int.
    """
    result_dict = {}
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file, "r", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            if len(row) < 3:
                continue
            span = (int(row[1]), int(row[2]))
            result_dict.setdefault(row[0], []).append(span)
    return result_dict

In [6]:
# Functions for merging overlapping spans loaded from the true-label file
def merge_overlapping(indices_list):
    """
    Sorts a list of (start, end) tuples and collapses overlapping ones.

    Two spans are merged when the later one starts at or before the end of
    the earlier one, so spans that merely touch are merged as well.
    Returns an empty list for empty (or falsy) input.
    """
    if not indices_list:
        return []

    ordered = sorted(indices_list)
    result = [ordered[0]]

    for span in ordered[1:]:
        tail = result[-1]
        if span[0] > tail[1]:
            # Gap between the spans: start a new merged span.
            result.append(span)
            continue
        # Overlapping or touching: extend the span collected last.
        result[-1] = (tail[0], max(tail[1], span[1]))

    return result

def merge_all_intervals(data_dict):
    """
    Applies merge_overlapping to the interval list stored under every key.

    Returns a new dictionary with the same keys; the input is not modified.
    """
    return {
        doc_key: merge_overlapping(span_list)
        for doc_key, span_list in data_dict.items()
    }

In [8]:
# Location of the true span labels. The default is the original
# machine-specific path; set the SI_LABELS_TRUE_PATH environment variable to
# run this notebook on another machine without editing the cell.
true_file = os.environ.get(
    "SI_LABELS_TRUE_PATH",
    "C:/N/st/UCU/5sem2024_DIPLOMA/Manipulation/report/diploma_final_files/manipulation-techniques-detection-in-news-uk/bert/datasets/SI.labels_true.txt",
)
true_dict = open_file(true_file)
# Overlapping true spans are merged per id before being displayed.
merged_true_dict = merge_all_intervals(true_dict)
merged_true_dict

{'209813': [(0, 3), (19, 36), (49, 65), (235, 249)],
 '210028': [(0, 6)],
 '210117': [(174, 181)],
 '210306': [(151, 168)],
 '210365': [(861, 901)],
 '210441': [(0, 39)],
 '210456': [(0, 29), (220, 235), (329, 336)],
 '210467': [(147, 205)],
 '210725': [(0, 50)],
 '210903': [(201, 209)],
 '210911': [(0, 36), (63, 65)],
 '211006': [(0, 28), (183, 191)],
 '211041': [(248, 302)],
 '211142': [(170, 175), (278, 280)],
 '211258': [(0, 4), (271, 314), (440, 449)],
 '211326': [(181, 197)],
 '211430': [(13, 29), (64, 79), (106, 180)],
 '211495': [(0, 61), (209, 216), (306, 334), (499, 517), (571, 621)],
 '211692': [(45, 53)],
 '211919': [(6, 12), (430, 481)],
 '212018': [(0, 28), (108, 112), (129, 156), (158, 160)],
 '212123': [(240, 250)],
 '212219': [(0, 31)],
 '212348': [(1, 7)],
 '315138': [(6, 14)],
 '315330': [(266, 308)],
 '315353': [(238, 273), (596, 600)],
 '315420': [(362, 380)],
 '315425': [(3, 34), (47, 60), (156, 174)],
 '315436': [(132, 137), (140, 218), (249, 324), (369, 407)],
 

#### Generate random spans and check overlap at the character level

In [23]:
def generate_random_spans(num_spans, text_length, rng=None):
    """
    Generates random spans within the given text length.

    Each span is a (start, end) tuple with 0 <= start < end <= text_length.

    Args:
        num_spans: Number of spans to draw; zero or a negative value yields
            an empty list.
        text_length: Upper bound for span ends; must be at least 1 when any
            span is requested.
        rng: Optional object exposing randint(a, b), e.g. random.Random(seed),
            for reproducible sampling. Defaults to the module-level random
            generator, which keeps the original behaviour.

    Returns:
        list of (start, end) tuples of length max(num_spans, 0).

    Raises:
        ValueError: If spans are requested and text_length is smaller than 1.
    """
    if num_spans > 0 and text_length < 1:
        raise ValueError(
            f"text_length must be at least 1 to draw spans, got {text_length}"
        )

    source = random if rng is None else rng
    spans = []
    for _ in range(num_spans):
        # start is drawn first so that end can be forced to lie strictly after it.
        start = source.randint(0, text_length - 1)
        end = source.randint(start + 1, text_length)
        spans.append((start, end))
    return spans

def calculate_overlaps(true_dict, text_length):
    """
    Scores randomly generated spans against the true spans.

    For every id, as many random spans are generated as there are true spans
    (via generate_random_spans with the given text_length). Each true span is
    scored by its largest overlap with any of the random spans of the same
    id, and the mean of those scores over all true spans is returned.
    Returns 0 when there are no true spans at all.
    """
    # Draw the random predictions for every id up front.
    random_preds = {
        doc_id: generate_random_spans(len(gold_spans), text_length)
        for doc_id, gold_spans in true_dict.items()
    }

    # Best overlap per true span; non-overlapping pairs contribute 0.
    best_overlaps = [
        max(
            (
                max(0, min(gold_end, pred_end) - max(gold_start, pred_start))
                for pred_start, pred_end in random_preds[doc_id]
            ),
            default=0,
        )
        for doc_id, gold_spans in true_dict.items()
        for gold_start, gold_end in gold_spans
    ]

    if not best_overlaps:
        return 0
    return sum(best_overlaps) / len(best_overlaps)

# NOTE(review): the second argument is len(true_dict), i.e. the number of ids,
# but calculate_overlaps forwards it to generate_random_spans as text_length,
# so every random span is drawn from [0, len(true_dict)]. Confirm whether an
# actual text length was intended here.
# NOTE(review): this scores against true_dict, not merged_true_dict — confirm
# which one the baseline should use.
average_overlap = calculate_overlaps(true_dict, len(true_dict))
# The value printed is the mean of the best per-span overlaps returned by
# calculate_overlaps (measured in offset units), not a ratio between 0 and 1.
print(f"Accuracy on character level: {average_overlap:.2f}")

Accuracy on character level: 11.26
