## Span Identification evaluation
Accuracy on character level for model with initial data: 25.90
Accuracy on character level for model with initial data: 56.66

In [2]:
import pandas as pd
import numpy as np
import csv

In [3]:
# Function for open files
def open_file(file):
    result_dict = {}
    with open(file, "r") as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
          if len(row)>=3:
            id_ = row[0]
            start = int(row[1])
            end = int(row[2])
            if id_ not in result_dict:
                result_dict[id_] = [(start, end)]
            else:
                result_dict[id_].append((start, end))
    return result_dict

In [4]:
# Functions for merged data in true files (pred is already merged)
def merge_overlapping(indices_list):
    """
    Merges overlapping indices and sorts indices from list of tuples.
    """
    if not indices_list:
        return []

    indices_list = sorted(indices_list)
    merged = [indices_list[0]]

    for current in indices_list[1:]:
        last = merged[-1]
        if current[0] <= last[1]:  # Overlap condition
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)

    return merged

def merge_all_intervals(data_dict):
    """
    Merges overlapping intervals for each key in the dictionary.
    """
    merged_dict = {}
    for key, intervals in data_dict.items():
        merged_dict[key] = merge_overlapping(intervals)
    return merged_dict

In [5]:
# Convert -1 to 0 in pred file
def beginning_shifting(dict):
    for key, value in dict.items():
        # Iterate through each tuple in the list
        for i, (start, end) in enumerate(value):
            # Check if the start value is -1
            if start == -1:
                # Replace -1 with 0
                dict[key][i] = (0, end)
    return dict        

In [6]:
# Statistics
def statistics(data_dict):
    # Calculate the total number of spans 
    total_spans = sum(len(spans) for spans in data_dict.values())
    # Calculate the average number of spans per key
    average_spans_per_key = total_spans / len(data_dict)
    print(f"Average number of spans per post: {average_spans_per_key:.2f}")
    # Calculate the total length between end and start for all spans
    total_length = sum(end - start for spans in data_dict.values() for start, end in spans)
    # Calculate the average length
    average_length = total_length / total_spans
    print(f"Average length of the span: {average_length:.2f}")

# Compare indices overlaping

#### For SI with initial data

#### True file

In [9]:
true_file = "C:/N/st/UCU/5sem2024_DIPLOMA/Manipulation/report/diploma_final_files/manipulation-techniques-detection-in-news-uk/bert/datasets/SI.labels_true.txt"
true_dict = open_file(true_file)
print(len(true_dict))
true_dict

572


{'209813': [(0, 3), (19, 36), (49, 65), (235, 249)],
 '210028': [(0, 6), (0, 6)],
 '210117': [(174, 181)],
 '210306': [(151, 168)],
 '210365': [(861, 901)],
 '210441': [(0, 39)],
 '210456': [(220, 235), (329, 336), (0, 29)],
 '210467': [(147, 205)],
 '210725': [(0, 50)],
 '210903': [(201, 209)],
 '210911': [(63, 65), (0, 36)],
 '211006': [(183, 191), (0, 28)],
 '211041': [(248, 302)],
 '211142': [(170, 175), (278, 280)],
 '211258': [(440, 449), (0, 4), (271, 314)],
 '211326': [(181, 197)],
 '211430': [(64, 79), (13, 29), (106, 180), (116, 180)],
 '211495': [(0, 61),
  (209, 216),
  (306, 326),
  (306, 334),
  (506, 510),
  (499, 517),
  (571, 621)],
 '211692': [(45, 53)],
 '211919': [(6, 12), (430, 481)],
 '212018': [(0, 28), (108, 112), (158, 160), (129, 156), (158, 159)],
 '212123': [(240, 250), (240, 250)],
 '212219': [(0, 31)],
 '212348': [(1, 7)],
 '315138': [(6, 14), (6, 14)],
 '315330': [(266, 308)],
 '315353': [(238, 273), (596, 600), (238, 273), (596, 600)],
 '315420': [(362, 

In [10]:
merged_true_dict = merge_all_intervals(true_dict)
merged_true_dict

{'209813': [(0, 3), (19, 36), (49, 65), (235, 249)],
 '210028': [(0, 6)],
 '210117': [(174, 181)],
 '210306': [(151, 168)],
 '210365': [(861, 901)],
 '210441': [(0, 39)],
 '210456': [(0, 29), (220, 235), (329, 336)],
 '210467': [(147, 205)],
 '210725': [(0, 50)],
 '210903': [(201, 209)],
 '210911': [(0, 36), (63, 65)],
 '211006': [(0, 28), (183, 191)],
 '211041': [(248, 302)],
 '211142': [(170, 175), (278, 280)],
 '211258': [(0, 4), (271, 314), (440, 449)],
 '211326': [(181, 197)],
 '211430': [(13, 29), (64, 79), (106, 180)],
 '211495': [(0, 61), (209, 216), (306, 334), (499, 517), (571, 621)],
 '211692': [(45, 53)],
 '211919': [(6, 12), (430, 481)],
 '212018': [(0, 28), (108, 112), (129, 156), (158, 160)],
 '212123': [(240, 250)],
 '212219': [(0, 31)],
 '212348': [(1, 7)],
 '315138': [(6, 14)],
 '315330': [(266, 308)],
 '315353': [(238, 273), (596, 600)],
 '315420': [(362, 380)],
 '315425': [(3, 34), (47, 60), (156, 174)],
 '315436': [(132, 137), (140, 218), (249, 324), (369, 407)],
 

In [11]:
true_stata = statistics(merged_true_dict)
true_stata

Average number of spans per post: 1.77
Average length of the span: 30.29


#### Pred file

In [7]:
pred_file = "C:/N/st/UCU/5sem2024_DIPLOMA/Manipulation/report/diploma_final_files/manipulation-techniques-detection-in-news-uk/bert/datasets/SI.labels_pred.txt"
pred_dict = open_file(pred_file)
print(len(pred_dict))
pred_dict = beginning_shifting(pred_dict)

pred_dict

572


{'1062476': [(0, 290)],
 '1062480': [(0, 258)],
 '1062575': [(0, 249)],
 '1062722': [(0, 432)],
 '1062744': [(0, 258)],
 '1062751': [(0, 690)],
 '1063058': [(0, 358)],
 '1063082': [(0, 248)],
 '1063177': [(0, 250)],
 '1063217': [(0, 338)],
 '1063289': [(0, 206), (208, 237)],
 '1063460': [(0, 345)],
 '1063475': [(0, 336)],
 '1063526': [(0, 241)],
 '1063603': [(0, 271)],
 '1063878': [(0, 269)],
 '1063931': [(0, 262), (275, 304)],
 '1063951': [(0, 330)],
 '1063972': [(0, 202)],
 '1064259': [(0, 240)],
 '1064382': [(0, 324)],
 '1064385': [(0, 264)],
 '1064554': [(0, 235)],
 '1064725': [(0, 288)],
 '1064774': [(0, 227)],
 '1064815': [(0, 273)],
 '1064924': [(0, 346)],
 '1065013': [(0, 253)],
 '1065063': [(0, 292)],
 '1065091': [(0, 192)],
 '1065132': [(0, 308)],
 '1065168': [(0, 308)],
 '1065183': [(0, 329)],
 '1065286': [(0, 277)],
 '1065320': [(0, 319)],
 '1065403': [(0, 273)],
 '1065537': [(0, 243)],
 '1065950': [(0, 224)],
 '1066016': [(0, 274)],
 '1066039': [(0, 342)],
 '1066110': [(0,

In [8]:
pred_stata = statistics(pred_dict)
pred_stata

Average number of spans per post: 1.16
Average length of the span: 308.91


In [12]:
# Standart overlap
overlaps = []

# Iterate through each ID in the true dictionary
for id, true_spans in true_dict.items():
    
    # Check if the ID is present in the predicted dictionary
    if id in pred_dict:
        
        # Get the predicted spans for the current ID
        pred_spans = pred_dict[id]
        
        true_length = 0
        pred_length = 0
    
         # Iterate through each true span
        for true_span in true_spans:
            max_overlap = 0
             
             # Extract start and end positions of the true span
            true_start, true_end = true_span

            
            # Iterate through each predicted span
            for pred_span in pred_spans:
                # Extract start and end positions of the predicted span
                pred_start, pred_end = pred_span
                  
                 # Calculate the overlap between the true and predicted spans
                overlap = max(0, min(true_end, pred_end) - max(true_start, pred_start))
            overlaps.append(overlap)

# # Calculate the accuracy
average_overlap = sum(overlaps) / len(overlaps) if overlaps else 0

print(f"Accuracy on character level: {average_overlap:.2f}")

Accuracy on character level: 25.90


#### For SI with upsampling

In [13]:
true_file = "C:/N/st/UCU/5sem2024_DIPLOMA/Manipulation/report/diploma_final_files/manipulation-techniques-detection-in-news-uk/bert/datasets/SI.labels_true_upsampling.txt"
true_dict = open_file(true_file)
print(len(true_dict))
merged_true_dict = merge_all_intervals(true_dict)
statistics(merged_true_dict)

595
Average number of spans per post: 1.37
Average length of the span: 62.65


In [15]:
pred_file = "C:/N/st/UCU/5sem2024_DIPLOMA/Manipulation/report/diploma_final_files/manipulation-techniques-detection-in-news-uk/bert/datasets/SI.labels_pred_upsampling.txt"
pred_dict = open_file(pred_file)
print(len(pred_dict))
pred_dict = beginning_shifting(pred_dict)
statistics(pred_dict)

595
Average number of spans per post: 1.27
Average length of the span: 198.39


In [17]:
# Standart overlap
overlaps = []

# Iterate through each ID in the true dictionary
for id, true_spans in true_dict.items():
    
    # Check if the ID is present in the predicted dictionary
    if id in pred_dict:
        
        # Get the predicted spans for the current ID
        pred_spans = pred_dict[id]
        
        true_length = 0
        pred_length = 0
    
         # Iterate through each true span
        for true_span in true_spans:
            max_overlap = 0
             
             # Extract start and end positions of the true span
            true_start, true_end = true_span

            
            # Iterate through each predicted span
            for pred_span in pred_spans:
                # Extract start and end positions of the predicted span
                pred_start, pred_end = pred_span
                  
                 # Calculate the overlap between the true and predicted spans
                overlap = max(0, min(true_end, pred_end) - max(true_start, pred_start))
            overlaps.append(overlap)

# # Calculate the accuracy
average_overlap = sum(overlaps) / len(overlaps) if overlaps else 0

print(f"Accuracy on character level: {average_overlap:.2f}")

Accuracy on character level: 56.66
