### Annotation Analysis - Label: Negative Word

* Number of papers annotated: 22

In [1]:
import json
import os
import re
import copy
import random
from sklearn.model_selection import KFold

In [2]:
# Define the directory path containing the annotation.json files and the targeted label
directory_path = './data'  # Update this to your actual directory path
targeted_label = 'NEGATIVE WORD'
random.seed(11824061)

In [3]:
def read_json_file(file_path):
    """Function to read json files"""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [4]:
def extract_words_from_category(text, entities, category):
    """Function to extract all words which are tagged with a given category"""
    words = []
    for start, end, cat in entities:
        if cat == category:
            words.append(text[start:end])
    return words

In [5]:
def extract_category_words_all(data):
    """Iterate through annotations and extract words for each category"""
    category_words_dict = {}
    for annotation in data['annotations']:
        if annotation is None:
            continue
        text = annotation[0]
        entities = annotation[1]['entities']
        for category in set([entity[2] for entity in entities]): 
            words = extract_words_from_category(text, entities, category)
            if category in category_words_dict:
                category_words_dict[category].extend(words)
            else:
                category_words_dict[category] = words
    return category_words_dict

In [6]:
# Read all annotations from the specified directory
annotations = []
for filename in os.listdir(directory_path):
    if filename.endswith('.json'):
        file_path = os.path.join(directory_path, filename)
        data = read_json_file(file_path)
        annotations.append(data)

In [7]:
# Remove None entries from annotations
def remove_null_entries(annotations):
    for paper in annotations:
        paper['annotations'] = [annotation for annotation in paper['annotations'] if annotation is not None]
remove_null_entries(annotations)

In [8]:
# Function to clean keywords
def clean_word(word):
    word = re.sub(r'-[\r\n]+', '', word)
    word = word.replace('\n', '').replace('\r', '') 
    word = word.rstrip('.')
    word = word.strip()
    return word

In [9]:
# Function to remove entities from annotations
def remove_entities(annotations):
    for paper in annotations:
        for annotation in paper['annotations']:
            annotation[1]['entities'] = []
    return annotations

In [10]:
# Function to clean text before keyword search
def clean_text_before_keyword_search(text):
    replacements = [
        ("-\r\n", "   "),  
        ("\r\n", "  "),  
        ("\r", " "),  
        ("\n", " "),  
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    return text

In [11]:
# Function to find keyword positions in the text
def find_keyword_positions(keywordlist, text, label):
    annotations = set()
    for keyword in keywordlist:
        pattern = re.compile(r'\b' + r'\s*'.join(re.escape(char) for char in keyword) + r'\b', re.IGNORECASE)
        matches = pattern.finditer(text)
        for match in matches:
            start = match.start()
            end = match.end()
            if text[end:end + 1] == ".":
                end += 1
            annotations.add((start, end, label))
    return [list(annotation) for annotation in annotations]

In [12]:
def annotate_with_keyword_list(data, keyword_list, entity_type="NEGATIVE WORD"):
    for paper in data:
        for annotation in paper['annotations']:
            text = annotation[0]
            annotation[1]['entities'] = find_keyword_positions(keyword_list, text, entity_type)

In [13]:
# Function to filter annotations by label
def filter_annotations(data, label):
    for paper in data:
        for annotation in paper['annotations']:
            filtered_entities = [entity for entity in annotation[1]['entities'] if entity[2] == label]
            annotation[1]['entities'] = filtered_entities if filtered_entities else []
    return data

In [14]:
# Function to create new structure for annotations
def create_new_structure_for_annotations(data):
    output_list = []
    for paper in data:
        for paragraph in paper["annotations"]:
            text_dict = {"text": paragraph[0], "entities": paragraph[1]["entities"]}
            output_list.append(text_dict)
    return output_list

In [15]:
# Function to calculate evaluation metrics
def calculate_metrics(predictions, ground_truth):
    def extract_entities(data):
        entities = set()
        for key, item in enumerate(data):
            for entity in item["entities"]:
                start, end, label = entity
                entities.add((key, start, end, label))
        return entities

    pred_entities = extract_entities(predictions)
    gt_entities = extract_entities(ground_truth)

    true_positives = len(pred_entities & gt_entities)
    false_positives = len(pred_entities - gt_entities)
    false_negatives = len(gt_entities - pred_entities)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0

    return precision, recall, f1_score, accuracy

In [16]:
# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=11824061)
precisions, recalls, f1_scores, accuracies = [], [], [], []

for train_index, test_index in kf.split(annotations):
    annotations_train = [annotations[i] for i in train_index]
    annotations_test = [annotations[i] for i in test_index]
    
    # Create a keyword list from training data annotations
    keyword_list = []
    for annotation in annotations_train:
        category_words_dict = extract_category_words_all(annotation)
        keyword_list.extend(category_words_dict.get(targeted_label, []))
    
    keyword_list = [clean_word(word) for word in keyword_list]
    keyword_list = list(set(keyword_list))
    
    annotations_ground_truth = copy.deepcopy(annotations_test)
    annotations_pred = remove_entities(copy.deepcopy(annotations_test))
    
    annotate_with_keyword_list(annotations_pred, keyword_list, targeted_label)
    
    annotations_ground_truth_filtered = filter_annotations(annotations_ground_truth, targeted_label)
    
    ground_truth = create_new_structure_for_annotations(annotations_ground_truth_filtered)
    predictions = create_new_structure_for_annotations(annotations_pred)
    
    precision, recall, f1, accuracy = calculate_metrics(predictions, ground_truth)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    accuracies.append(accuracy)

print(f'Precision: {sum(precisions) / len(precisions)}')
print(f'Recall: {sum(recalls) / len(recalls)}')
print(f'F1 Score: {sum(f1_scores) / len(f1_scores)}')
print(f'Accuracy: {sum(accuracies) / len(accuracies)}')

Precision: 0.2848199515323574
Recall: 0.5578833730252097
F1 Score: 0.3757449458428427
Accuracy: 0.24126806959707858


### Version 2

In [3]:
import json
import os
import re
import copy
import random
import nltk
from sklearn.model_selection import KFold
from nltk.corpus import wordnet

# Ensure NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define the directory path containing the annotation.json files and the targeted label
directory_path = './data'  # Update this to your actual directory path
targeted_label = 'NEGATIVE WORD'
random.seed(11824061)

def read_json_file(file_path):
    """Function to read json files"""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def extract_words_from_category(text, entities, category):
    """Function to extract all words which are tagged with a given category"""
    words = []
    for start, end, cat in entities:
        if cat == category:
            words.append(text[start:end])
    return words

def extract_category_words_all(data):
    """Iterate through annotations and extract words for each category"""
    category_words_dict = {}
    for annotation in data['annotations']:
        if annotation is None:
            continue
        text = annotation[0]
        entities = annotation[1]['entities']
        for category in set([entity[2] for entity in entities]): 
            words = extract_words_from_category(text, entities, category)
            if category in category_words_dict:
                category_words_dict[category].extend(words)
            else:
                category_words_dict[category] = words
    return category_words_dict

# Read all annotations from the specified directory
annotations = []
for filename in os.listdir(directory_path):
    if filename.endswith('.json'):
        file_path = os.path.join(directory_path, filename)
        data = read_json_file(file_path)
        annotations.append(data)

# Remove None entries from annotations
def remove_null_entries(annotations):
    for paper in annotations:
        paper['annotations'] = [annotation for annotation in paper['annotations'] if annotation is not None]
remove_null_entries(annotations)

# Function to clean keywords
def clean_word(word):
    word = re.sub(r'-[\r\n]+', '', word)
    word = word.replace('\n', '').replace('\r', '') 
    word = word.rstrip('.')
    word = word.strip()
    return word

# Function to expand keywords using WordNet
def expand_keywords_with_wordnet(keyword_list):
    expanded_keywords = set(keyword_list)
    for word in keyword_list:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                expanded_keywords.add(lemma.name())
    return list(expanded_keywords)

# Function to remove entities from annotations
def remove_entities(annotations):
    for paper in annotations:
        for annotation in paper['annotations']:
            annotation[1]['entities'] = []
    return annotations

# Function to clean text before keyword search
def clean_text_before_keyword_search(text):
    replacements = [
        ("-\r\n", "   "),  
        ("\r\n", "  "),  
        ("\r", " "),  
        ("\n", " "),  
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    return text

# Function to find keyword positions in the text
def find_keyword_positions(keywordlist, text, label):
    annotations = set()
    for keyword in keywordlist:
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        matches = pattern.finditer(text)
        for match in matches:
            start = match.start()
            end = match.end()
            annotations.add((start, end, label))
    return [list(annotation) for annotation in annotations]

def annotate_with_keyword_list(data, keyword_list, entity_type="NEGATIVE WORD"):
    for paper in data:
        for annotation in paper['annotations']:
            text = clean_text_before_keyword_search(annotation[0])
            annotation[1]['entities'] = find_keyword_positions(keyword_list, text, entity_type)

# Function to filter annotations by label
def filter_annotations(data, label):
    for paper in data:
        for annotation in paper['annotations']:
            filtered_entities = [entity for entity in annotation[1]['entities'] if entity[2] == label]
            annotation[1]['entities'] = filtered_entities if filtered_entities else []
    return data

# Function to create new structure for annotations
def create_new_structure_for_annotations(data):
    output_list = []
    for paper in data:
        for paragraph in paper["annotations"]:
            text_dict = {"text": paragraph[0], "entities": paragraph[1]["entities"]}
            output_list.append(text_dict)
    return output_list

# Function to calculate evaluation metrics
def calculate_metrics(predictions, ground_truth):
    def extract_entities(data):
        entities = set()
        for key, item in enumerate(data):
            for entity in item["entities"]:
                start, end, label = entity[:3]
                entities.add((key, start, end, label))
        return entities

    pred_entities = extract_entities(predictions)
    gt_entities = extract_entities(ground_truth)

    true_positives = len(pred_entities & gt_entities)
    false_positives = len(pred_entities - gt_entities)
    false_negatives = len(gt_entities - pred_entities)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0

    return precision, recall, f1_score, accuracy

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=11824061)
precisions, recalls, f1_scores, accuracies = [], [], [], []

for train_index, test_index in kf.split(annotations):
    annotations_train = [annotations[i] for i in train_index]
    annotations_test = [annotations[i] for i in test_index]
    
    # Create a keyword list from training data annotations
    keyword_list = []
    for annotation in annotations_train:
        category_words_dict = extract_category_words_all(annotation)
        keyword_list.extend(category_words_dict.get(targeted_label, []))
    
    keyword_list = [clean_word(word) for word in keyword_list]
    keyword_list = list(set(keyword_list))
    
    # Expand keyword list with WordNet
    keyword_list = expand_keywords_with_wordnet(keyword_list)
    
    annotations_ground_truth = copy.deepcopy(annotations_test)
    annotations_pred = remove_entities(copy.deepcopy(annotations_test))
    
    annotate_with_keyword_list(annotations_pred, keyword_list, targeted_label)
    
    annotations_ground_truth_filtered = filter_annotations(annotations_ground_truth, targeted_label)
    
    ground_truth = create_new_structure_for_annotations(annotations_ground_truth_filtered)
    predictions = create_new_structure_for_annotations(annotations_pred)
    
    precision, recall, f1, accuracy = calculate_metrics(predictions, ground_truth)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    accuracies.append(accuracy)

print(f'Precision: {sum(precisions) / len(precisions)}')
print(f'Recall: {sum(recalls) / len(recalls)}')
print(f'F1 Score: {sum(f1_scores) / len(f1_scores)}')
print(f'Accuracy: {sum(accuracies) / len(accuracies)}')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mimis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mimis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Precision: 0.10562318434684195
Recall: 0.561773801564279
F1 Score: 0.1773787961126388
Accuracy: 0.09832872722694426
