In [4]:
# import json
# from sklearn.model_selection import train_test_split



# def load_data(fp):
#     with open(fp, 'r', encoding='utf-8') as f:
#         data = json.load(f)
#     return data

# def save_data(data, fp):
#     with open(fp, 'w', encoding='utf-8') as f:
#         json.dump(data, f, ensure_ascii=False, indent=4)

# def tokenize(text):
#     """Custom tokenize method that returns tokens and their character span indices."""
#     tokens = []
#     char_indices = []
#     start_index = None
#     for i, char in enumerate(text):
#         if char.isalnum():
#             if start_index is None:
#                 start_index = i  # Mark the start of a new token
#         else:
#             if start_index is not None:
#                 # End of the current token
#                 tokens.append(text[start_index:i])
#                 char_indices.append((start_index, i - 1))
#                 start_index = None
#     if start_index is not None:
#         # Capture the last token if the string ends with an alphanumeric character
#         tokens.append(text[start_index:])
#         char_indices.append((start_index, len(text) - 1))
#     return tokens, char_indices



# def bio_encode_entities(data):
#     bio_data = {}
#     ctr = 1
#     for item in data:
#         case_id = str(ctr)
#         text = item['data']['text']
#         annotations = item['annotations'][0]['result']
        
#         tokens, char_indices = tokenize(text)
#         labels = ['O'] * len(tokens)
        
#         for annotation in annotations:
#             label = annotation['value']['labels'][0]
#             start_char = annotation['value']['start']
#             end_char = annotation['value']['end'] - 1  # Adjust to inclusive end index
#             for i, (start, end) in enumerate(char_indices):
#                 if end_char < start:
#                     break  # Past the relevant span
#                 if start_char > end:
#                     continue  # Not yet reached the relevant span
#                 if start <= start_char <= end:
#                     labels[i] = 'B_' + label
#                 if start_char <= start and end <= end_char:
#                     labels[i] = 'I_' + label if labels[i] != 'B_' + label else labels[i]
        
#         bio_text = " ".join(tokens)
#         bio_data[case_id] = {'text': bio_text, 'labels': labels}
#         ctr+=1
#     return bio_data

# # Load, process, and save data
# train_data = load_data('DATASET_TASK1/task1_train.json')
# train_split, val_split = train_test_split(train_data, test_size=0.15, random_state=42)
# train_bio = bio_encode_entities(train_split)
# val_bio = bio_encode_entities(val_split)
# test_data = load_data('DATASET_TASK1/task1_test.json')
# test_bio = bio_encode_entities(test_data)

# save_data(train_bio, 'NER_train.json')
# save_data(val_bio, 'NER_val.json')
# save_data(test_bio, 'NER_test.json')


In [5]:
import json
from sklearn.model_selection import train_test_split


def save_data(processed_data, fp, ensure_ascii=False, indent=4):
    """Serialize ``processed_data`` as JSON into the file at ``fp``.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content at ``fp`` is replaced.

    Args:
        processed_data: JSON-serializable object to write.
        fp: Path of the output file.
        ensure_ascii: Forwarded to ``json.dump``; the default ``False`` writes
            non-ASCII characters verbatim instead of as ``\\uXXXX`` escapes.
        indent: Forwarded to ``json.dump``; the default pretty-prints with
            four spaces per nesting level.
    """
    dump_options = {'ensure_ascii': ensure_ascii, 'indent': indent}
    with open(fp, mode='w', encoding='utf-8') as out_file:
        json.dump(processed_data, out_file, **dump_options)

def tokenize(text):
    """Split ``text`` into maximal runs of alphanumeric characters.

    A character belongs to a token when ``str.isalnum()`` is true for it;
    every other character (whitespace, punctuation, symbols) acts as a
    separator and is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A ``(tokens, char_indices)`` tuple of equal-length lists.
        ``tokens[k]`` is the k-th token and ``char_indices[k]`` is its
        ``(start, end)`` character span in ``text`` with an *inclusive* end,
        so ``text[start:end + 1] == tokens[k]``. Both lists are empty when
        ``text`` contains no alphanumeric character.
    """
    tokens = []
    char_indices = []
    start_index = -1  # -1 means "not currently inside a token"

    for i, char in enumerate(text):
        if char.isalnum():
            if start_index == -1:
                start_index = i  # first character of a new token
        elif start_index != -1:
            # A separator closes the token that was open.
            tokens.append(text[start_index:i])
            char_indices.append((start_index, i - 1))
            start_index = -1

    if start_index != -1:
        # The text ended while a token was still open. Slice to the end of the
        # string: slicing up to the last loop index would cut off the final
        # character, because that index is itself part of the token.
        tokens.append(text[start_index:])
        char_indices.append((start_index, len(text) - 1))

    return tokens, char_indices


def bio_encode_entities(data):
    """Convert span-annotated records into token-level BIO label sequences.

    Args:
        data: Iterable of records. For each record the text is read from
            ``item['data']['text']`` and the entity spans from
            ``item['annotations'][0]['result']``; only the first entry of
            ``annotations`` is used. Each span supplies
            ``value['start']``, ``value['end']`` (treated as exclusive) and
            ``value['labels'][0]`` (only the first label is used).
            NOTE(review): this looks like a Label Studio export - confirm.

    Returns:
        A dict keyed by the 1-based position of the record in ``data``
        (as a string). Each value is ``{'text': ..., 'labels': ...}`` where
        ``text`` is the tokens joined by single spaces and ``labels`` holds
        one tag per token: ``'O'``, ``'B_<label>'`` or ``'I_<label>'``.

    Raises:
        KeyError, IndexError: If a record is missing one of the fields above
            or has an empty ``annotations`` list.
    """
    bio_data = {}
    for case_number, item in enumerate(data, start=1):
        text = item['data']['text']
        annotations = item['annotations'][0]['result']

        tokens, char_indices = tokenize(text)
        labels = ['O'] * len(tokens)

        for annotation in annotations:
            start_char = annotation['value']['start']
            end_char = annotation['value']['end'] - 1  # inclusive end index
            label = annotation['value']['labels'][0]
            begin_tag = 'B_' + label
            inside_tag = 'I_' + label

            # Becomes True once this span has produced its B_ tag.
            opened = False
            for i, (start, end) in enumerate(char_indices):
                if end_char < start:
                    break  # token lies entirely after the span
                if start_char > end:
                    continue  # token lies entirely before the span

                # start_char <= end is guaranteed by the check above.
                covers_span_start = start <= start_char
                fully_inside = start_char <= start and end <= end_char

                if covers_span_start or (fully_inside and not opened):
                    # The first tagged token of a span always gets B_. This
                    # includes spans that start on a separator character
                    # (e.g. "$500"), where no token contains start_char and
                    # the entity would otherwise begin with an I_ tag.
                    labels[i] = begin_tag
                    opened = True
                elif fully_inside and labels[i] != begin_tag:
                    # Do not downgrade a B_ tag already set for this label.
                    labels[i] = inside_tag
                # A token that only partially overlaps the tail of the span
                # is left unchanged.

        bio_data[str(case_number)] = {'text': " ".join(tokens), 'labels': labels}
    return bio_data

# --- Load the raw annotated datasets ---------------------------------------
folder = 'DATASET_TASK1/'
file1 = f'{folder}task1_train.json'  # annotated training records
file2 = f'{folder}task1_test.json'   # annotated test records

with open(file1, mode='r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(file2, mode='r', encoding='utf-8') as f:
    test_data = json.load(f)

# --- Hold out 15% of the training records for validation --------------------
# The fixed random_state keeps the split reproducible across runs.
train_split, val_split = train_test_split(
    train_data,
    test_size=0.15,
    random_state=42,
)

# --- BIO-encode every split --------------------------------------------------
train_bio = bio_encode_entities(train_split)
val_bio = bio_encode_entities(val_split)
test_bio = bio_encode_entities(test_data)

# --- Persist the encoded splits as JSON in the working directory ------------
save_data(train_bio, 'NER_train.json')
save_data(val_bio, 'NER_val.json')
save_data(test_bio, 'NER_test.json')