In [1]:
import json
# from TextUtils import CustomUtils 
import os

import spacy
from spacy import displacy
from spacy.tokens import Span

### Abrir archivo con datos en formato jsonl tras etiquetado en doccano

In [2]:
# open the jsonl file in all.jsonl

def read_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly: without `encoding`, open() falls back to the
    # platform locale (e.g. cp1252 on Windows), which corrupts accented text.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra trailing newline) instead of
            # letting json.loads fail on an empty document.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# Load every annotated record from 'all.jsonl' into memory as a list.
data = read_jsonl('all.jsonl')

# Bare last expression: the notebook displays the number of records loaded.
len(data)

454

In [3]:
# Pretty-print the first record to inspect its fields.
# json.dumps escapes non-ASCII characters by default (ensure_ascii=True), so
# accented letters show up as \uXXXX sequences in the output.

print(json.dumps(data[0], indent=4))

{
    "id": "1.2.840.113619.2.373.202306131802469930109650",
    "text": "Ambas mamas son densas y heterog\u00e9neas.\nMicrocalcificaciones aisladas.\nN\u00f3dulo periareolar derecho bien delimitadp de 10mm.\nN\u00f3dulo calcifcado derecho.\nNo observo microcalcificaciones sospechosas agrupadas ni im\u00e1genes espiculadas.\nRegiones axilares sin adenopat\u00edas.\nImpresi\u00f3n: Mamas densas y n\u00f3dulo derecho presuntamente benigno.\nSugiero ecograf\u00eda mamaria.\nBI-RADS 3 ACR C",
    "Comments": [],
    "entities": [
        {
            "id": 4762,
            "label": "DENS",
            "start_offset": 0,
            "end_offset": 37
        },
        {
            "id": 4763,
            "label": "HALL_presente",
            "start_offset": 39,
            "end_offset": 59
        },
        {
            "id": 4764,
            "label": "CARACT",
            "start_offset": 60,
            "end_offset": 68
        },
        {
            "id": 4765,
            "label"

### Remover la llave de "Comments" de los datos (innecesaria) 

In [26]:
# Strip the "Comments" entry from each record in place; pop() with a default
# leaves records that never had the key untouched.

for record in data:
    record.pop('Comments', None)

### Visualizando con displacy (solo entidades)

In [4]:
def convert_data_to_spacy_format(data):
    """Convert one annotated record into a dict of character-offset spans.

    Args:
        data: A record with a 'text' string and an 'entities' list whose items
            carry 'id', 'label', 'start_offset' and 'end_offset'. An optional
            'relations' list holds items with 'from_id', 'to_id' and 'type'.

    Returns:
        dict: ``{'text': ..., 'ents': [...], 'relations': [...]}``. Each item
        of 'ents' is ``{'start', 'end', 'label'}`` for one entity. Each item of
        'relations' spans from the start of the 'from_id' entity to the end of
        the 'to_id' entity and is labelled with the relation's 'type'.

    Raises:
        KeyError: If 'text', 'entities' or a required entity/relation field is
            missing.
        ValueError: If a relation references an entity id that is not present
            in the record.
    """
    annotated = data['entities']
    entities = [
        {'start': ent['start_offset'], 'end': ent['end_offset'], 'label': ent['label']}
        for ent in annotated
    ]

    # Records without relation annotations (missing key or null) simply yield
    # an empty relation list instead of failing.
    relation_records = data.get('relations') or []

    # Index entities by id once rather than rescanning the list per relation.
    # setdefault keeps the first entity for a duplicated id, matching a
    # first-match linear search.
    entity_by_id = {}
    if relation_records:
        for ent in annotated:
            entity_by_id.setdefault(ent['id'], ent)

    relations = []
    for rel in relation_records:
        source = entity_by_id.get(rel['from_id'])
        target = entity_by_id.get(rel['to_id'])
        if source is None or target is None:
            # A bare StopIteration from next() would be cryptic here, and would
            # silently end iteration if this ran inside a generator.
            raise ValueError(
                f"Relation {rel['from_id']!r} -> {rel['to_id']!r} references "
                f"an entity id that is not in the record"
            )
        relations.append({
            'start': source['start_offset'],
            'end': target['end_offset'],
            'label': rel['type'],
        })

    return {
        'text': data['text'],
        'ents': entities,
        'relations': relations
    }

# Convert only the first record; the displaCy cell further down reads `spacy_data`.
spacy_data = convert_data_to_spacy_format(data[0])

print(spacy_data)

{'text': 'Ambas mamas son densas y heterogéneas.\nMicrocalcificaciones aisladas.\nNódulo periareolar derecho bien delimitadp de 10mm.\nNódulo calcifcado derecho.\nNo observo microcalcificaciones sospechosas agrupadas ni imágenes espiculadas.\nRegiones axilares sin adenopatías.\nImpresión: Mamas densas y nódulo derecho presuntamente benigno.\nSugiero ecografía mamaria.\nBI-RADS 3 ACR C', 'ents': [{'start': 0, 'end': 37, 'label': 'DENS'}, {'start': 39, 'end': 59, 'label': 'HALL_presente'}, {'start': 60, 'end': 68, 'label': 'CARACT'}, {'start': 70, 'end': 76, 'label': 'HALL_presente'}, {'start': 122, 'end': 128, 'label': 'HALL_presente'}, {'start': 160, 'end': 180, 'label': 'HALL_ausente'}, {'start': 289, 'end': 295, 'label': 'HALL_presente'}, {'start': 77, 'end': 88, 'label': 'REG'}, {'start': 89, 'end': 96, 'label': 'LAT'}, {'start': 97, 'end': 112, 'label': 'CARACT'}, {'start': 116, 'end': 120, 'label': 'CARACT'}, {'start': 129, 'end': 139, 'label': 'CARACT'}, {'start': 140, 'end': 147

In [8]:
# A blank Spanish pipeline is enough here: only tokenization is needed.
nlp = spacy.blank("es")

# Create Doc object
doc = nlp(spacy_data['text'])

# Map each character-offset annotation onto a token-aligned Span.
# char_span returns None when the offsets do not land on token boundaries, so
# each span is built exactly once and only the usable ones are kept.
candidate_spans = (
    doc.char_span(ent['start'], ent['end'], label=ent['label'])
    for ent in spacy_data['ents']
)
ents = [span for span in candidate_spans if span]

doc.ents = ents

# Render the visualization, highlighting each label that is present (listed
# once) with its own gradient.
options = {
    "ents": sorted({ent.label_ for ent in ents}),
    "colors": {
        "HALL_presente": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
        "DENS": "linear-gradient(90deg, #9cff9d, #4aff4a)",
        "HALL_ausente": "linear-gradient(90deg, #fffc9c, #ffec4a)",
        "CARACT": "linear-gradient(90deg, #9cfcfc, #4affff)",
        "REG": "linear-gradient(90deg, #ff9c9c, #ff4a4a)",
        "LAT": "linear-gradient(90deg, #fc9ccf, #fc4a7a)",
        "GANGLIOS": "linear-gradient(90deg, #fca9c9, #f84b74)",
    },
}
displacy.render(doc, style='ent', options=options, jupyter=True)

### Hacer particiones de train, val y test

In [34]:
# make partitions in train, val and test

import random

# Seed the global generator, then shuffle. random.seed() returns None, so the
# previous `random.shuffle(data, random=random.seed(42))` merely passed
# random=None; that keyword was deprecated in Python 3.9 and removed in 3.11,
# where the call raises TypeError. Seeding first and shuffling afterwards
# produces the same permutation on interpreters that accepted the old call.
# NOTE: the shuffle is in place, so re-running this cell without reloading
# `data` yields a different split.
random.seed(42)
random.shuffle(data)

# 70 % train / 15 % validation; the test split takes the remainder so that
# integer truncation never drops a record.
train_size = int(len(data) * 0.7)
val_size = int(len(data) * 0.15)
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size+val_size:]

# Sanity check: the three slices partition `data` exactly.
assert len(test_data) == test_size
assert len(train_data) + len(val_data) + len(test_data) == len(data)

print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 317
Val size: 68
Test size: 69


In [36]:
# Save the data in jsonl format

def save_data(file_path, data):
    """Write records to a JSON Lines file, one JSON document per line.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of JSON-serializable records.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If a record is not JSON-serializable.
    """
    # ensure_ascii=False emits accented characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the locale default (e.g. cp1252 on
    # Windows) would produce a non-UTF-8 file or fail on unencodable text.
    with open(file_path, 'w', encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Persist each partition to its own JSONL file, in train / val / test order.
for split_path, split_records in (
    ('train.jsonl', train_data),
    ('val.jsonl', val_data),
    ('test.jsonl', test_data),
):
    save_data(split_path, split_records)