# **Rule-based algorithm using basic text processing tools**

Group 7:
- Martina Carretta
- Meritxell Carvajal
- Mariona Pla
- Ares Sellart

### Import the necessary libraries

In [None]:
import json
import spacy
import nltk

!pip install medspacy
!pip install spacy nltk

import medspacy
from spacy.tokens import Token

import string

import spacy

import pandas as pd

Language model for Spanish provided by the spaCy library

In [None]:
!python3 -m spacy download es_core_news_sm

# Import the data

In [None]:
#!git clone https://github.com/Martinacarretta/githubTest.git
# Read the annotated training corpus. The context manager closes the file
# handle as soon as its content has been read, and the explicit UTF-8 encoding
# keeps non-ASCII characters intact whatever the platform default is.
with open('/home/martina/codi2/NLP - Med/negacio_train_v2024.json', encoding='utf-8') as jsons:
    json_string = jsons.read()
json_object = json.loads(json_string)  # indexed and iterated below as one entry per document


In [None]:
# The number of documents is the length of the loaded collection.
instances = len(json_object)

print(f'The training dataset contains {instances} instances of medical documents')

The training dataset contains 254 instances of medical documents


In [None]:
json_object[0]  # Show the first entry to check the annotation format

{'data': {'cmbd': 'null',
  'id': '19026587',
  'docid': 'null',
  'page': 'null',
  'paragraph': 'null',
  'text': " nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesion

# Train the model

In [None]:
def create_negations(preprocessed_data):
    """Collect the annotated negation and uncertainty cue strings of a corpus.

    Every entry provides its document under ``entry['data']['text']`` and,
    optionally, annotations under ``entry['predictions'][*]['result'][*]['value']``
    with the keys ``labels``, ``start`` and ``end``. For each annotation labelled
    ``"NEG"`` or ``"UNC"`` the substring ``text[start:end]`` is stored, exactly as
    annotated (no trimming is done here).

    Args:
        preprocessed_data: iterable of annotated entries (training set).

    Returns:
        A tuple ``(set_neg, set_unc)`` with the distinct negation and
        uncertainty cue strings. The number of ``NEG`` and ``UNC`` annotations
        found is also printed.
    """
    cue_texts = {"NEG": set(), "UNC": set()}   # distinct cue strings per label
    cue_counts = {"NEG": 0, "UNC": 0}          # number of annotations per label

    for document in preprocessed_data:
        document_text = document.get('data')['text']
        for annotation in document.get('predictions', []):
            for item in annotation['result']:
                span = item['value']
                tags = span['labels']
                begin, finish = span['start'], span['end']
                snippet = document_text[begin:finish]  # annotated text of this span
                for tag in tags:
                    # Labels other than NEG / UNC (e.g. scope labels) are ignored here.
                    if tag in cue_texts:
                        cue_counts[tag] += 1
                        cue_texts[tag].add(snippet)

    print(f"Total negation scopes: {cue_counts['NEG']}")
    print(f"Total uncertainty scopes: {cue_counts['UNC']}")
    return (cue_texts["NEG"], cue_texts["UNC"])

# Build the negation and uncertainty cue sets from the annotated training documents
set_neg, set_unc = create_negations(json_object)

Total negation scopes: 4307
Total uncertainty scopes: 458


In [None]:
# Show the raw negation cues exactly as they were annotated
print(set_neg)

{'se desestimo ', 'desaparicion del ', 'negativo ', 'negativa,', 'suspendido ', 'niega ', 'inestabilidad ', 'inespecifico:', 'negativos', 'asintomatico.', 'ceden ', 'ex', 'asintomatica ', 'ausencia ', 'negativo', 'inespecificos ', 'sense', 'rechaza ', ' afebril', 'asintomatico ', 'impide ', 'nega', 'negatividad de ', 'desorientado ', 'se suspende ', 'desorientacion.', 'ex fumador ', 'tampoco ', 'imposibilidad de ', 'negativa', 'sin', 'negativo.', 'niega', 'inespecificos', 'negativa.', 'desaparicion de ', 'ex-', 'negatiu.', 'negatiu', 'asintomatico', 'se retira ', 'retiro ', 'neg.', 'se retira', 'negativas ', ' no ', 'negativo)', 'negativas', 'cede ', 'indetectable.', 'afebril', 'ex ', 'neg ', 'niegan ', 'negativas.', 'negativos ', 'inespecifico', 'asintomatica', 'negatividad ', 'negatividad del ', 'inespecifico.', 'sin ', 'neg;', 'nega ', ' negativo', 'irregulares', 'retirar ', 'ausencia de ', 'exfumador ', 'incapacidad para ', 'asintomatica,', 'neg,', 'negativos,', 'negativos;', 'ines

Many of the cues contain blank spaces or punctuation marks at their ends.

To solve this problem, all blank spaces and punctuation marks at the ends of each string are removed.

In [None]:
def clean(s):
  """Return the set of strings in ``s`` without surrounding whitespace/punctuation.

  Whitespace and ``string.punctuation`` characters are removed from both ends
  of every string, alternating until nothing more can be removed, so that mixed
  endings such as ``'negativo. '`` or ``' (neg) '`` are fully cleaned. Characters
  inside the string are never touched.

  Args:
      s: iterable of strings (a set or a list).

  Returns:
      A set with the cleaned strings; duplicates produced by the cleaning
      collapse into a single element.
  """
  punctuation = string.punctuation
  s2 = set()
  for word in s:
    cleaned = word.strip()
    while True:
      # A single strip(punctuation).strip() pass stops at the first whitespace
      # it meets, so repeat until the string no longer changes.
      stripped = cleaned.strip(punctuation).strip()
      if stripped == cleaned:
        break
      cleaned = stripped
    s2.add(cleaned)
  return s2

In [None]:
# Normalise both cue sets, then show the cleaned negation cues
set_neg2 = clean(set_neg)
set_unc2 = clean(set_unc)
print(set_neg2)

{'negativas', 'exfumador', 'retiro', 'desorientado', 'indetectable', 'tampoco', 'ausencia de', 'niegan', 'afebril', 'excepto', 'descarta', 'inestabilidad', 'negatividad de', 'descartada', 'rechaza', 'negatividad', 'impide', 'desorientacion', 'negativa', 'sin', 'niega', 'inespecificos', 'cede', 'no', 'incapacidad para', 'negativos', 'ausencia', 'suspendido', 'inespecifico', 'asintomatica', 'arritmicos', 'negatiu', 'negaitvo', 'ex', 'ninguno', 'desaparicion del', 'atipicos', 'negativo', 'asintomatico', 'ceden', 'retirar', 'sense', 'se suspende', 'imposibilidad', 'desaparicion de', 'negatividad del', 'en ninguna', 'se retira', 'desaparecen', 'ex fumador', 'nega', 'neg', 'imposibilidad de', 'irregulares', 'se desestimo', 'falta de'}


Some of the words (such as 'no') were tagged both as negation and as uncertainty cues in the training dataset. They are removed from the uncertainty set so that the same word is not detected twice.

In [None]:
# Cues that also appear as negation cues are dropped from the uncertainty set.
words_to_remove = {"no", "sin", "descartar"}
set_unc2_filtered = set_unc2 - words_to_remove
print(set_unc2_filtered)

{'sospecha de', 'se orienta', 'probable', 'sugiere', 'sugestivas de', 'compatible con', 'no permite descartar', 'ssospechosas de', 'pudieran', 'sugestivos de', 'orienta como', 'aparentemente', 'dudosamente', 'posibles', 'dubtos', 'parece', 'sugestivo de', 'permite descartar', 'posiblemente', 'sospitosa de', 'sin poder descartar', 'falsa', 'no parece', 'probablemente', 'dudosa', 'impresiona', 'aparente', 'orientan', 'sospechosa de', 'sin clara', 'valorar', 'sugestivos con', 'sugiriendo', 'orientan como', 'aparentes', 'sugestiva de', 'clara', 'sugestiva como', 'sugiera de', 'no es posible descartar', 'dudosos', 'impresiona de', 'dudoso', 'sospechosas de', 'atribuida', 'sospechan de', 'podrian', 'sin claras', 'plantea', 'compatible amb', 'podria', 'desconocido', 'indiquen', 'probables', 'se desconoce', 'sugieren', 'posibilidad de', 'orienta', 'interpreta', 'sospecha', 'indeterminado', 'sin aparentes', 'parecen', 'al parecer', 'poco porque', 'puede', 'compatibles con', 'sin aparente', 'vs'

In [None]:
def convert(text, token_indexes, nlp=None):
    """Translate a span given in token indexes into character indexes of ``text``.

    Args:
        text: the document the span belongs to.
        token_indexes: pair ``(first_token, last_token)`` of token positions,
            both inclusive.
        nlp: optional callable that tokenizes ``text``; the tokens it returns
            must expose ``text`` and ``idx`` (character offset). When omitted,
            the spaCy model ``es_core_news_sm`` is used. It is loaded only on
            the first call and then reused, because loading it from disk on
            every call is very slow when many spans are converted.

    Returns:
        A tuple ``(start, end)`` where ``start`` is the offset of the first
        character of the first token and ``end`` is the offset of the last
        character of the last token (inclusive).

    Raises:
        IndexError: if a token index is outside the tokenized text.
    """
    if nlp is None:
        nlp = getattr(convert, "_nlp_es", None)
        if nlp is None:
            nlp = spacy.load('es_core_news_sm')
            convert._nlp_es = nlp  # cache the loaded pipeline on the function

    text_t = nlp(text)  # Tokenized text
    first_token = text_t[token_indexes[0]]
    last_token = text_t[token_indexes[1]]

    start = first_token.idx
    end = last_token.idx + len(last_token.text)  # offset right after the last token

    return (start, end - 1)  # make the end offset inclusive

# Example usage
text = "Hola! món, aixo es una prova, per mirar si funciona"
print(convert(text, (2, 4))) # tokens 2 to 4 are "món, aixo": characters 6 to 14, the end index (14) being inclusive


(6, 14)


In [None]:
def _scope_char_span(doc, cue_index, window=5):
    """Return the inclusive character span of the scope that follows a cue.

    The scope starts at the token right after the cue and reaches ``window``
    tokens beyond that first token. It stops earlier at the first punctuation
    token (which is kept as the last token of the scope) or at the end of the
    document. ``None`` is returned when the cue is the last token of the
    document, since there is nothing left to scope over.
    """
    first = cue_index + 1
    if first >= len(doc):
        return None
    last = min(first + window, len(doc) - 1)
    for j in range(first, last + 1):
        if doc[j].is_punct:  # punctuation closes the scope
            last = j
            break
    closing = doc[last]
    # Offsets are read from the tokens of the already parsed document, so the
    # text does not need to be tokenized again for every scope.
    return (doc[first].idx, closing.idx + len(closing.text) - 1)


def detect_negations(text, lista, llista2, nlp=None):
    """Detect negation and uncertainty cues in ``text`` together with their scopes.

    A token is a negation cue when its lowercase form is in ``lista``;
    otherwise it is an uncertainty cue when its lowercase form is in
    ``llista2`` (negation takes priority for words present in both).
    Matching is done token by token, so multi-word entries of the cue
    collections are never matched by this function.

    Args:
        text: document to analyse.
        lista: collection of negation cue strings.
        llista2: collection of uncertainty cue strings.
        nlp: optional callable that tokenizes ``text``; its tokens must expose
            ``lower_``, ``i``, ``idx``, ``text`` and ``is_punct``. When omitted,
            the spaCy model ``es_core_news_sm`` is loaded on the first call and
            reused afterwards.

    Returns:
        ``(negs, neg_scopes, unc, unc_scopes)``: the cue tokens and, for each
        cue, the ``(start, end)`` character span of its scope with an inclusive
        ``end``. A cue that is the last token of the document has no scope, so
        in that case the scope list is one element shorter than the cue list.
    """
    if nlp is None:
        nlp = getattr(detect_negations, "_nlp_es", None)
        if nlp is None:
            nlp = spacy.load('es_core_news_sm')  #use spanish language model
            detect_negations._nlp_es = nlp  # cache the loaded pipeline on the function
    doc = nlp(text)

    negs, scopes = [], []
    unc, uscopes = [], []

    for token in doc:
        word = token.lower_
        if word in lista:  # Negations
            cues, spans = negs, scopes
        elif word in llista2:  # Uncertainties
            cues, spans = unc, uscopes
        else:
            continue

        cues.append(token)
        span = _scope_char_span(doc, token.i)
        if span is not None:
            spans.append(span)

    return negs, scopes, unc, uscopes

text = "nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesiones malignas pero se objetiva estenosis de uretra . se intentan dilataciones progresivas en el gabinete de urologia sin exito. se solicita estudio de imagen que confirma la existencia de estenosis a nivel d uretra bulbar por lo que se indica uretrtomia interna. exploracio complementaria uretrocistografia retrograda + cums (11/2017): la uretrografia retrograda muestra una uretra anterior con dos estenosis focales a nivel de uretra peneana y bulbar, aunque se observa paso de contraste retrogrado a vejiga. vejiga de correcta capacidad (250 cc de contraste), de paredes trabeculadas y con diverticulos, el mayor de ellos en cara posterolateral izquierda, sin observarse defectos de replecion. la uretrografia miccional muestra una uretra prostatica dilatada, sin claras estenosis focales confirmandose la existencia de las dos estenosis de uretra anterior descritas previamente. moderado residuo postmiccional en vejiga asi como en el interior del diverticulo posterolateral izquierdo descrito. 
uretroscopia (10/2017) falsa via a nivel de uretra peneana, siguiendo la uretra se detecta gran estenosis que no permite el paso de una guia. nhc ** *** *** (********) age-v-uro 1/2 lopd evolucio clinica el 24 de julio de 2018 con el consentimiento informado del paciente y sin contraindicacion preoperatoria se realiza uretrotomia interna sin incidencias. tras el procedimiento el paciente es trasladado a la planta de hospitalizacion siendo portador de lavado vesical continuo. posteriormente se mantiene en buen estado general, afebril, hemodinamicamente estable y con buen control del dolor. aclarado progresivo de la orina con los lavados vesicales continuos, que permiten su retirada, conserva correcta diuresis. tolerancia correcta a dieta oral. dada la buena evolucion se decide alta domiciliaria siendo portador de sonda vesical. orientacio diagnostica n40.0 hiperplasia prostatica benigna sense simptomes en les vies urinaries inferiors procediments 04.81 injeccio en el nervi periferic d'anestesic per a analgesia 58.0 uretrotomia. excisio de septe uretral, uretrostomia perineal, extraccio de calcul uretral per incisio sonda vesical profilaxis antibiotica, antilucerosa y antitrombotica tractament i recomanacions a l'alta -abundante ingesta de liquidos entorno a dos litros y medio de agua al dia. -puede orinar con restos de sangre durante las proximas semanas. -es normal que sienta escozor al orinar y que tenga algun escape de orina y urgencia miccional al retirar la sonda vesical. mantener sonda vesical durante 14 dias (dos semanas). ciprofloxacino 500mg cada 12h durante dos semanas. -paracetamol 1 g cada 8 horas si molestias. -si fiebre mayor de 38ºc, empeoramiento claro del estado general o imposibilidad miccional por obstruccion de sonda vesical o despues de su retirada, consultar con el servicio de urgencias. -control en consultas externas de urologia segun cita en hoja adjunta. destinacio a l'alta: a domicili nhc ** *** *** (********) age-v-uro 2/2 lopd"
# Run the detector on the sample document using the unfiltered uncertainty cues
negs, nsco, unc, usco  = detect_negations(text, set_neg2, set_unc2)
print(negs)  # detected negation cue tokens
print(nsco)  # character spans of the negation scopes
print(unc)  # detected uncertainty cue tokens
print(usco)  # character spans of the uncertainty scopes

[no, negativa, sin, sin, sin, no, sin, sin, afebril, sense, retirar, imposibilidad]
[(451, 466), (871, 909), (1001, 1006), (1542, 1574), (1646, 1697), (1991, 2017), (2156, 2216), (2222, 2233), (2416, 2416), (2783, 2823), (3361, 3377), (3609, 3650)]
[falsa]
[(1907, 1935)]


In [None]:
# Same sample document, now with the filtered uncertainty cues
negs, nsco, unc, usco  = detect_negations(text, set_neg2, set_unc2_filtered)
print(negs)  # detected negation cue tokens
print(nsco)  # character spans of the negation scopes
print(unc)  # detected uncertainty cue tokens
print(usco)  # character spans of the uncertainty scopes

[no, negativa, sin, sin, sin, no, sin, sin, afebril, sense, retirar, imposibilidad]
[(451, 466), (871, 909), (1001, 1006), (1542, 1574), (1646, 1697), (1991, 2017), (2156, 2216), (2222, 2233), (2416, 2416), (2783, 2823), (3361, 3377), (3609, 3650)]
[falsa]
[(1907, 1935)]


# Test

In [None]:
# Load the annotated test corpus. The context manager closes the file once it
# has been parsed, and `test` only ever holds the parsed documents.
with open('/content/githubTest/negacio_test_v2024.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [None]:
# The number of documents is the length of the loaded collection.
instances = len(test)

print(f'The test dataset contains {instances} instances of medical documents')

The test dataset contains 64 instances of medical documents


In [None]:
def _strip_cue_text(cue):
    """Remove surrounding whitespace and punctuation from one annotated cue string."""
    cleaned = cue.strip()
    while True:
        stripped = cleaned.strip(string.punctuation).strip()
        if stripped == cleaned:
            return cleaned
        cleaned = stripped


def store_negations(preprocessed_data, y_true_neg, y_true_nsco, y_true_unc, y_true_usco):
    """Store the annotated (true) cues and scopes of every document.

    For each entry of ``preprocessed_data`` one element is appended to each of
    the four output lists:

    * ``y_true_neg`` / ``y_true_unc``: list with the text of every ``NEG`` /
      ``UNC`` annotation, without surrounding whitespace or punctuation.
      Repeated cues are kept (one element per annotation), so that the
      evaluation can count a cue as many times as it was annotated.
    * ``y_true_nsco`` / ``y_true_usco``: list with the ``(start, end)``
      character indexes of every ``NSCO`` / ``USCO`` annotation, as annotated.

    Args:
        preprocessed_data: iterable of annotated entries (test set).
        y_true_neg, y_true_nsco, y_true_unc, y_true_usco: lists that are
            extended in place.

    Returns:
        The same four lists, in the order they were received.
    """
    for entry in preprocessed_data:
        # Fresh per-document containers
        neg = []
        nsco = []
        unc = []
        usco = []
        text = entry.get('data')['text']
        for prediction in entry.get('predictions', []):
            for label_data in prediction['result']:
                label_value = label_data['value']
                labels = label_value['labels']
                start_index = label_value['start']
                end_index = label_value['end']
                text2 = text[start_index:end_index]  # Extract text based on start and end indexes
                # Add the annotation to the corresponding list based on its labels
                for label in labels:
                    if label == "NEG":
                        neg.append(_strip_cue_text(text2))
                    elif label == "UNC":
                        unc.append(_strip_cue_text(text2))
                    elif label == "NSCO":
                        nsco.append((start_index, end_index))
                    elif label == "USCO":
                        usco.append((start_index, end_index))
        y_true_neg.append(neg)
        y_true_nsco.append(nsco)
        y_true_unc.append(unc)
        y_true_usco.append(usco)
    return(y_true_neg, y_true_nsco, y_true_unc, y_true_usco)

In [None]:
#Look for the true labels in the test set
# True (annotated) cues and scopes of the test set, one element per document
y_true_neg = []
y_true_nsco = []
y_true_unc = []
y_true_usco = []

neg, nsco, unc, usco = store_negations(test, y_true_neg, y_true_nsco, y_true_unc, y_true_usco) # fills the four lists above in place


In [None]:
#Calculate the predicted labels in the test set
# Predicted cues and scopes of the test set, one element per document
y_pred_neg = [] # negation cue tokens detected in each document
y_pred_nsco = [] # character spans of the negation scopes
y_pred_unc = [] # uncertainty cue tokens detected in each document
y_pred_usco = [] # character spans of the uncertainty scopes

# Run the rule-based detector on every test document with the cue sets built from the training data
for entry in test:
  text = entry.get('data')['text']
  negations, nscopes, uncertainties, uscopes = detect_negations(text, set_neg2, set_unc2_filtered)
  y_pred_neg.append(negations)
  y_pred_nsco.append(nscopes)
  y_pred_unc.append(uncertainties)
  y_pred_usco.append(uscopes)

In [None]:
print(y_true_neg[6]) # Compare the true and the predicted negation cues of one document (index 6)
print(y_pred_neg[6])

{'no', 'sin', 'afebril'}
[sin, no, sin, sin, afebril, no]


Since the rule-based method is based on the negation/uncertainty cues from the training set, the algorithm will detect all the instances in the text. However, the subjective tagger may not categorize every instance of the same word as a negation or uncertainty cue.
Moreover, those words which were not tagged as negation/uncertainty cues in the training set won't be considered as such by the model.

In [None]:
# Compare the true and the predicted negation scopes of the same document (index 6)
print(y_true_nsco[6])
print(y_pred_nsco[6])

[(1867, 1884), (2877, 2912), (953, 964), (1201, 1212), (2309, 2320)]
[(953, 964), (1201, 1212), (1867, 1912), (2309, 2320), (2380, 2380), (2877, 2916)]


As evident from the data, the performance of identifying scopes is quite satisfactory. Achieving accuracy in determining the starting position is generally easier compared to determining the end of the scope. This discrepancy arises because the starting point typically follows immediately after the negation or uncertainty cue, whereas the end varies for each instance. The model defines the scope length as 5 tokens or until encountering a punctuation sign.

## Rule-based method performance

In [None]:
def compute_precision_cues(y_true, y_pred):
    """Precision of the predicted cues of one document.

    Every predicted token is matched, through its ``text``, against the true
    cues. A true cue can only be matched once: it is removed from a working
    copy after being matched, so predicting the same word more often than it
    was annotated is penalised (``y_pred=['no', 'no', 'no']`` against
    ``y_true=['no']`` scores 1/3 instead of 1).

    Args:
        y_true: collection of true cue strings (left unmodified).
        y_pred: list of predicted tokens exposing a ``text`` attribute.

    Returns:
        Matched predictions divided by the number of predictions, or 1 when
        nothing was predicted.
    """
    predicted_count = len(y_pred)
    if predicted_count == 0:
        return 1

    unmatched = y_true.copy()  # work on a copy so the caller's data is kept
    hits = 0
    for cue in y_pred:
        if cue.text not in unmatched:
            continue
        unmatched.remove(cue.text)  # each true cue can be consumed only once
        hits += 1

    return hits / predicted_count

def compute_precision_scopes(y_true, y_pred):
  """Precision of the predicted scopes of one document.

  Each predicted scope scores 1 when the same ``(start, end)`` pair is among
  the true scopes, 0.5 when only its start coincides with the start of some
  true scope, and 0 otherwise. The partial credit is given at most once per
  prediction, so the result always lies between 0 and 1.

  Args:
      y_true: list of true ``(start, end)`` scopes.
      y_pred: list of predicted ``(start, end)`` scopes.

  Returns:
      Sum of the scores divided by the number of predictions, or 1 when
      nothing was predicted.
  """
  if len(y_pred) == 0:
      return 1

  true_starts = {real[0] for real in y_true}
  num_correct = 0
  for scope in y_pred:
    if scope in y_true:
      num_correct += 1  # start and end are exactly the same
    elif scope[0] in true_starts:
      num_correct += 0.5  # correct start, wrong end: half the score, counted once
  return num_correct / len(y_pred)

In [None]:
# One row per test document: its index followed by the four precision scores.
data = [
    [
        i,
        compute_precision_cues(y_true_neg[i], y_pred_neg[i]),
        compute_precision_scopes(y_true_nsco[i], y_pred_nsco[i]),
        compute_precision_cues(y_true_unc[i], y_pred_unc[i]),
        compute_precision_scopes(y_true_usco[i], y_pred_usco[i]),
    ]
    for i in range(len(test))
]

columns = ["Entry", "Precision Cues (Neg)", "Precision Scopes (NSco)", "Precision Cues (Unc)", "Precision Scopes (USco)"]

df = pd.DataFrame(data, columns=columns)

print("Dataset:")
df

Dataset:


Unnamed: 0,Entry,Precision Cues (Neg),Precision Scopes (NSco),Precision Cues (Unc),Precision Scopes (USco)
0,0,0.625000,0.500000,0.000000,0.000000
1,1,0.161290,0.677419,0.400000,0.300000
2,2,0.150943,0.707547,0.500000,0.437500
3,3,0.106383,0.617021,0.285714,0.357143
4,4,0.166667,0.690476,0.333333,0.250000
...,...,...,...,...,...
59,59,0.235294,0.411765,0.000000,0.000000
60,60,0.166667,0.416667,1.000000,1.000000
61,61,0.093023,0.406977,0.500000,0.375000
62,62,0.160000,0.600000,0.333333,0.166667


The metrics are calculated for each entry in the dataset and the mean will be later computed to assess the performance of the model based on the whole test set.

In [None]:
# Mean of every precision metric over all test documents (the document index column is excluded)
print("\nColumn-wise averages:")
print(df.drop(columns=['Entry']).mean())


Column-wise averages:
Precision Cues (Neg)       0.329263
Precision Scopes (NSco)    0.560762
Precision Cues (Unc)       0.485758
Precision Scopes (USco)    0.412872
dtype: float64


In [None]:
def compute_sensitivity_cues(y_true, y_pred):
    """Sensitivity (recall) of the predicted cues of one document.

    Predicted tokens are matched, through their ``text``, against the true
    cues; a true cue can only be matched once, as in the precision metric.

    Args:
        y_true: collection of true cue strings (left unmodified).
        y_pred: list of predicted tokens exposing a ``text`` attribute.

    Returns:
        Matched true cues divided by the number of true cues. When there are
        no true cues the division is avoided: the result is 1 if nothing was
        predicted either, and 0 if some cue was predicted. When there are true
        cues but nothing was predicted, the result is 0.
    """
    if len(y_true) == 0:
      # No true cues: avoid the division by 0
      return 1 if len(y_pred) == 0 else 0

    y_true2 = y_true.copy() #Avoid modifying the original list
    num_correct = 0

    for token in y_pred:
        if token.text in y_true2:
            y_true2.remove(token.text)  # each true cue can be matched only once
            num_correct += 1

    sensitivity = num_correct / len(y_true)
    return sensitivity

def compute_sensitivity_scopes(y_true, y_pred):
    """Sensitivity (recall) of the predicted scopes of one document.

    Each true scope scores 1 when the same ``(start, end)`` pair was
    predicted, 0.5 when only its start coincides with the start of some
    predicted scope, and 0 otherwise. Every true scope is credited at most
    once, so the result always lies between 0 and 1.

    Args:
        y_true: list of true ``(start, end)`` scopes.
        y_pred: list of predicted ``(start, end)`` scopes.

    Returns:
        Sum of the scores divided by the number of true scopes. When there are
        no true scopes the division is avoided: the result is 1 if nothing was
        predicted either, and 0 if some scope was predicted. When there are
        true scopes but nothing was predicted, the result is 0.
    """
    if len(y_true) == 0:
      # No true scopes: avoid the division by 0
      return 1 if len(y_pred) == 0 else 0

    predicted_starts = {scope[0] for scope in y_pred}
    num_correct = 0
    for real in y_true:
      if real in y_pred:
        num_correct += 1   # whole score if the exact scope was predicted
      elif real[0] in predicted_starts:
        num_correct += 0.5  # half the score if only the start was predicted
    sensitivity = num_correct / len(y_true)
    return sensitivity

In [None]:
# One row per test document: its index followed by the four sensitivity scores.
data2 = []
for i in range(len(test)):
    data2.append([
        i,
        compute_sensitivity_cues(y_true_neg[i], y_pred_neg[i]),
        compute_sensitivity_scopes(y_true_nsco[i], y_pred_nsco[i]),
        compute_sensitivity_cues(y_true_unc[i], y_pred_unc[i]),
        compute_sensitivity_scopes(y_true_usco[i], y_pred_usco[i])
    ])

columns2 = ["Entry", "Sensitivity Cues (Neg)", "Sensitivity Scopes (NSco)", "Sensitivity Cues (Unc)", "Sensitivity Scopes (USco)"]

df2 = pd.DataFrame(data2, columns=columns2)

print("Dataset:")
df2


Dataset:


Unnamed: 0,Entry,Sensitivity Cues (Neg),Sensitivity Scopes (NSco),senSensitivitysitivity Cues (Unc),Sensitivity Scopes (USco)
0,0,1.000000,0.571429,0.000000,0.000000
1,1,1.000000,0.724138,1.000000,0.750000
2,2,1.000000,0.765306,0.571429,0.437500
3,3,0.714286,0.690476,0.333333,0.277778
4,4,0.875000,0.743590,0.400000,0.250000
...,...,...,...,...,...
59,59,0.666667,0.500000,0.000000,0.000000
60,60,1.000000,0.454545,1.000000,1.000000
61,61,0.666667,0.426829,0.400000,0.300000
62,62,1.000000,0.625000,1.000000,0.500000


The higher sensitivity values compared to precision can be attributed to the model's tendency to tag a higher number of instances as negation or uncertainty cues. This results in more predicted cues than are actually present, leading to an increase in false positives.

Because sensitivity is divided by the number of true tags rather than the number of predicted ones, its values tend to be higher.

In [None]:
# Mean of every sensitivity metric over all test documents (the document index column is excluded)
print("\nColumn-wise averages:")
print(df2.drop(columns=['Entry']).mean())


Column-wise averages:
Sensitivity Cues (Neg)               0.931630
Sensitivity Scopes (NSco)            0.629012
senSensitivitysitivity Cues (Unc)    0.516034
Sensitivity Scopes (USco)            0.429201
dtype: float64
