# **Machine Learning Method**

Group 7:
- Martina Carretta
- Meritxell Carvajal
- Mariona Pla
- Ares Sellart

In [None]:
!pip install --quiet medspacy
!pip install --quiet spacy nltk
#!python -m spacy download ca_core_news_sm > /dev/null 2>&1
#!python -m spacy download es_core_news_sm > /dev/null 2>&1
#!python -m spacy download es_core_news_md > /dev/null 2>&1
!python3 -m spacy download es_core_news_lg > /dev/null 2>&1

`--quiet` (equivalent to `-q`) suppresses the verbose output of the installation process.

In [None]:
import json
import numpy as np
import pandas as pd
import re
import string
import random
import spacy
import medspacy
from spacy.tokens import Token
from spacy.lang.ca.examples import sentences
import nltk

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report

!pip install sklearn_crfsuite
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import make_scorer
from sklearn_crfsuite import CRF


# Import the data

In [None]:
#!git clone https://github.com/Martinacarretta/githubTest.git
# Read the annotated training corpus. The context manager closes the file
# handle as soon as its contents have been read (the original left it open).
with open('/home/martina/codi2/NLP - Med/negacio_train_v2024.json') as jsons:
    json_string = jsons.read()
json_object = json.loads(json_string)  # parsed JSON, iterated entry by entry further down


# Data pre-processing



---

In summary, while correcting misspelled words can be beneficial for improving the performance of ML models, it may not always be necessary or feasible, especially in domains like medical text where specialized vocabulary and limited resources can pose challenges. It's essential to weigh the potential benefits against the practical considerations and available resources before deciding on the preprocessing approach.

Furthermore, the text data in this domain often includes a mixture of Spanish and Catalan languages, which complicates the use of standard spell checkers and part-of-speech taggers. However, considering that providing additional linguistic information beyond the word itself might enhance the model's performance, a possible approach could involve tokenizing and tagging the text in either Spanish or Catalan.

It's worth noting that for out-of-vocabulary words, SpaCy will still attempt to process them. However, since these words are not recognized in its vocabulary, SpaCy may assign them default part-of-speech tags (e.g., "X") and dependency labels (e.g., "ROOT"), which may impact the accuracy of downstream NLP tasks. Therefore, careful consideration of language-specific preprocessing steps and the incorporation of domain-specific linguistic resources are crucial for developing robust models in such multilingual and specialized domains.

After opting to utilize SpaCy's part-of-speech tagger for Spanish, the documents will undergo tokenization using the same library. SpaCy offers robust support for Spanish language processing, providing accurate and reliable tokenization and part-of-speech tagging capabilities. By leveraging SpaCy for both tokenization and part-of-speech tagging, we ensure consistency in the preprocessing pipeline, which is essential for maintaining coherence and reliability in downstream NLP tasks.

In [None]:
def convert_token_to_char(text, token_indexes, nlp=None): #to store the character index from tokens in text
    """Convert an inclusive token span into an inclusive character span.

    Args:
        text: Raw document text.
        token_indexes: Pair ``(first_token, last_token)`` of token positions,
            both inclusive.
        nlp: Optional callable that tokenizes ``text`` into objects exposing
            ``.text`` and ``.idx``. When omitted, the ``es_core_news_lg``
            spaCy pipeline is loaded (the original behaviour); passing an
            already loaded pipeline avoids reloading the model on every call.

    Returns:
        ``(start, end)``: the character offset where the first token begins
        and the offset of the last character of the last token.

    Raises:
        IndexError: if a token position lies outside the tokenized text.
    """
    if nlp is None:
        nlp = spacy.load('es_core_news_lg')
    text_t = nlp(text)  # Tokenized text
    start_t, end_t = token_indexes[0], token_indexes[1]

    # (surface form, character offset) of every token
    tokens = [(token.text, token.idx) for token in text_t]

    start = tokens[start_t][1]
    # Offset of the token start plus its length is one past its last character...
    end = tokens[end_t][1] + len(tokens[end_t][0])

    # ...so subtract one to make the end offset inclusive.
    return (start, end - 1)

In [None]:
def convert_char_to_token(tokenized_text, text, char_indexes):
    """Map a character span to the tokens that contain its two end points.

    NOTE: another function with this same name is defined in a later cell of
    the notebook; once that cell runs it replaces this implementation.

    Args:
        tokenized_text: Iterable of tokens exposing ``.idx`` (character offset)
            and ``.text``.
        text: The raw text the tokens were produced from.
        char_indexes: Pair ``(start_index, end_index)`` of character offsets.

    Returns:
        ``(start_token_index, end_token_index)``. The end index is the token
        containing ``end_index`` or, when no token contains it,
        ``start_token_index + 1``.

    Raises:
        TypeError: if neither offset falls inside any token (the fallback then
            tries to add 1 to ``None``).
    """
    start_index, end_index = char_indexes
    # A span that runs to the very end of the text has end_index == len(text);
    # only look at text[end_index] when that character actually exists.
    if end_index < len(text) and ((text[end_index] == " ") or (text[end_index] in string.punctuation)):  # Check for punctuation
        end_index -= 1
    start_token_index, end_token_index = None, None

    for i, token in enumerate(tokenized_text):
        token_stop = token.idx + len(token.text)  # one past the token's last character
        if token.idx <= start_index < token_stop:
            start_token_index = i
        if token.idx <= end_index < token_stop:
            end_token_index = i

    # If the end_token_index is None, set it to the start_token_index + 1
    if end_token_index is None:
        end_token_index = start_token_index + 1

    return start_token_index, end_token_index

In [None]:
def convert_char_to_token(tokens, text, char_indices):
    """Convert a character span into a token slice ``[start, end)``.

    Each returned index is the position of the first token whose character
    offset (``token.idx``) is greater than or equal to the corresponding
    character offset. When no such token exists (for instance the span ends
    inside or after the last token) the index defaults to the number of tokens,
    so ``vector[start:end]`` still reaches the end of the document instead of
    the lookup raising ``StopIteration``.

    Args:
        tokens: Sized iterable of tokens exposing ``.idx``.
        text: Unused; kept so existing call sites keep working.
        char_indices: Pair ``(start_char_index, end_char_index)``.

    Returns:
        ``(token_start_index, token_end_index)``.
    """
    start_char_index, end_char_index = char_indices
    token_offsets = [token.idx for token in tokens]
    n_tokens = len(token_offsets)

    def first_token_at_or_after(char_index):
        # Position of the first token starting at/after char_index, else n_tokens.
        return next(
            (i for i, offset in enumerate(token_offsets) if offset >= char_index),
            n_tokens,
        )

    token_start_index = first_token_at_or_after(start_char_index)
    token_end_index = first_token_at_or_after(end_char_index)
    return token_start_index, token_end_index

# Training

## Extraction of true and creation of lists

Variable names:

* X = list with token and POS
* list with nested list for doc (vec_n arrays)
    * y_neg
    * y_unc
    * y_nsco
    * y_usco



In [None]:
nlp_es = spacy.load('es_core_news_lg') #Outside the for loop as it can be used as the same variable for each entry

X = []  # Feature vectors: one list per document, one {'word', 'POS'} dict per token
y_neg = []  # Per-document label sequences (as strings), one list per tag
y_unc = []
y_nsco = []
y_usco = []
y_neg_nsco = []  # joint labels: '2' = negation cue, '1' = negation scope, '0' = other
y_unc_usco = []  # joint labels: '2' = uncertainty cue, '1' = uncertainty scope, '0' = other

for entry in json_object:
    text = entry.get('data')['text']
    doc = nlp_es(text)
    n_tokens = len(doc)

    # Create vectors of true labels (one position per token, 0 = not tagged)
    vec_n = np.zeros(n_tokens, dtype=int)
    vec_u = np.zeros(n_tokens, dtype=int)
    vec_n_sco = np.zeros(n_tokens, dtype=int)
    vec_u_sco = np.zeros(n_tokens, dtype=int)
    vec_neg_nsco = np.zeros(n_tokens, dtype=int)
    vec_unc_usco = np.zeros(n_tokens, dtype=int)

    for prediction in entry.get('predictions', []):
        for label_data in prediction['result']:
            label_value = label_data['value']
            labels = label_value['labels']
            start_index = label_value['start']
            end_index = label_value['end']

            for label in labels:
                if label not in ("NEG", "UNC", "NSCO", "USCO"):
                    continue  # other labels are ignored, and their span is never converted

                # Character offsets of the annotation -> token slice [start, end)
                start, end = convert_char_to_token(doc, text, (start_index, end_index))
                if label == "NEG":
                    vec_n[start:end] = 1
                    vec_neg_nsco[start:end] = 2
                elif label == "UNC":
                    vec_u[start:end] = 1
                    vec_unc_usco[start:end] = 2
                elif label == "NSCO":
                    vec_n_sco[start:end] = 1
                    vec_neg_nsco[start:end] = 1
                else:  # "USCO"
                    vec_u_sco[start:end] = 1
                    vec_unc_usco[start:end] = 1
                # In the joint vectors a later annotation overwrites an earlier
                # one wherever cue and scope spans overlap.

    tokens_list = [token.text.lower() for token in doc] # Doc has object type, to work with word embeddings, we need a list of tokens. The lower() is to ensure consistency

    # Generate the feature vectors: lower-cased surface form plus spaCy POS tag
    list_for_dictionaries = [
        {'word': token, 'POS': doc[i].pos_} for i, token in enumerate(tokens_list)
    ]
    X.append(list_for_dictionaries) # Here we should have a list for each doc with nested dictionaries for each word

    # The model needs strings, so the integer labels are stringified on append
    y_neg.append([str(tag) for tag in vec_n])
    y_unc.append([str(tag) for tag in vec_u])
    y_nsco.append([str(tag) for tag in vec_n_sco])
    y_usco.append([str(tag) for tag in vec_u_sco])
    y_neg_nsco.append([str(tag) for tag in vec_neg_nsco])
    y_unc_usco.append([str(tag) for tag in vec_unc_usco])

In [None]:
def count_ones(binary_vector):
    """Return how many elements of ``binary_vector`` compare equal to 1."""
    return sum(1 for value in binary_vector if value == 1)

In [None]:
# Split the data into training and validation sets

# The first n documents are used for training; the remainder is held out for
# validation. All slices use n so the split size is changed in one place.
n = 200
X_train = X[:n]

y_train_neg = y_neg[:n]
y_train_unc = y_unc[:n]
y_train_nsco = y_nsco[:n]
y_train_usco = y_usco[:n]
y_train_neg_nsco = y_neg_nsco[:n]
y_train_unc_usco = y_unc_usco[:n]

X_val = X[n:]

y_val_neg = y_neg[n:]
y_val_unc = y_unc[n:]
y_val_nsco = y_nsco[n:]
y_val_usco = y_usco[n:]
y_val_neg_nsco = y_neg_nsco[n:]
y_val_unc_usco = y_unc_usco[n:]

num_docs = len(X_val)  # number of validation documents, reused by the report cells
print("Num of training docs:", len(X_train))
print("Num of validation docs:", num_docs)

Num of training docs: 200
Num of validation docs: 54


## Training and validation set evaluation

In [None]:
def create_report(num_docs, true, prediction1, prediction2):
    """Build a per-document comparison table for two models.

    For each of the first ``num_docs`` documents a scikit-learn classification
    report restricted to classes 0 and 1 is computed for both prediction sets.
    The metrics of the second model get a ``2`` suffix and are placed next to
    those of the first, separated by a ``|`` column.

    NOTE: a one-prediction function with this same name is defined in a later
    cell; once that cell has run it replaces this version, so the notebook
    must be executed in order.

    Args:
        num_docs: Number of documents to report on.
        true: Per-document lists of integer gold labels.
        prediction1: Per-document lists of integer predictions of model 1
            (must be lists: ``list.count`` is used).
        prediction2: Same, for model 2.

    Returns:
        DataFrame with one row per (document, class) and the columns
        ``Document, Class, precision, recall, f1-score, support, Prediction,
        |, precision2, recall2, f1-score2, support2, Prediction2``.
        Metrics of a class with zero support are NaN.
    """
    summary_keys = ('accuracy', 'macro avg', 'micro avg', 'weighted avg')
    metric_cols = ['precision', 'recall', 'f1-score']
    metric_cols2 = ['precision2', 'recall2', 'f1-score2']

    dfs = [] #list to store DataFrames for each document
    for n in range(num_docs):
        # zero_division=0 yields the same values as the default ('warn')
        # but does not emit a warning for every document lacking a class.
        report1 = classification_report(true[n], prediction1[n], labels=[0, 1],
                                        output_dict=True, zero_division=0)
        report2 = classification_report(true[n], prediction2[n], labels=[0, 1],
                                        output_dict=True, zero_division=0)

        # Keep only the per-class rows: drop accuracy and the averaged rows
        for report in (report1, report2):
            for key in summary_keys:
                report.pop(key, None)

        df_report1 = pd.DataFrame(report1).transpose()
        df_report1['Prediction'] = [prediction1[n].count(i) for i in [0, 1]]  # tokens predicted per class

        df_report2 = pd.DataFrame(report2).transpose()
        df_report2.columns = [f'{col}2' for col in df_report2.columns]  # Rename columns for report2
        df_report2['Prediction2'] = [prediction2[n].count(i) for i in [0, 1]]

        df_report = pd.concat([df_report1, df_report2], axis=1)  # side by side
        df_report['Document'] = n + 1  # 1-based document number

        dfs.append(df_report)

    # Stack all documents; the former index (class label) becomes the 'Class' column
    classification_reports_df = (
        pd.concat(dfs)
        .reset_index()
        .rename(columns={'index': 'Class'})
    )

    # Replace precision, recall, and F1-score with NaN if support is zero (since their measures can't be computed)
    classification_reports_df.loc[classification_reports_df['support'] == 0, metric_cols] = np.nan
    classification_reports_df.loc[classification_reports_df['support2'] == 0, metric_cols2] = np.nan

    all_metric_cols = metric_cols + metric_cols2
    classification_reports_df[all_metric_cols] = classification_reports_df[all_metric_cols].round(2)
    classification_reports_df[['support', 'support2']] = classification_reports_df[['support', 'support2']].astype(int)

    # Move 'Document' and 'Class' to the front
    cols = classification_reports_df.columns.tolist()
    cols = ['Document', 'Class'] + [col for col in cols if col not in ['Document', 'Class']]
    classification_reports_df = classification_reports_df[cols]

    # Add separator column between the two models' columns
    classification_reports_df.insert(loc=classification_reports_df.columns.get_loc('Prediction')+1, column='|', value='|')
    return classification_reports_df


Two Conditional Random Field (CRF) models will be trained for each tag. The final selection among these models will be based on their performance on the validation set, ensuring robustness and generalization to unseen data.

While Grid Search is a widely used technique for hyperparameter tuning, it's important to note that the CRF model presents limitations in its compatibility with this method. Specifically, the 'keep_tempfiles' parameter, which controls whether temporary files are retained after training, poses a challenge for traditional Grid Search implementations.

### Negation detection

In [None]:
# Two CRF taggers for negation cues that differ only in regularisation
# strength (c1 / c2): strong (5, 5) versus weak (0.1, 0.1).
crf_neg = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=5,
    c2=5,
    max_iterations=1000,
    all_possible_transitions=True
)

crf2_neg = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_neg, crf2_neg):
    try:
        crf_model.fit(X_train, y_train_neg)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
# Gold labels were stringified for the CRF; cast them back to ints for the report.
y_val_neg_int = [list(map(int, gold_seq)) for gold_seq in y_val_neg]

# Predict the validation documents with both models and cast the string tags to ints.
y_pred_neg_model1 = [list(map(int, pred_seq)) for pred_seq in crf_neg.predict(X_val)]
y_pred_neg_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_neg.predict(X_val)]

In [None]:
# Per-document report comparing both negation-cue models on the validation set.
report_val_neg = create_report (num_docs, y_val_neg_int, y_pred_neg_model1, y_pred_neg_model2)

In [None]:
# Model 1 (unsuffixed columns): mean of the per-document scores for each class.
val_neg_metrics = report_val_neg.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 1: Average Metrics")
val_neg_metrics

Model 1: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.997778,1.0,0.999444
1,0.98463,0.802778,0.878889


In [None]:
# Model 2 (columns suffixed with "2"): mean of the per-document scores for each class.
val_neg_metrics2 = report_val_neg.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 2: Average Metrics")
val_neg_metrics2

Model 2: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.999259,1.0,0.999815
1,0.971481,0.911852,0.938148


### Uncertainty detection

In [None]:
# Two CRF taggers for uncertainty cues that differ only in regularisation
# strength (c1 / c2): strong (10, 10) versus weak (0.1, 0.1).
crf_unc = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=10,
    max_iterations=1000,
    all_possible_transitions=True
)

crf2_unc = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_unc, crf2_unc):
    try:
        crf_model.fit(X_train, y_train_unc)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
# Gold labels were stringified for the CRF; cast them back to ints for the report.
y_val_unc_int = [list(map(int, gold_seq)) for gold_seq in y_val_unc]

# Predict the validation documents with both models and cast the string tags to ints.
y_pred_unc_model1 = [list(map(int, pred_seq)) for pred_seq in crf_unc.predict(X_val)]
y_pred_unc_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_unc.predict(X_val)]

In [None]:
# Per-document report comparing both uncertainty-cue models on the validation set.
report_val_unc = create_report(num_docs, y_val_unc_int, y_pred_unc_model1, y_pred_unc_model2)

In [None]:
# Model 1 (unsuffixed columns): mean of the per-document scores for each class.
val_unc_metrics = report_val_unc.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 1: Average Metrics")
val_unc_metrics

Model 1: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.998519,1.0,0.99963
1,0.0,0.0,0.0


In [None]:
# Model 2 (columns suffixed with "2"): mean of the per-document scores for each class.
val_unc_metrics2 = report_val_unc.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 2: Average Metrics")
val_unc_metrics2

Model 2: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,1.0,1.0
1,0.723636,0.728182,0.706667


### Negation scope detection

In [None]:
# Two CRF taggers for negation scopes that differ only in regularisation
# strength (c1 / c2): strong (10, 10) versus weak (0.1, 0.1).
crf_nsco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=10,
    max_iterations=1000,
    all_possible_transitions=True
)

crf2_nsco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_nsco, crf2_nsco):
    try:
        crf_model.fit(X_train, y_train_nsco)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
# Gold labels were stringified for the CRF; cast them back to ints for the report.
y_val_nsco_int = [list(map(int, gold_seq)) for gold_seq in y_val_nsco]

# Predict the validation documents with both models and cast the string tags to ints.
y_pred_nsco_model1 = [list(map(int, pred_seq)) for pred_seq in crf_nsco.predict(X_val)]
y_pred_nsco_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_nsco.predict(X_val)]

In [None]:
# Per-document report comparing both negation-scope models on the validation set.
report_val_nsco = create_report (num_docs, y_val_nsco_int, y_pred_nsco_model1, y_pred_nsco_model2)

In [None]:
# Model 1 (unsuffixed columns): mean of the per-document scores for each class.
val_nsco_metrics = report_val_nsco.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 1: Average Metrics")
val_nsco_metrics

Model 1: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.950926,0.999444,0.974815
1,0.485741,0.089074,0.142407


In [None]:
# Model 2 (columns suffixed with "2"): mean of the per-document scores for each class.
val_nsco_metric2 = report_val_nsco.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 2: Average Metrics")
val_nsco_metric2

Model 2: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.969444,0.992222,0.980741
1,0.773704,0.487037,0.570741




---



Given the suboptimal performance of both models, an alternative approach will be taken to enhance the obtained results.

Determining a negation scope poses a considerable challenge, as it lacks a consistent, universal characteristic. Rather, negation scopes are defined by their adherence to a negation cue. Consequently, the following model is trained to recognize and comprehend both negation cues and scopes. This dual focus allows the model to capture the nature of negation scopes, thereby improving its performance.

To ensure accurate metric calculations, words identified as negation cues are subsequently converted to '0'.



---



In [None]:
# Two CRF taggers trained on the joint negation labels (cue and scope in one
# sequence); they differ only in regularisation strength (c1 / c2).
crf_neg_nsco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=10,
    max_iterations=1000,
    all_possible_transitions=True
)

crf_neg_nsco2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_neg_nsco, crf_neg_nsco2):
    try:
        crf_model.fit(X_train, y_train_neg_nsco)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
def _scope_only(sequences):
    """Cast joint cue/scope tags to ints and map the cue tag (2) to 0."""
    return [[0 if int(tag) == 2 else int(tag) for tag in sequence] for sequence in sequences]

# The joint models emit '2' for cues and '1' for scope tokens. Only the scope is
# evaluated here, so cue tags are set to 0 in both the gold labels and the
# predictions, exactly as is done for the test set. The original cell skipped
# this step although the accompanying text describes it.
y_val_neg_nsco_int = _scope_only(y_val_neg_nsco)

y_pred_neg_nsco_model1 = _scope_only(crf_neg_nsco.predict(X_val))
y_pred_neg_nsco_model2 = _scope_only(crf_neg_nsco2.predict(X_val))


In [None]:
# Per-document report for the two jointly trained (cue + scope) negation models.
report_val_neg_nsco = create_report (num_docs, y_val_neg_nsco_int, y_pred_neg_nsco_model1, y_pred_neg_nsco_model2)

In [None]:
# First joint model (reported as "Model 3"): mean of the per-document scores for each class.
val_neg_nsco_metrics = report_val_neg_nsco.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 3: Average Metrics")
val_neg_nsco_metrics

Model 3: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.967407,0.996852,0.981667
1,0.878889,0.553148,0.657222


In [None]:
# Second joint model (reported as "Model 4", columns suffixed with "2"): mean per class.
val_neg_nsco_metrics2 = report_val_neg_nsco.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 4: Average Metrics")
val_neg_nsco_metrics2

Model 4: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.977593,0.995926,0.986852
1,0.898333,0.68463,0.754259


As the results show, both models outperform those trained only on negation scopes. The latest one improves overall performance by roughly 0.15 points, which is a considerable gain.

The obtained results suggest that the best model is the latest one, which will be applied to the unseen data of the test set.

### Uncertainty scope detection

In [None]:
# Two CRF taggers for uncertainty scopes that differ only in regularisation
# strength (c1 / c2): (10, 10) versus (1, 0.1).
crf_usco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=10,
    max_iterations=1000,
    all_possible_transitions=True
)

crf2_usco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_usco, crf2_usco):
    try:
        crf_model.fit(X_train, y_train_usco)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
# Gold labels were stringified for the CRF; cast them back to ints for the report.
y_val_usco_int = [list(map(int, gold_seq)) for gold_seq in y_val_usco]

# Predict the validation documents with both models and cast the string tags to ints.
y_pred_usco_model1 = [list(map(int, pred_seq)) for pred_seq in crf_usco.predict(X_val)]
y_pred_usco_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_usco.predict(X_val)]


In [None]:
# For many documents, the prediction is 0,
# this is to check if there is any document where the prediction isn't 0
# Iterate over however many validation documents were predicted instead of a
# hard-coded 54, so the check keeps working if the split size changes.
for i in range(len(y_pred_usco_model1)):
    if count_ones(y_pred_usco_model1[i]) > 0:
        print("MODEL 1")
    if count_ones(y_pred_usco_model2[i]) > 0:
        print("MODEL 2", i)

In [None]:
# Per-document report comparing both uncertainty-scope models on the validation set.
report_val_usco = create_report (num_docs, y_val_usco_int, y_pred_usco_model1, y_pred_usco_model2)

In [None]:
# Model 1 (unsuffixed columns): mean of the per-document scores for each class.
val_usco_metrics = report_val_usco.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 1: Average Metrics")
val_usco_metrics

Model 1: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.994444,1.0,0.997593
1,0.0,0.0,0.0


In [None]:
# Model 2 (columns suffixed with "2"): mean of the per-document scores for each class.
val_usco_metric2 = report_val_usco.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 2: Average Metrics")
val_usco_metric2

Model 2: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.994444,1.0,0.997593
1,0.0,0.0,0.0


---
Just like with negation scopes, uncertainty scopes pose a similar challenge. They don't play by any strict rules or show clear characteristics. But what ties them all together is their proximity to an uncertainty cue.

The methodology applied will mirror that of negation scopes. The forthcoming models will be trained using vectors encompassing both uncertainty cues and scopes, designated for their respective tasks.

After the prediction, those words labelled as uncertainty cues will be set to 0 for metric calculation.

---

In [None]:
# Two CRF taggers trained on the joint uncertainty labels (cue and scope in one
# sequence); they differ only in regularisation strength (c1 / c2).
crf_unc_usco = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=10,
    max_iterations=1000,
    all_possible_transitions=True
)

crf_unc_usco2 = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=1,
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)

# Each model is fitted inside its own try block: with a single shared block an
# AttributeError raised by the first fit would silently skip the second one.
for crf_model in (crf_unc_usco, crf_unc_usco2):
    try:
        crf_model.fit(X_train, y_train_unc_usco)
    except AttributeError:
        # NOTE(review): deliberately ignored; presumably the sklearn_crfsuite
        # incompatibility around `keep_tempfiles` mentioned in the notes - confirm.
        pass

In [None]:
def _scope_only(sequences):
    """Cast joint cue/scope tags to ints and map the cue tag (2) to 0."""
    return [[0 if int(tag) == 2 else int(tag) for tag in sequence] for sequence in sequences]

# The joint models emit '2' for cues and '1' for scope tokens. Only the scope is
# evaluated here, so cue tags are set to 0 in both the gold labels and the
# predictions, exactly as is done for the test set. The original cell skipped
# this step although the accompanying text describes it.
y_val_unc_usco_int = _scope_only(y_val_unc_usco)

y_pred_unc_usco_model1 = _scope_only(crf_unc_usco.predict(X_val))
y_pred_unc_usco_model2 = _scope_only(crf_unc_usco2.predict(X_val))


In [None]:
# Per-document report for the two jointly trained (cue + scope) uncertainty models.
report_val_unc_usco = create_report (num_docs, y_val_unc_usco_int, y_pred_unc_usco_model1, y_pred_unc_usco_model2)

In [None]:
# First joint model (reported as "Model 3"): mean of the per-document scores for each class.
val_unc_usco_metrics = report_val_unc_usco.groupby('Class')[['precision', 'recall', 'f1-score']].mean()
print("Model 3: Average Metrics")
val_unc_usco_metrics

Model 3: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.991852,1.0,0.996296
1,0.0,0.0,0.0


In [None]:
# Second joint model (reported as "Model 4", columns suffixed with "2"): mean per class.
val_unc_usco_metric2 = report_val_unc_usco.groupby('Class')[['precision2', 'recall2', 'f1-score2']].mean()
print("Model 4: Average Metrics")
val_unc_usco_metric2

Model 4: Average Metrics


Unnamed: 0_level_0,precision2,recall2,f1-score2
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.994259,0.999444,0.997407
1,0.538788,0.331515,0.365758


The fourth model is exhibiting superior performance compared to the others, with a substantial disparity in the metrics. The relatively poorer performance of the remaining models can be attributed to either the methodology employed or the parameters utilized.

Consequently, the most recent model will be deployed for making predictions on unseen data.

# Test

In [None]:
# Read and parse the annotated test corpus. The context manager closes the file
# handle, and `test` is bound only once, to the parsed JSON object.
with open('/content/githubTest/negacio_test_v2024.json') as test_file:
    test = json.load(test_file)

In [None]:
# Number of documents in the test set (len() replaces the manual counting loop).
instances = len(test)

print('The test dataset contains',instances,'instances of medical documents')

The test dataset contains 64 instances of medical documents


## True labels extraction

In [None]:
# NOTE: X and the y_* lists are rebound here; from this cell on they hold the
# TEST documents, no longer the training corpus.
X = []  # Feature vectors: one list per document, one {'word', 'POS'} dict per token
y_neg = []  # Per-document label sequences (as strings), one list per tag
y_unc = []
y_nsco = []
y_usco = []

for entry in test:
    text = entry.get('data')['text']
    doc = nlp_es(text)
    tokens_list = [token.text.lower() for token in doc]
    n_tokens = len(doc)

    # Create vectors of true labels (one position per token, 0 = not tagged)
    vec_n = np.zeros(n_tokens, dtype=int)
    vec_u = np.zeros(n_tokens, dtype=int)
    vec_n_sco = np.zeros(n_tokens, dtype=int)
    vec_u_sco = np.zeros(n_tokens, dtype=int)

    for prediction in entry.get('predictions', []):
        for label_data in prediction['result']:
            label_value = label_data['value']
            labels = label_value['labels']
            start_index = label_value['start']
            end_index = label_value['end']

            for label in labels:
                if label not in ("NEG", "UNC", "NSCO", "USCO"):
                    continue  # other labels are ignored, and their span is never converted

                # Character offsets of the annotation -> token slice [start, end)
                start, end = convert_char_to_token(doc, text, (start_index, end_index))
                if label == "NEG":
                    vec_n[start:end] = 1
                elif label == "UNC":
                    vec_u[start:end] = 1
                elif label == "NSCO":
                    vec_n_sco[start:end] = 1
                else:  # "USCO"
                    vec_u_sco[start:end] = 1

    # Generate the feature vectors: lower-cased surface form plus spaCy POS tag
    list_for_dictionaries = [
        {'word': token, 'POS': doc[i].pos_} for i, token in enumerate(tokens_list)
    ]
    X.append(list_for_dictionaries) # Here we should have a list for each doc with nested dictionaries for each word

    # The model needs strings, so the integer labels are stringified on append
    y_neg.append([str(tag) for tag in vec_n])
    y_unc.append([str(tag) for tag in vec_u])
    y_nsco.append([str(tag) for tag in vec_n_sco])
    y_usco.append([str(tag) for tag in vec_u_sco])


In [None]:
def create_report(num_docs, true, prediction1):
    """Build a per-document classification table for a single model.

    For each of the first ``num_docs`` documents a scikit-learn classification
    report restricted to classes 0 and 1 is computed and the per-class rows
    are stacked into one DataFrame.

    NOTE: this definition replaces the earlier two-prediction function of the
    same name, so the validation cells cannot be re-run after this cell
    without re-running that earlier definition first.

    Args:
        num_docs: Number of documents to report on.
        true: Per-document lists of integer gold labels.
        prediction1: Per-document lists of integer predictions (must be lists:
            ``list.count`` is used).

    Returns:
        DataFrame with one row per (document, class) and the columns
        ``Document, Class, precision, recall, f1-score, support, Prediction``.
        Metrics of a class with zero support are NaN.
    """
    summary_keys = ('accuracy', 'macro avg', 'micro avg', 'weighted avg')
    metric_cols = ['precision', 'recall', 'f1-score']

    dfs = [] #list to store DataFrames for each document
    for n in range(num_docs):
        # zero_division=0 yields the same values as the default ('warn')
        # but does not emit a warning for every document lacking a class.
        report1 = classification_report(true[n], prediction1[n], labels=[0, 1],
                                        output_dict=True, zero_division=0)

        # Keep only the per-class rows: drop accuracy and the averaged rows
        for key in summary_keys:
            report1.pop(key, None)

        df_report1 = pd.DataFrame(report1).transpose()
        df_report1['Prediction'] = [prediction1[n].count(i) for i in [0, 1]]  # tokens predicted per class
        df_report1['Document'] = n + 1  # 1-based document number

        dfs.append(df_report1)

    # Stack all documents; the former index (class label) becomes the 'Class' column
    classification_reports_df = (
        pd.concat(dfs)
        .reset_index()
        .rename(columns={'index': 'Class'})
    )

    # Replace precision, recall, and F1-score with NaN if support is zero (since their measures can't be computed)
    classification_reports_df.loc[classification_reports_df['support'] == 0, metric_cols] = np.nan

    classification_reports_df[metric_cols] = classification_reports_df[metric_cols].round(2)
    classification_reports_df[['support']] = classification_reports_df[['support']].astype(int)

    # Move 'Document' and 'Class' to the front
    cols = classification_reports_df.columns.tolist()
    cols = ['Document', 'Class'] + [col for col in cols if col not in ['Document', 'Class']]
    classification_reports_df = classification_reports_df[cols]

    return classification_reports_df


In [None]:
# X was rebuilt from the test documents in the extraction cell above; alias it for clarity.
X_test = X

Only the best models are applied to the unseen data of the test set. Those models correspond to *model 2* for NEG, *model 2* for UNC, *model 4* for NSCO and *model 4* for USCO.

## Negation detection

In [None]:
# Gold test labels were stringified for the CRF; cast them back to ints for the report.
y_true_neg_int = [list(map(int, gold_seq)) for gold_seq in y_neg]

# Predict negation cues on the test documents and cast the string tags to ints.
y_pred_neg_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_neg.predict(X_test)]

In [None]:
# Per-document report of the selected negation-cue model on the test set.
report_test_neg = create_report (instances, y_true_neg_int, y_pred_neg_model2)

In [None]:
# Mean of the per-document scores for each class, rounded to two decimals.
test_neg_metric = report_test_neg.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Model 2: Average Metrics")
test_neg_metric

Model 2: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,1.0,1.0
1,0.95,0.93,0.94


## Uncertainty detection

In [None]:
# Gold test labels were stringified for the CRF; cast them back to ints for the report.
y_true_unc_int = [list(map(int, gold_seq)) for gold_seq in y_unc]

# Predict uncertainty cues on the test documents and cast the string tags to ints.
y_pred_unc_model2 = [list(map(int, pred_seq)) for pred_seq in crf2_unc.predict(X_test)]

In [None]:
# Per-document report of the selected uncertainty-cue model on the test set.
report_test_unc = create_report (instances, y_true_unc_int, y_pred_unc_model2)

In [None]:
# Mean of the per-document scores for each class, rounded to two decimals.
test_unc_metric = report_test_unc.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Model 2: Average Metrics")
test_unc_metric

Model 2: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,1.0,1.0
1,0.85,0.74,0.75


## Negation scope detection

In [None]:
# Gold test labels were stringified for the CRF; cast them back to ints for the report.
y_true_nsco_int = [list(map(int, gold_seq)) for gold_seq in y_nsco]

# The joint model tags cues as 2 and scope tokens as 1. Cast to int and map the
# cue tag to 0 so that only the scope is compared against the gold scope labels.
y_pred_nsco_model4 = [
    [0 if int(tag) == 2 else int(tag) for tag in pred_seq]
    for pred_seq in crf_neg_nsco2.predict(X_test)
]


In [None]:
# Per-document report of the selected negation-scope model on the test set.
report_test_nsco4 = create_report (instances, y_true_nsco_int, y_pred_nsco_model4)

In [None]:
# Mean of the per-document scores for each class, rounded to two decimals.
test_nsco_metric4 = report_test_nsco4.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Model 4: Average Metrics")
test_nsco_metric4

Model 4: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.99,1.0,0.99
1,0.91,0.71,0.78


## Uncertainty scope detection

In [None]:
# Gold test labels were stringified for the CRF; cast them back to ints for the report.
y_true_usco_int = [list(map(int, gold_seq)) for gold_seq in y_usco]

# The joint model tags cues as 2 and scope tokens as 1. Cast to int and map the
# cue tag to 0 so that only the scope is compared against the gold scope labels.
y_pred_usco_model4 = [
    [0 if int(tag) == 2 else int(tag) for tag in pred_seq]
    for pred_seq in crf_unc_usco2.predict(X_test)
]


In [None]:
# Per-document report of the selected uncertainty-scope model on the test set.
report_test_usco4 = create_report (instances, y_true_usco_int, y_pred_usco_model4)

In [None]:
# Mean of the per-document scores for each class, rounded to two decimals.
test_usco_metric4 = report_test_usco4.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Model 4: Average Metrics")
test_usco_metric4

Model 4: Average Metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,1.0,1.0
1,0.65,0.35,0.41


The lower performance in uncertainty scope detection is likely due to the limited number of examples in the training set. The low recall performance indicates the model struggles to accurately predict these instances. However, the precision rate shows that 65% of the predictions are correct.

Additionally, the poor results may stem from the difficulty in accurately determining the range of the scope.

## Visualisation

- **y_true_neg_int**: true neg
  - **y_pred_neg_model2**: predicted negation cues model 2

- **y_true_unc_int**: true unc
  - **y_pred_unc_model2**: predicted uncertainty cues model 2

- **y_true_nsco_int**: true nsco
  - **y_pred_nsco_model4**: predicted nsco model 4 (trained with both negation cues and scopes)

- **y_true_usco_int**: true usco
  - **y_pred_usco_model4**: predicted usco model 4 (trained with both uncertainty cues and scopes)

---
The following steps aim to present the results more clearly. Using a different color for each tag makes it easier to understand the model's actions and subjectively evaluate the results obtained.




In [None]:
def add_tags(tokens, NEG, UNC, NSCO, USCO, line_length=80):
    """Render a tokenised document as ANSI-coloured, line-wrapped text.

    Parameters
    ----------
    tokens : sequence
        Tokens of one document. The sequence must support ``len()`` and
        integer indexing; every token is converted with ``str()``.
    NEG, UNC, NSCO, USCO : sequence
        Per-token labels, indexed in parallel with ``tokens``. A token carries
        a tag when its label equals ``1``.
    line_length : int, default 80
        Maximum number of visible characters per output line (a trailing
        space at the end of a line is not counted).

    Returns
    -------
    str
        The document text with colour codes and newline characters inserted.

    Notes
    -----
    * Each token is written exactly once. When several tags apply, the first
      one in the order NEG, UNC, NSCO, USCO decides the colour, so a cue
      colour takes precedence over a scope colour.
    * No space is written after the last token, nor before a token that is a
      substring of ``",.?!:;*"``.
    * ANSI escape sequences take no room on screen, so only the token text
      and its separating space are counted against ``line_length``.
    * A token longer than ``line_length`` is kept whole on its own line.
    """
    reset = '\033[0m'
    # (labels, colour) pairs in priority order: cues first, then scopes.
    palette = (
        (NEG, '\033[91m'),          # light red
        (UNC, '\033[94m'),          # light blue
        (NSCO, '\033[92m'),         # light green
        (USCO, '\033[38;5;208m'),   # orange (256-colour code)
    )
    no_space_before = ",.?!:;*"

    pieces = []
    remaining = line_length
    last_index = len(tokens) - 1

    for i in range(len(tokens)):
        word = str(tokens[i])

        # Every label list is indexed for every token, so a list that is
        # shorter than `tokens` fails loudly instead of being ignored.
        active = [colour for labels, colour in palette if labels[i] == 1]

        # The separator depends on the *next* token; the last token gets none.
        if i < last_index and str(tokens[i + 1]) not in no_space_before:
            space = " "
        else:
            space = ""

        # Break before the token when it no longer fits, unless the current
        # line is still empty (breaking would only add a blank line).
        if len(word) > remaining and remaining < line_length:
            pieces.append('\n')
            remaining = line_length

        if active:
            pieces.append(active[0] + word + reset + space)
        else:
            pieces.append(word + space)

        # Only visible characters consume line width.
        remaining -= len(word) + len(space)

    return "".join(pieces)

# Print the legend
def print_legend():
    """Print the colour key (NEG, UNC, NSCO, USCO) for the tagged text output."""
    reset = "\033[0m"
    swatches = (
        ("\033[91m", "NEG"),
        ("\033[94m", "UNC"),
        ("\033[92m", "NSCO"),
        ("\033[38;5;208m", "USCO"),
    )

    print("Color legend:")
    for colour, label in swatches:
        print(colour + label + reset + " ")
    # Blank line separating the legend from whatever is printed next.
    print()


In [None]:
# Pick a random document to inspect. `random.randint(a, b)` can return `b`
# itself, so `randint(0, instances)` may yield an index one past the end;
# `randrange` excludes the upper bound.
# NOTE(review): assumes `instances` is the number of test documents -- confirm.
n = random.randrange(instances)
#n = 45

# Predicted binary lists for each tag
NEG = y_pred_neg_model2[n]
UNC = y_pred_unc_model2[n]
NSCO = y_pred_nsco_model4[n]
USCO = y_pred_usco_model4[n]

text = test[n].get('data')['text']
tokens = nlp_es(text)

# Print out the legend
print_legend()

print('\033[1mDetected neg/unc/nsco/usco for document ' + str(n) + '\033[0m')

# Print out the entire text with added tags
print(add_tags(tokens, NEG, UNC, NSCO, USCO, line_length=140))

Color legend:
[91mNEG[0m 
[94mUNC[0m 
[92mNSCO[0m 
[38;5;208mUSCO[0m 

[1mDetected neg/unc/nsco/usco for document 45[0m
  nº historia clinica:******** nºepisodi:******** sexe: home data de naixement: 19.02.1957 edat: 61 anys procedencia 
cex mateix hosp servei urologia data d'ingres 20.04.2018 data d'alta 24.04.2018 12:00:00 ates per**************,******
*;************,***** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para 
nefrectomia parcial derecha laparoscopica asistida por robot. antecedents [91mno[0m [92malergias[0m [92mmedicamentosas[0m 
[92mconocidas[0m. fumador 6 cigarrillos al dia. [91mno[0m antecedentes quirurgicos. proces actual paciente que a raiz de tumefaccion 
en falange de dedo indice de mano derecha se realiza estudio con tc que detecta tumoracion renal derecha de 34 mm [94msugestiva[0m 
[94mde[0m [38;5;208mmalignidad[0m[38;5;208m.[0m se realiza biospia de lesion de falange pendiente de informe d

In [None]:
# Gold-standard binary label lists for document `n` (the index chosen in the
# previous cell), shown with the same colouring for side-by-side comparison.
NEG, UNC, NSCO, USCO = (
    y_true_neg_int[n],
    y_true_unc_int[n],
    y_true_nsco_int[n],
    y_true_usco_int[n],
)

text = test[n].get('data')['text']
tokens = nlp_es(text)

# Legend, bold header, then the full tagged text.
print_legend()
print(f'\033[1mTrue neg/unc/nsco/usco for document {n}\033[0m')
print(add_tags(tokens, NEG, UNC, NSCO, USCO, line_length=140))

Color legend:
[91mNEG[0m 
[94mUNC[0m 
[92mNSCO[0m 
[38;5;208mUSCO[0m 

[1mTrue neg/unc/nsco/usco for document 45[0m
  nº historia clinica:******** nºepisodi:******** sexe: home data de naixement: 19.02.1957 edat: 61 anys procedencia 
cex mateix hosp servei urologia data d'ingres 20.04.2018 data d'alta 24.04.2018 12:00:00 ates per**************,******
*;************,***** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para 
nefrectomia parcial derecha laparoscopica asistida por robot. antecedents [91mno[0m [92malergias[0m [92mmedicamentosas[0m 
[92mconocidas[0m. fumador 6 cigarrillos al dia. [91mno[0m [92mantecedentes[0m [92mquirurgicos[0m. proces actual paciente que a raiz de 
tumefaccion en falange de dedo indice de mano derecha se realiza estudio con tc que detecta tumoracion renal derecha de 34 mm 
[94msugestiva[0m [94mde[0m [38;5;208mmalignidad[0m. se realiza biospia de lesion de falange pendiente de informe de