# **Deep Learning Method**

Group 7:
- Martina Carretta
- Meritxell Carvajal
- Mariona Pla
- Ares Sellart

In [None]:
# Install the notebook's dependencies (medspaCy, spaCy, NLTK) with pip's quiet flag.
!pip install --quiet medspacy
!pip install --quiet spacy nltk
# Download the large Spanish spaCy pipeline; stdout and stderr are discarded.
!python -m spacy download es_core_news_lg > /dev/null 2>&1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/243.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m184.3/243.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.8/243.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.4/67.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.9/69.9 kB[0m [31m8

In [None]:
#!pip install --quiet spacy
import spacy

In [None]:
import json
import nltk

import os

import medspacy
from spacy.tokens import Token

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import classification_report

# Import the data

In [None]:
!git clone https://github.com/Martinacarretta/NLP_Med

repo_dir = '/content/NLP_Med'
if not os.path.exists(repo_dir):
    !git clone https://github.com/Martinacarretta/NLP_Med {repo_dir}

# Load data
json_path = os.path.join(repo_dir, 'negacio_train_v2024.json')

with open(json_path, 'r') as json_file:
    json_object = json.load(json_file)

Cloning into 'NLP_Med'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 17 (delta 5), reused 16 (delta 4), pack-reused 0[K
Receiving objects: 100% (17/17), 838.99 KiB | 2.49 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [None]:
def convert_char_to_token(tokens, text, char_indices):
    """Map a character span onto a half-open range of token indices.

    Args:
        tokens: Sequence of token-like objects exposing ``idx``, the character
            offset at which the token starts (the callers pass a spaCy ``Doc``).
        text: Source text. Unused; kept so existing call sites keep working.
        char_indices: ``(start_char, end_char)`` character offsets of the span.

    Returns:
        ``(token_start, token_end)`` where each value is the index of the first
        token whose start offset is at or after the corresponding character
        offset. When no such token exists (e.g. the span runs to the end of
        the text) ``len(tokens)`` is used, so ``labels[token_start:token_end]``
        still covers the trailing tokens instead of raising ``StopIteration``.
    """
    start_char_index, end_char_index = char_indices
    n_tokens = len(tokens)
    token_start_index = next(
        (i for i, token in enumerate(tokens) if token.idx >= start_char_index),
        n_tokens,
    )
    token_end_index = next(
        (i for i, token in enumerate(tokens) if token.idx >= end_char_index),
        n_tokens,
    )
    return token_start_index, token_end_index

In [None]:
nlp_es = spacy.load('es_core_news_lg')

# Integer class written for each annotation label; tokens outside every
# annotated span keep class 0.
LABEL_TO_CLASS = {"NEG": 1, "UNC": 2, "NSCO": 3, "USCO": 4}

X = []  # per document: list of lower-cased token strings
y = []  # per document: list of per-token class labels

for entry in json_object:
    text = entry.get('data')['text']
    doc = nlp_es(text)

    # Per-token gold labels, initialised to class 0.
    true = np.zeros(len(doc), dtype=int)

    for prediction in entry.get('predictions', []):
        for label_data in prediction['result']:
            label_value = label_data['value']
            start_index = label_value['start']
            end_index = label_value['end']

            # Spans are written in file order, so a later overlapping span
            # overwrites the classes set by an earlier one.
            for label in label_value['labels']:
                if label in LABEL_TO_CLASS:
                    # Translate the character span into token indices.
                    start, end = convert_char_to_token(doc, text, (start_index, end_index))
                    true[start:end] = LABEL_TO_CLASS[label]

    # A Doc is not a plain list; the embedding lookup needs token strings.
    # Lower-casing keeps the vocabulary consistent.
    X.append([token.text.lower() for token in doc])
    y.append(list(true))

# Labels are stored as strings ('0'..'4').
y = [[str(element) for element in sequence] for sequence in y]

In [None]:
# Every distinct token seen in the corpus, plus a placeholder entry used for
# words that were never seen (e.g. words that only occur in the test set).
vocab = {token for tokens in X for token in tokens}
vocab |= {'<UNK>'}

In [None]:
# Sort before numbering: iterating a set of strings directly can yield a
# different order on every interpreter run (string hash randomisation), which
# would make the word -> index mapping, and therefore the embedding rows,
# irreproducible across kernel restarts.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

In [None]:
# Hold out 20% of the documents for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model: LSTM

## Build the model

In this part of the process we introduce an LSTM-based neural network model, LSTMTagger. In summary, its architecture encompasses an embedding layer, an LSTM layer, and a linear layer, enabling the model to process input sequences, capture sequential dependencies, and map them to tag scores effectively.

We thought that an LSTM model could be a great choice for our project for several reasons. Firstly, LSTMs are supposed to excel in sequential modeling, crucial for understanding linguistic structures where word context is paramount. Secondly, their memory cells enable them to retain information over long sequences and they can handle varying sequence lengths.

In [None]:
class LSTMTagger(nn.Module):
    """Per-token tagger: embedding lookup -> LSTM -> linear projection.

    Args:
        embedding_dim: Size of each word-embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Token index -> dense vector.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        # Consumes the embeddings and emits one hidden state per position.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        # Projects every hidden state onto the tag space.
        self.hidden2tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence):
        """Score one sentence.

        Args:
            sentence: Tensor of token indices for a single sentence.

        Returns:
            Log-probabilities with one row per position of ``sentence`` and
            one column per tag (log-softmax over the tag dimension).
        """
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (seq_len, 1, features): the sentence is fed as a batch of one.
        hidden_states, _ = self.lstm(embedded.view(seq_len, 1, -1))
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

# Hyperparameters of the tagger.
input_dim = len(vocab) # Number of unique words + '<UNK>' index
embedding_dim = 100
hidden_dim = 128
output_dim = 5  # Number of tags
# NOTE(review): n_layers is never passed to LSTMTagger, whose nn.LSTM is built
# without a layer count — confirm whether a 2-layer LSTM was intended.
n_layers = 2

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Argument order follows LSTMTagger.__init__: (embedding_dim, hidden_dim, vocab_size, tagset_size).
model = LSTMTagger(embedding_dim, hidden_dim, input_dim, output_dim).to(device)

#### `input_dim = len(vocab)`
- **Description:** It's the number of unique words in the initial data set, plus an `<UNK>` token in case there are unseen words in the test set.

#### `embedding_dim = 100`
- **Increasing:** (e.g., 200) Better word representation, higher memory/computation cost.
- **Decreasing:** (e.g., 50) Faster training, lower memory usage, less detailed word representation.

#### `hidden_dim = 128`:
Number of features in LSTM hidden state.
- **Increasing:** (e.g., 256) Captures complex patterns, higher computational load.
- **Decreasing:** (e.g., 64) Simpler, faster, but might miss complex patterns.

#### `output_dim = 5`
- **Description:** Number of output tags/classes.
- **Note:** Fixed by task requirements.


Given that the complexity and size of the dataset are not very high, opting for medium values for the parameters can indeed provide a balanced model that performs well without overfitting or being computationally expensive.

## Train

In [None]:
# Convert sequences to PyTorch tensors
def prepare_sequence(seq, to_ix):
    """Encode a token sequence as a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of token strings.
        to_ix: Mapping from token to integer index. It must contain the key
            ``'<UNK>'`` whenever ``seq`` may hold tokens missing from the
            mapping.

    Returns:
        Tensor with one index per token; tokens absent from ``to_ix`` are
        encoded as ``to_ix['<UNK>']``.

    Raises:
        KeyError: If an unknown token is met and ``to_ix`` has no ``'<UNK>'``.
    """
    # Membership is checked against the mapping that was passed in, not against
    # a module-level vocabulary, so the function depends only on its arguments.
    idxs = [to_ix[word] if word in to_ix else to_ix['<UNK>'] for word in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [None]:
# Negative log-likelihood loss (the model already emits log-softmax scores)
# optimised with Adam.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    # One optimisation step per training document.
    for doc_tokens, doc_tags in zip(X_train, y_train):
        model.zero_grad()

        sentence_in = prepare_sequence(doc_tokens, word_to_ix).to(device)
        targets = torch.tensor([int(tag) for tag in doc_tags], dtype=torch.long).to(device)

        loss = loss_function(model(sentence_in), targets)
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Report the mean per-document loss for this epoch.
    print(f'Epoch {epoch + 1}, Loss: {epoch_loss / len(X_train)}')

Epoch 1, Loss: 0.290883031751722
Epoch 2, Loss: 0.119965618288172
Epoch 3, Loss: 0.09234215576302608
Epoch 4, Loss: 0.07170884104518982
Epoch 5, Loss: 0.05615185164809777
Epoch 6, Loss: 0.04477982281184219
Epoch 7, Loss: 0.0372947342349721
Epoch 8, Loss: 0.0321215303268357
Epoch 9, Loss: 0.0272743701315462
Epoch 10, Loss: 0.024968351073462167


Setting the number of epochs to 10 strikes a balance between training efficiency, model convergence, and computational constraints. It allows for multiple training iterations while ensuring timely experimentation and resource utilization.

Training for 15 epochs lowers the loss on the training set further, but it slightly reduces the model's performance on the validation and test sets. A higher number of epochs can lead to overfitting, which is why 10 is a better choice.

## Validation

In [None]:
def predict_tags(model, docs, word_to_ix):
    """Tag every document with the model's highest-scoring class per token.

    Args:
        model: Tagger that is called on an index tensor and returns one row of
            tag scores per token.
        docs: Iterable of documents, each a sequence of token strings.
        word_to_ix: Token -> index mapping forwarded to ``prepare_sequence``.

    Returns:
        One list per document holding the predicted tag of each token as a
        string.
    """
    model.eval()
    predictions = []
    with torch.no_grad():  # inference only, no gradients needed
        for doc in docs:
            inputs = prepare_sequence(doc, word_to_ix).to(device)
            best_tags = torch.max(model(inputs), 1)[1]
            predictions.append([str(tag) for tag in best_tags.tolist()])
    return predictions

In [None]:
# Token-level accuracy over the whole validation split.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for val_tokens, val_tags in zip(X_val, y_val):
        inputs = prepare_sequence(val_tokens, word_to_ix).to(device)
        targets = torch.tensor([int(tag) for tag in val_tags], dtype=torch.long).to(device)
        predicted = torch.max(model(inputs), 1)[1]
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Accuracy on validation set: {accuracy}')

Accuracy on validation set: 0.9745001726850608


In [None]:
# Predicted tag strings for every validation document.
predictions = predict_tags(model, X_val, word_to_ix)

In [None]:
def create_report(num_docs, true, prediction1):
    """Build a per-document, per-class classification report.

    Args:
        num_docs: Number of leading documents of ``true`` / ``prediction1`` to
            report on.
        true: Per-document sequences of gold tags (``'0'``-``'4'`` or 0-4).
        prediction1: Per-document sequences of predicted tags, same convention.

    Returns:
        DataFrame with one row per (document, class) and the columns
        ``Document``, ``Class``, ``precision``, ``recall``, ``f1-score``,
        ``support`` and ``Prediction`` (number of tokens predicted as that
        class). Precision, recall and F1 are NaN where the class has no gold
        tokens in the document. An empty frame with these columns is returned
        when ``num_docs`` is 0.
    """
    class_labels = ['0', '1', '2', '3', '4']
    metric_cols = ['precision', 'recall', 'f1-score']

    dfs = []  # one DataFrame per document
    for n in range(num_docs):
        # Tags arrive as strings; normalise both sides so the labels, the gold
        # tags and the predictions share one type. Comparing against integer
        # labels made every 'Prediction' count 0.
        doc_true = [str(tag) for tag in true[n]]
        doc_pred = [str(tag) for tag in prediction1[n]]

        report1 = classification_report(
            doc_true,
            doc_pred,
            labels=class_labels,
            output_dict=True,
            zero_division=0,  # same values as the default, without the warnings
        )

        # Keep only the per-class rows (drops accuracy / micro / macro / weighted averages).
        per_class = {label: report1[label] for label in class_labels}

        df_report1 = pd.DataFrame(per_class).transpose()
        df_report1['Prediction'] = [doc_pred.count(label) for label in class_labels]
        df_report1['Document'] = n + 1  # 1-based document number
        dfs.append(df_report1)

    if not dfs:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['Document', 'Class'] + metric_cols + ['support', 'Prediction'])

    # Stack all documents and expose the class label as a regular column.
    classification_reports_df = (
        pd.concat(dfs)
        .reset_index()
        .rename(columns={'index': 'Class'})
    )

    # The metrics are undefined for classes with no gold tokens in the document.
    no_support = classification_reports_df['support'] == 0
    classification_reports_df.loc[no_support, metric_cols] = np.nan

    classification_reports_df[metric_cols] = classification_reports_df[metric_cols].round(2)
    classification_reports_df[['support']] = classification_reports_df[['support']].astype(int)

    # Move 'Document' and 'Class' to the front.
    cols = classification_reports_df.columns.tolist()
    cols = ['Document', 'Class'] + [col for col in cols if col not in ['Document', 'Class']]
    return classification_reports_df[cols]


In [None]:
# Per-document, per-class metrics for the validation split.
report = create_report(len(X_val), y_val, predictions)

In [None]:
# Average each metric over the validation documents, separately per class.
# create_report stores NaN for classes absent from a document.
val_metric = report.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Validation set average metrics")
val_metric

Validation set average metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.99,0.99,0.99
1,0.95,0.94,0.94
2,0.83,0.69,0.73
3,0.83,0.85,0.83
4,0.66,0.54,0.54


## Test

In [None]:
# Load the test documents. This rebinds `json_path` and `json_object`, which
# held the training data until now.
json_path = os.path.join(repo_dir, 'negacio_test_v2024.json')

# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(json_path, 'r', encoding='utf-8') as json_file:
    json_object = json.load(json_file)

In [None]:
# Integer class written for each annotation label; tokens outside every
# annotated span keep class 0.
LABEL_TO_CLASS = {"NEG": 1, "UNC": 2, "NSCO": 3, "USCO": 4}

X = []  # per document: list of lower-cased token strings
y = []  # per document: list of per-token class labels

for entry in json_object:
    text = entry.get('data')['text']
    doc = nlp_es(text)

    # Per-token gold labels, initialised to class 0.
    true = np.zeros(len(doc), dtype=int)

    for prediction in entry.get('predictions', []):
        for label_data in prediction['result']:
            label_value = label_data['value']
            start_index = label_value['start']
            end_index = label_value['end']

            # Spans are written in file order, so a later overlapping span
            # overwrites the classes set by an earlier one.
            for label in label_value['labels']:
                if label in LABEL_TO_CLASS:
                    # Translate the character span into token indices.
                    start, end = convert_char_to_token(doc, text, (start_index, end_index))
                    true[start:end] = LABEL_TO_CLASS[label]

    # A Doc is not a plain list; the embedding lookup needs token strings.
    # Lower-casing keeps the vocabulary consistent.
    X.append([token.text.lower() for token in doc])
    y.append(list(true))

# Labels are stored as strings ('0'..'4').
y = [[str(element) for element in sequence] for sequence in y]


In [None]:
# X now holds the tokenised test documents built in the previous cell.
X_test = X

In [None]:
# y now holds the per-token gold tags of the test documents.
y_test = y

In [None]:
# Predicted tag strings for every test document.
predictions_test = predict_tags(model, X_test, word_to_ix)

In [None]:
# Per-document, per-class metrics for the test split.
report_test = create_report(len(X_test), y_test, predictions_test)

In [None]:
# Average each metric over the test documents, separately per class.
# create_report stores NaN for classes absent from a document.
test_metric = report_test.groupby('Class')[['precision', 'recall', 'f1-score']].mean().round(2)
print("Test set average metrics")
test_metric

Test set average metrics


Unnamed: 0_level_0,precision,recall,f1-score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.99,0.99,0.99
1,0.95,0.96,0.95
2,0.88,0.68,0.75
3,0.84,0.87,0.84
4,0.83,0.55,0.63


## Results visualisation

In [None]:
def add_tags(tokens, predictions, line_length=80):
    """Render tokens as text, colouring each one according to its tag.

    Args:
        tokens: Sequence of tokens; each one is converted with ``str()``.
        predictions: Per-token tags aligned with ``tokens``. ``'1'``-``'4'``
            (or the integers 1-4) select a colour; any other value leaves the
            token uncoloured.
        line_length: Approximate character budget per output line.

    Returns:
        The text with ANSI colour escapes around tagged tokens and newlines
        inserted once the line budget is used up. No separator is written
        after the last token.
    """
    tag_colors = {
        '1': '\033[91m',        # NEG  - light red
        '2': '\033[94m',        # UNC  - light blue
        '3': '\033[92m',        # NSCO - light green
        '4': '\033[38;5;208m',  # USCO - orange
    }
    reset = '\033[0m'
    no_space_before = ",.?!:;*"

    tagged_text = ""
    line_length_remaining = line_length
    last_index = len(tokens) - 1

    for i in range(len(tokens)):
        word = str(tokens[i])
        color = tag_colors.get(str(predictions[i]))
        n_tags = 0 if color is None else 1

        # Break the line first if the (possibly coloured) word would not fit.
        if len(word) + n_tags * 9 + 1 > line_length_remaining:
            tagged_text += '\n'
            line_length_remaining = line_length

        # Separator after this token: none before punctuation and none after
        # the last token. Deciding it on every iteration avoids reusing a
        # stale value, or an unbound one for a single-token input.
        if i < last_index and str(tokens[i + 1]) not in no_space_before:
            space = " "
        else:
            space = ""

        if color is None:
            tagged_text += word + space
        else:
            tagged_text += color + word + reset + space

        # Charge the word (plus an allowance for the escape codes) to the budget.
        line_length_remaining -= len(word) + n_tags * 7 + 1

        # Break after the word when a word of this size would no longer fit.
        if len(word) + n_tags > line_length_remaining:
            tagged_text += '\n'
            line_length_remaining = line_length

    return tagged_text

# Print the legend
def print_legend():
    """Print the colour key for the NEG / UNC / NSCO / USCO highlighting."""
    legend_lines = (
        "Color legend:",
        "\033[91mNEG\033[0m ",
        "\033[94mUNC\033[0m ",
        "\033[92mNSCO\033[0m ",
        "\033[38;5;208mUSCO\033[0m \n",
    )
    for line in legend_lines:
        print(line)


In [None]:
import random

# Pick a random test document. randrange excludes its upper bound, whereas
# randint(0, len(y_test)) could return len(y_test) and index past the end.
n = random.randrange(len(y_test))
# To inspect one specific document, assign its index instead, e.g. n = 45.

text = json_object[n].get('data')['text']
tokens = nlp_es(text)

# Print out the legend
print_legend()

print('\033[1mDetected neg/unc/nsco/usco for document ' + str(n) + '\033[0m')

# Print out the entire text with added tags
print(add_tags(tokens, predictions_test[n], line_length=140))

Color legend:
[91mNEG[0m 
[94mUNC[0m 
[92mNSCO[0m 
[38;5;208mUSCO[0m 

[1mDetected neg/unc/nsco/usco for document 45[0m
  nº historia clinica:******** nºepisodi:******** sexe: home data de naixement: 19.02.1957 edat: 61 anys procedencia 
cex mateix hosp servei urologia data d'ingres 20.04.2018 data d'alta 24.04.2018 12:00:00 ates per**************,******
*;************,***** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para 
nefrectomia parcial derecha laparoscopica asistida por robot. antecedents [91mno[0m [92malergias[0m [92mmedicamentosas[0m 
[92mconocidas[0m. fumador 6 cigarrillos al dia. [91mno[0m [92mantecedentes[0m [92mquirurgicos[0m. proces actual paciente que a raiz de 
tumefaccion en falange de dedo indice de mano derecha se realiza estudio con tc que detecta tumoracion renal derecha de 34 mm 
[94msugestiva[0m [94mde[0m [38;5;208mmalignidad[0m[38;5;208m.[0m se realiza biospia de lesion de falange pend

In [None]:
# Render document `n` again, this time coloured with the gold tags from
# y_test, for comparison with the predicted tags.
text = json_object[n].get('data')['text']
tokens = nlp_es(text)

# Print out the legend
print_legend()

print('\033[1mTrue neg/unc/nsco/usco for document ' + str(n) + '\033[0m')

# Print out the entire text with added tags
print(add_tags(tokens, y_test[n], line_length=140))

Color legend:
[91mNEG[0m 
[94mUNC[0m 
[92mNSCO[0m 
[38;5;208mUSCO[0m 

[1mTrue neg/unc/nsco/usco for document 45[0m
  nº historia clinica:******** nºepisodi:******** sexe: home data de naixement: 19.02.1957 edat: 61 anys procedencia 
cex mateix hosp servei urologia data d'ingres 20.04.2018 data d'alta 24.04.2018 12:00:00 ates per**************,******
*;************,***** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para 
nefrectomia parcial derecha laparoscopica asistida por robot. antecedents [91mno[0m [92malergias[0m [92mmedicamentosas[0m 
[92mconocidas[0m. fumador 6 cigarrillos al dia. [91mno[0m [92mantecedentes[0m [92mquirurgicos[0m. proces actual paciente que a raiz de 
tumefaccion en falange de dedo indice de mano derecha se realiza estudio con tc que detecta tumoracion renal derecha de 34 mm 
[94msugestiva[0m [94mde[0m [38;5;208mmalignidad[0m. se realiza biospia de lesion de falange pendiente de informe de