#**Deep Learning Method**

Group 7:
- Martina Carretta
- Meritxell Carvajal
- Mariona Pla
- Ares Sellart

In [35]:
!pip install --quiet medspacy
!pip install --quiet spacy nltk
#!python -m spacy download ca_core_news_sm > /dev/null 2>&1
#!python -m spacy download es_core_news_sm > /dev/null 2>&1
#!python -m spacy download es_core_news_md > /dev/null 2>&1
!python -m spacy download es_core_news_lg > /dev/null 2>&1

In [36]:
import json
import spacy
import nltk

import medspacy
from spacy.tokens import Token

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.model_selection import train_test_split

# Import the data

In [37]:
!git clone https://github.com/Martinacarretta/githubTest.git
jsons = open('/content/githubTest/negacio_train_v2024.json')
json_string = jsons.read()
json_object = json.loads(json_string)


fatal: destination path 'githubTest' already exists and is not an empty directory.


In [38]:
def convert_char_to_token(tokens, text, char_indices):
    start_char_index, end_char_index = char_indices
    token_start_index = next(i for i, token in enumerate(tokens) if token.idx >= start_char_index)
    token_end_index = next(i for i, token in enumerate(tokens) if token.idx >= end_char_index)
    return token_start_index, token_end_index

In [96]:
nlp_es = spacy.load('es_core_news_lg') #Outside the for loop as it can be used as the same variable for each entry
X = []
y = []

for entry in json_object:
    text = entry.get('data')['text']
    doc = nlp_es(text)

    # Create vectors of true labels
    true = np.zeros(len(doc), dtype=int)

    for prediction in entry.get('predictions', []):
        for label_data in prediction['result']:
          label_value = label_data['value']
          labels = label_value['labels']
          start_index = label_value['start']
          end_index = label_value['end']
          text2 = text[start_index:end_index]  # Extract text based on start and end indexes

          # Add words to corresponding sets based on labels
          for label in labels:
              if label == "NEG":
                  start, end = convert_char_to_token(doc, text, (start_index, end_index)) # get index of negation in token form
                  true[start:end] = 1

              if label == "UNC":
                  start, end = convert_char_to_token(doc, text, (start_index, end_index)) # get index of uncertainty in token form
                  true[start:end] = 2
              if label == "NSCO":
                  start, end = convert_char_to_token(doc, text, (start_index, end_index)) # get index of negation scope in token form
                  true[start:end] = 3
              if label == "USCO":
                  start, end = convert_char_to_token(doc, text, (start_index, end_index)) # get index of uncertainty scope in token form
                  true[start:end] = 4


    tokens_list = [token.text.lower() for token in doc] # Doc has object type, to work with word embeddings, we need a list of tokens. The lower() is to ensure consistency

    # Generate the feature vectors and labels
    list_for_dictionaries = [] # to append every word in a same doc
    for i, token in enumerate(tokens_list):
        x_vec = {'word': token , 'POS': doc[i].pos_} # For every token
        list_for_dictionaries.append(x_vec)

    X.append(list_for_dictionaries) # Here we should have a list for each doc with nested dictionaries for each word
    true = list(true)
    y.append(true)

y = [[str(element) for element in sequence] for sequence in y]


# Padding:

In [127]:
mim_doc_length = min(len(doc) for doc in X)
mim_doc_length

243

In [130]:
total_length = sum(len(doc) for doc in X)
average = total_length / len(X)
average

992.007874015748

In [111]:
max_doc_length = max(len(doc) for doc in X)

padded_X = []
for doc in X:
    padded_doc = doc + [{'word': '<PAD>', 'POS': '<PAD>'}] * (max_doc_length - len(doc))
    padded_X.append(padded_doc)

padded_y = pad_sequences(y, maxlen=max_doc_length, padding='post', value=5)

In [112]:
all_words = set([item['word'] for doc in padded_X for item in doc])
all_pos = set([item['POS'] for doc in padded_X for item in doc])

word_index = {word: i+1 for i, word in enumerate(all_words)}
pos_index = {pos: i+1 for i, pos in enumerate(all_pos)}

# Convert to sequences of indices
X_word_sequences = [[word_index[token['word']] for token in doc] for doc in padded_X]
X_pos_sequences = [[pos_index[token['POS']] for token in doc] for doc in padded_X]

# Combine word and POS sequences if necessary
X_combined_sequences = np.array([np.array([X_word_sequences[i], X_pos_sequences[i]]).T for i in range(len(X_word_sequences))])

In [113]:
X_combined_sequences.shape

(254, 4174, 2)

Reshape and concatenate to fit the input requirements of LSTM

In [114]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Concatenate


word_input = Input(shape=(max_doc_length,), dtype='int32', name='word_input')
pos_input = Input(shape=(max_doc_length,), dtype='int32', name='pos_input')

# EMbedding:
word_embedding = Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_doc_length)(word_input)
pos_embedding = Embedding(input_dim=len(pos_index) + 1, output_dim=32, input_length=max_doc_length)(pos_input)

# Concatenate:
combined = Concatenate()([word_embedding, pos_embedding])
# The resulting tensor has a shape of (batch_size, max_doc_length, 160) (128 + 32), where 160 is the combined dimension of word and POS embeddings.

# Model: LSTM

## Build the model

In [115]:
bi_lstm = Bidirectional(LSTM(units=64, return_sequences=True))(combined)
output = TimeDistributed(Dense(6, activation='softmax'))(bi_lstm) ############ % classes
model = Model(inputs=[word_input, pos_input], outputs=[output])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 word_input (InputLayer)     [(None, 4174)]               0         []                            
                                                                                                  
 pos_input (InputLayer)      [(None, 4174)]               0         []                            
                                                                                                  
 embedding_9 (Embedding)     (None, 4174, 128)            2613888   ['word_input[0][0]']          
                                                                                                  
 embedding_10 (Embedding)    (None, 4174, 32)             640       ['pos_input[0][0]']           
                                                                                            

## Train

In [None]:
X_train_words, X_test_words, X_train_pos, X_test_pos, y_train, y_test = train_test_split(X_word_sequences, X_pos_sequences, padded_y, test_size=0.2, random_state=42)

# Train the model
model.fit([X_train_words, X_train_pos], y_train, batch_size=32, epochs=5, validation_split=0.2)


## Validation

In [117]:
predictions = model.predict([np.array(X_test_words), np.array(X_test_pos)])



In [118]:
predictions

array([[[0.17059693, 0.16356897, 0.16624984, 0.16843835, 0.16487534,
         0.16627055],
        [0.16645446, 0.16411874, 0.16865493, 0.167405  , 0.16696428,
         0.16640252],
        [0.16732156, 0.16473852, 0.16895357, 0.16697778, 0.16605714,
         0.16595136],
        ...,
        [0.16419163, 0.16830443, 0.16175117, 0.172498  , 0.1703677 ,
         0.16288705],
        [0.16443886, 0.16817582, 0.16169311, 0.17293167, 0.16984224,
         0.1629183 ],
        [0.16496176, 0.16800125, 0.16160154, 0.17336658, 0.16913894,
         0.16293   ]],

       [[0.17059587, 0.16356929, 0.16625544, 0.16843696, 0.16487314,
         0.16626933],
        [0.16645326, 0.164119  , 0.16866173, 0.16740292, 0.16696203,
         0.16640101],
        [0.16732025, 0.16473892, 0.16896151, 0.16697483, 0.16605482,
         0.16594976],
        ...,
        [0.16419163, 0.16830443, 0.16175117, 0.172498  , 0.1703677 ,
         0.16288705],
        [0.16443886, 0.16817582, 0.16169311, 0.17293167, 0.169

In [119]:
predicted_labels = np.argmax(predictions, axis=-1)

In [123]:
predicted_labels[18]

array([0, 2, 2, ..., 3, 3, 3])

In [122]:
from sklearn.metrics import classification_report
cr = classification_report(y_test[20], predicted_labels[20])
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.06      0.12      2050
           1       0.01      0.08      0.02        25
           2       0.01      0.33      0.01         9
           3       0.06      0.29      0.10        83
           4       0.00      0.06      0.00        31
           5       0.00      0.00      0.00      1976

    accuracy                           0.04      4174
   macro avg       0.17      0.14      0.04      4174
weighted avg       0.46      0.04      0.06      4174



In [86]:
def count_ones(binary_vector):
    count = 0
    for bit in binary_vector:
        if bit == 1:
            count += 1
    return count
counter = 0
for llista in y_test:
  counter += count_ones(llista)

print(counter)


823
