In [None]:
pip install transformers datasets evaluate accelerate

In [None]:
pip install ipywidgets

In [None]:
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer,  TrainingArguments
from ipywidgets import interact, widgets
from IPython.display import display
import math
import numpy as np
import pandas as pd
import sys

In [None]:
# Load the dataset and adapt the data.
# The labels will eventually be part of the text itself, so it does not matter
# that they are also kept as a separate column.
data = load_dataset("celikmus/mayo_clinic_symptoms_and_diseases_v1")
# Convert the train split to pandas, which is easier to manipulate
df = data['train'].to_pandas()



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Example phrasings for asking which disease matches the listed symptoms
preguntas_base = ['What disease does the patient have?',
                  'Which diagnosis would you give to the patient?',
                  'What is the most likely diagnosis for the previous symptoms?',
                  'What would be the diagnosis for the symptoms described previously?',
                  'Given the symptoms, what condition does the patient have?',
                  'What is the diagnosis?',
                  'What condition is best described by the previous symptoms?']
# Number of examples in the dataset
rows = len(df)
# Seeded generator: the question assigned to each example is random but
# reproducible, so every run trains on the same text.
rng = np.random.default_rng(42)
pos = rng.integers(len(preguntas_base), size=rows)
# One randomly chosen question per example
preguntas = [preguntas_base[i] for i in pos]
# Final training text: "<symptoms> <question> <diagnosis>".
# The questions are wrapped in a Series that shares df's index so the
# concatenation is aligned row by row.
df['text'] = (
    df['text'] + ' ' + pd.Series(preguntas, index=df.index) + ' ' + df['label']
)
data = Dataset.from_pandas(df)

In [None]:
# Split the dataset into training and test sets (80% / 20%).
# A fixed seed keeps the split identical between runs, so the reported
# losses and perplexity are comparable.
data = data.train_test_split(test_size=0.2, seed=42)
print(data)
# One example from the training split
print(data['train'][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 846
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 212
    })
})
{'text': 'Depending on the joint that\'s affected, signs and symptoms of osteochondritis dissecans might include: \nPain. This most common symptom of osteochondritis dissecans might be triggered by physical activity — walking up stairs, climbing a hill or playing sports.\nSwelling and tenderness. The skin around your joint might be swollen and tender.\nJoint popping or locking. Your joint might pop or stick in one position if a loose fragment gets caught between bones during movement.\nJoint weakness. You might feel as though your joint is "giving way" or weakening.\nDecreased range of motion. You might be unable to straighten the affected limb completely.\n What is the most likely diagnosis for the previous symptoms? osteochondritis-dissecans', 'label': 'osteochondritis-dissecans'}


In [None]:
# Tokenizer that matches the "gpt2" checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def funcion_preprocesamiento(elemento):
    """Tokenize the ``text`` field, ending every example with the EOS token.

    The tokenized examples are later concatenated and cut into fixed-size
    blocks. Without a delimiter, the diagnosis of one example would be
    followed directly by the symptoms of the next one; the EOS token marks
    that boundary.

    Args:
        elemento: Mapping with a ``'text'`` entry holding either a single
            string or a list of strings (the latter is what ``map`` passes
            when called with ``batched=True``).

    Returns:
        The tokenizer output for the EOS-terminated text(s).
    """
    textos = elemento['text']
    eos = tokenizer.eos_token
    if isinstance(textos, str):
        # Non-batched call: a single example
        return tokenizer(textos + eos)
    return tokenizer([texto + eos for texto in textos])
# batched=True processes more than one example per call;
# num_proc raises the number of worker processes.
# The original columns are dropped so only the tokenizer output remains.
columnas_originales = data['train'].column_names
tokenized_data = data.map(
    funcion_preprocesamiento,
    batched=True,
    num_proc=4,
    remove_columns=columnas_originales,
)

tokenized_data

Map (num_proc=4):   0%|          | 0/846 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1818 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1371 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1114 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3251 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/212 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1239 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1657 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1147 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1344 > 1024). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 846
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 212
    })
})

Algunas secuencias de tokens son más largas que el máximo permitido por GPT-2 (1024 tokens). Por ello, concatenamos todas las secuencias y las dividimos en bloques de tamaño `block_size`. Dicho tamaño debe ser lo suficientemente pequeño para que los bloques quepan en la memoria de la GPU y, además, no debe superar el máximo establecido por el modelo.

In [None]:
# Number of tokens per training block. It has to stay at or below the model's
# maximum sequence length (1024 for GPT-2, per the tokenizer warnings above).
block_size = 512


def secuencias_iguales(elementos):
    """Concatenate a batch of tokenized sequences and cut it into blocks.

    Every column of the batch is flattened into one long list, which is then
    split into consecutive chunks of ``block_size`` items. When the batch
    holds at least ``block_size`` tokens, the trailing remainder is dropped
    so all blocks have exactly the same length; a batch shorter than
    ``block_size`` is returned as a single (short) block.

    Args:
        elementos: Mapping from column name (e.g. ``input_ids``,
            ``attention_mask``) to a list of per-example lists. It must
            contain ``input_ids``.

    Returns:
        dict with the same columns, each a list of blocks, plus ``labels``,
        a copy of the ``input_ids`` block list.
    """
    # Flatten every column in linear time (sum(lists, []) re-copies the
    # accumulated list on each addition, which is quadratic).
    secuencias_concatendas = {
        k: [token for secuencia in elementos[k] for token in secuencia]
        for k in elementos.keys()
    }
    # Total length of the concatenation (all columns share it)
    n = len(secuencias_concatendas[list(elementos.keys())[0]])
    # Drop the trailing tokens so the blocks come out exact. No padding is
    # added to fill another block; per the original author's note, the GPT-2
    # tokenizer does not support it.
    if n >= block_size:
        n = (n // block_size) * block_size
    # Split the concatenated sequences into blocks of block_size
    secuencias = {
        k: [t[i : i + block_size] for i in range(0, n, block_size)]
        for k, t in secuencias_concatendas.items()
    }
    # For causal language modeling the targets are the inputs themselves
    secuencias["labels"] = secuencias["input_ids"].copy()
    return secuencias

# Re-chunk both splits into equally sized blocks, several examples per call
s_i_dataset = tokenized_data.map(
    secuencias_iguales,
    batched=True,
    num_proc=4,
)
s_i_dataset

Map (num_proc=4):   0%|          | 0/846 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/212 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 499
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 125
    })
})

In [None]:
# Build the collator that assembles examples into batches.
# The end-of-sequence token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns masked-language-modeling off (presumably the causal-LM
# setting — confirm against the transformers documentation)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Entrenamiento

In [None]:
# Load the pretrained "gpt2" checkpoint as a causal language model
# (a causal LM only predicts tokens)
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
# Training
# The keyword that sets the evaluation schedule is `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so use whichever name this installation has.
campos_argumentos = getattr(TrainingArguments, "__dataclass_fields__", {})
clave_evaluacion = (
    "eval_strategy" if "eval_strategy" in campos_argumentos
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir = "gpt2_diagnosticos",
    learning_rate = 2e-5,
    weight_decay = 0.01,
    push_to_hub = False,
    # Log once per epoch so the training-loss column is filled in instead of
    # showing "No log" (the run is shorter than the default logging interval)
    logging_strategy = "epoch",
    # Evaluate at the end of every epoch
    **{clave_evaluacion: "epoch"},
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = s_i_dataset["train"],
    eval_dataset = s_i_dataset["test"],
    data_collator = data_collator,
)
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.137511
2,No log,2.043661
3,No log,2.020143


TrainOutput(global_step=189, training_loss=2.3833944330770502, metrics={'train_runtime': 239.0907, 'train_samples_per_second': 6.261, 'train_steps_per_second': 0.79, 'total_flos': 391154171904000.0, 'train_loss': 2.3833944330770502, 'epoch': 3.0})

In [None]:
# Run the trainer's evaluation and report perplexity = exp(evaluation loss)
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 7.54


In [None]:
def respuestas():
    """Show a minimal text-box interface for trying out the model.

    Displays an example prompt and a text widget. Every time text is
    submitted in the widget, it is tokenized with the notebook-level
    ``tokenizer``, passed to ``model.generate`` (sampling, at most 5 new
    tokens) and the decoded result is displayed.
    """
    text = widgets.Text(
        placeholder='Enter your symptoms',
        disabled=False
    )
    display('e.g: Patient has swollen feet and tongue, myalgia and arthralgia. He says he gets easily tired. What would be his diagnosis?')
    display(text)

    def callback(wdgt):
        """Generate and display a continuation for the submitted text."""
        prompt = wdgt.value
        if not prompt.strip():
            # An empty prompt would hand the model a zero-length input
            display('Please enter your symptoms first.')
            return
        display('')
        display('Symptoms: ')
        display(prompt)
        # Run inference with the fine-tuned model. The whole encoding
        # (input_ids and attention_mask) is moved to the device the model is
        # on, so this also works on machines without CUDA.
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **encoded,
            max_new_tokens=5,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            # GPT-2 has no dedicated padding token; passing EOS explicitly
            # avoids the "pad token id was not set" warning.
            pad_token_id=tokenizer.eos_token_id,
        )
        display('Output: ')
        display(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    # NOTE(review): `on_submit` is reported as deprecated in newer ipywidgets
    # releases — confirm before upgrading. It is kept because it also fires
    # when the same text is submitted again.
    text.on_submit(callback)


respuestas()
    

'e.g: Patient has swollen feet and tongue, myalgia and arthralgia. He says he gets easily tired. What would be his diagnosis?'

Text(value='', placeholder='Enter your symptoms')

''

'Symptoms: '

'The patient has fever, throat pain and headhache. What would be his diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has fever, throat pain and headhache. What would be his diagnosis? mild-to-moderate']

''

'Symptoms: '

'The patient has fever, throat pain and headhache. What would be his diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has fever, throat pain and headhache. What would be his diagnosis? chirovirus\n']

''

'Symptoms: '

'The patient has fever, throat pain and headhache. What would be his diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has fever, throat pain and headhache. What would be his diagnosis? influenza-prevention-']

''

'Symptoms: '

'The patient has fever, throat pain and headhache. What would be his diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has fever, throat pain and headhache. What would be his diagnosis? hairdresser •']

''

'Symptoms: '

'The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis? anXiaVent']

''

'Symptoms: '

'The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?\nChronic obstructive']

''

'Symptoms: '

'The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis? ulcerative colitis']

''

'Symptoms: '

'The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis? ophthalmologist ']

''

'Symptoms: '

'The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis?'

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Output: '

['The patient has a swollen mole that is itchy and reddish in color. What would be his most likely diagnosis? Myalgia\n Myalgia']