## FINE TUNING

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, Trainer, TrainingArguments
from accelerate import Accelerator

2024-08-04 21:55:49.363896: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-04 21:55:49.470652: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Instantiate a Hugging Face Accelerate `Accelerator`.
# NOTE(review): `accelerator` is not referenced by any later cell shown here — confirm it is still needed.
accelerator = Accelerator()

In [3]:
# Load the tokenizer and the question-answering model from the same pretrained
# checkpoint so that vocabulary and weights match.
model_name = "themariolinml/roberta-base-sqaud2-on-medical_meadow_medqa-v1"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

In [4]:
# Wrap the loaded model and tokenizer in a question-answering pipeline for inference.
pipe = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [5]:
def generate_context(row):
    """
    Extract an answer span for one row with the question-answering pipeline.

    Args:
        row (pd.Series): A DataFrame row with a 'user_query' field (the question)
            and an 'answer' field (the passage handed to the pipeline as context).

    Returns:
        str: The 'answer' value of the pipeline result. An empty string is returned
            when the result is not a dict containing 'answer', or when the pipeline
            call fails.

    Notes:
        - Relies on the module-level `pipe` object (a question-answering pipeline).
        - Failures are never propagated: any exception is caught, reported with
          `print`, and converted into an empty string so that a row-wise `apply`
          keeps going.
    """
    try:
        prediction = pipe(question=row['user_query'], context=row['answer'])

        # Guard clause: anything other than a dict carrying 'answer' is unusable.
        if not isinstance(prediction, dict) or 'answer' not in prediction:
            print(f"Unexpected result from the pipeline: {prediction}")
            return ''

        return prediction['answer']
    except Exception as err:
        print(f"Error generating context for question: {row['user_query']}. Error: {err}")
        return ''

In [6]:
# Load the cleaned training data from CSV.
df = pd.read_csv('../data/csv/train_dataset/df_clean.csv')

In [7]:
# Keep only the two columns read by `generate_context`: the question and the passage.
df = df[['user_query', 'answer']]

In [8]:
# Run the QA pipeline row by row and store each extracted span in a new 'context' column.
df['context'] = df.apply(generate_context, axis=1)

In [9]:
# Persist the frame with the generated 'context' column so the pipeline pass need not be repeated.
# With pandas' default index=True the row index is also written as an extra leading column.
df.to_csv('../data/csv/train_dataset/df_generative.csv')

In [5]:
# Reload the previously generated dataset (questions, passages and extracted spans) from disk.
df = pd.read_csv('../data/csv/train_dataset/df_generative.csv')

In [6]:
# Drop the leftover index column from the CSV round-trip, if present.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_query,answer,context
0,lyme disease 12 years ago I was bitten by tick...,Lyme disease tests are used to determine if a ...,Lyme disease
1,raynauds syndrome My sons middle toe turned wh...,Only one finger or toe or parts of one or more...,finger or toe or parts of one or more may be a...
2,burn to my wrist Hello I burnt my wrist 2 days...,"Before giving first aid, it is important to de...",major burn
3,treatment of parkinson I AM HAVING PARKINSON F...,you should know that people who have Parkinson...,Parkinson's disease
4,periventricular heterotopia. scoliosis - pos...,Isolated lissencephaly sequence (ILS) is a con...,Isolated lissencephaly sequence (ILS)


In [14]:
# Count missing values per column.
df.isnull().sum()

user_query    0
answer        0
context       2
dtype: int64

In [16]:
# Replace missing 'context' values with empty strings so the column has no nulls.
df['context'] = df['context'].fillna('')

In [17]:
# Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`.
dataset = Dataset.from_pandas(df)

In [18]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the train/eval membership, reproducible across kernel restarts.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [19]:
# Display the features and row counts of both splits.
train_test_split

DatasetDict({
    train: Dataset({
        features: ['user_query', 'answer', 'context'],
        num_rows: 356
    })
    test: Dataset({
        features: ['user_query', 'answer', 'context'],
        num_rows: 90
    })
})

In [20]:
def tokenize_function(examples):
    """Tokenize (question, passage) pairs for extractive question answering.

    Args:
        examples: A batch passed by `Dataset.map(..., batched=True)`. It must
            provide 'user_query' (the questions) and 'answer' (the passages the
            answer spans were extracted from).

    Returns:
        The tokenizer's batch encoding, padded to the maximum length and
        including `offset_mapping`.
    """
    return tokenizer(
        examples['user_query'],
        # The passage lives in 'answer'; 'context' only holds the short span that
        # was extracted from it, so it must not be used as the model's second sequence.
        examples['answer'],
        truncation=True,
        padding='max_length',
        return_offsets_mapping=True
    )

In [21]:
# Apply `tokenize_function` to both splits, one batch of examples at a time.
tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [22]:
def preprocess_answers(examples):
    """Attach token-level answer labels for extractive question answering.

    In this dataset 'answer' holds the source passage and 'context' holds the
    short span extracted from it. The span is located inside the passage and
    its character range is converted to token indices via the tokenizer's
    offset mapping.

    The (question, passage) pairs are re-encoded here and the resulting
    'input_ids', 'attention_mask' and 'offset_mapping' are written back, so
    the label indices always refer to the token sequence stored alongside them.

    Args:
        examples: A batch passed by `Dataset.map(..., batched=True)` providing
            'user_query', 'answer' and 'context'.

    Returns:
        The same batch with 'input_ids', 'attention_mask', 'offset_mapping',
        'start_positions' and 'end_positions' set. Examples whose span is
        empty, is not found in the passage, or was cut off by truncation are
        labelled (0, 0), i.e. they point at the first token of the sequence.
    """
    encodings = tokenizer(
        examples['user_query'],
        examples['answer'],
        truncation=True,
        padding='max_length',
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, (passage, span) in enumerate(zip(examples['answer'], examples['context'])):
        # str.find('') returns 0, so an empty span must be treated as "no answer" explicitly.
        char_start = passage.find(span) if span else -1
        token_start = token_end = 0

        if char_start != -1:
            char_end = char_start + len(span)
            offsets = encodings['offset_mapping'][i]
            # Sequence id 1 marks passage tokens; question, special and padding
            # tokens are excluded. Keep those overlapping the span's characters.
            span_tokens = [
                idx
                for idx, seq_id in enumerate(encodings.sequence_ids(i))
                if seq_id == 1 and offsets[idx][0] < char_end and offsets[idx][1] > char_start
            ]
            # Only label the example when the whole span survived truncation.
            if span_tokens and offsets[span_tokens[-1]][1] >= char_end:
                token_start, token_end = span_tokens[0], span_tokens[-1]

        start_positions.append(token_start)
        end_positions.append(token_end)

    examples['input_ids'] = encodings['input_ids']
    examples['attention_mask'] = encodings['attention_mask']
    examples['offset_mapping'] = encodings['offset_mapping']
    examples['start_positions'] = start_positions
    examples['end_positions'] = end_positions
    return examples

In [23]:
# Add 'start_positions' / 'end_positions' labels to both tokenized splits.
tokenized_datasets = tokenized_datasets.map(preprocess_answers, batched=True)

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [24]:
# Configure the training arguments: checkpoints and logs go to ./results,
# and evaluation runs once per epoch over 3 training epochs.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
    )



In [25]:
# Build the Trainer: fine-tune `model` on the tokenized train split and
# evaluate on the tokenized test split using the arguments configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    )

In [26]:
# Run fine-tuning.
trainer.train()

  0%|          | 0/69 [00:00<?, ?it/s]

In [None]:
# Evaluate on the Trainer's eval dataset and print the returned metrics.
evaluation_results = trainer.evaluate()
print(evaluation_results)

In [None]:
# Save the fine-tuned model and its tokenizer to the same directory so they can be reloaded together.
trainer.save_model("./my_model")
tokenizer.save_pretrained("./my_model")