In [3]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd

Initialize the GPT-2 tokenizer and model

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
dataset = load_dataset('csv', data_files='../data/corpus_turims.csv', encoding='utf8')

In [54]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Pregunta', 'labels'],
        num_rows: 30
    })
})

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["Respuesta"], padding="max_length", truncation=True, return_tensors="pt")

In [8]:
block_size = 512


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["Pregunta"] = result["input_ids"].copy()
    return result

In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [11]:
tokenized_datasets = dataset.map(
    tokenize_function, 
    batched=True,
    remove_columns=dataset["train"].column_names,
    )

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 30
})

In [13]:
lm_dataset = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [14]:
lm_dataset['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'Pregunta'],
    num_rows: 60
})

In [15]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [16]:
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-qa",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    save_steps=10,
    eval_steps=10
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset['train'],
)

In [18]:
trainer.train()

  0%|          | 0/32 [00:00<?, ?it/s]

IndexError: index out of range in self

In [2]:
model_name = 'gpt2-medium'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [3]:
def get_response_from_corpus(user_input, corpus_df, tokenizer):
   """Buscar la respuesta en el corpus basada en la pregunta del usuario."""
   user_tokens = tokenizer.encode(user_input.lower(), add_special_tokens=False)

   for index, row in corpus_df.iterrows():
      pregunta = row['Pregunta'].lower()
      respuesta = row['Respuesta']

   for token in user_tokens:
      if token in tokenizer.encode(pregunta, add_special_tokens=False):
         return respuesta

   return None

In [4]:
def setup_chatbot():
   # Load the pre-trained GPT-2 model and tokenizer

   # Load your corpus from the CSV file
   corpus_df = pd.read_csv('../data/corpus_turims.csv', encoding='utf-8')

   print("Chatbot: ¡Hola! Soy un chatbot. ¿En qué puedo ayudarte hoy?")

   while True:
      user_input = input("Tú: tienes alguna consulta el día de hoy?")
      if user_input.lower() == 'exit':
         print("Chatbot: Hasta luego. ¡Que tengas un buen día!")
         break

      # Buscar la respuesta en el corpus basada en la pregunta del usuario
      response = get_response_from_corpus(user_input, corpus_df, tokenizer)

      if response:
         print("Chatbot:", response)
      else:
         print("Chatbot: Lo siento, no tengo información sobre eso en mi corpus.")

In [5]:
setup_chatbot()

Chatbot: ¡Hola! Soy un chatbot. ¿En qué puedo ayudarte hoy?
Chatbot: Las Termas de Papallacta son conocidas por sus aguas termales relajantes en medio de la naturaleza.
Chatbot: Hasta luego. ¡Que tengas un buen día!
