In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import TrainerCallback

# Build the GPT-2 tokenizer and reuse its end-of-sequence token as the padding
# token, so that a pad token is defined when batches are assembled.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Read the raw training text with the 'text' dataset builder; the single input
# file is exposed as the 'train' split.
dataset = load_dataset(
    'text',
    data_files={'train': '/content/dataset_clarification_questions.txt'},
)

def tokenize_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module tokenizer.

    Truncation is enabled, so over-long sequences are cut by the tokenizer.
    The tokenizer's output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply the tokenizer to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Collator used to assemble batches; mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',   # Directory for storing logs
    logging_steps=10,       # Log every 10 steps
)

# Start from the pretrained 'gpt2' checkpoint with a language-modeling head.
model = GPT2LMHeadModel.from_pretrained('gpt2')

class PrintLossCallback(TrainerCallback):
    """Trainer callback that prints the latest logged training loss per epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the most recent ``'loss'`` value found in ``state.log_history``.

        The newest log entry is not guaranteed to carry a ``'loss'`` key (it
        may be an evaluation or summary record), and the history may still be
        empty if no logging step has happened yet.  The history is therefore
        scanned from newest to oldest for the first entry that has a loss; if
        none exists, ``n/a`` is printed instead of raising.

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; ``state.epoch`` and ``state.log_history``
                are read.
            control: Trainer control object (unused, not modified).
            **kwargs: Extra objects supplied by the Trainer (unused).
        """
        latest_loss = next(
            (
                entry['loss']
                for entry in reversed(state.log_history)
                if 'loss' in entry
            ),
            None,
        )
        if latest_loss is None:
            # Nothing with a training loss has been logged so far.
            print(f"Epoch {state.epoch} - Loss: n/a")
            return
        print(f"Epoch {state.epoch} - Loss: {latest_loss:.4f}")

# Create the Trainer
# Wires together the model, the training arguments, the tokenized 'train'
# split and the batch collator defined earlier in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
    eval_dataset=None,  # no evaluation split is supplied
    callbacks=[PrintLossCallback()]  # prints a loss line at the end of each epoch
)

# Run fine-tuning.
trainer.train()

# Write the model and the tokenizer to the same directory so both can be
# reloaded from './fine_tuned_gpt2' later.
model.save_pretrained('./fine_tuned_gpt2')
# Last expression of the cell: its return value is what the notebook echoes.
tokenizer.save_pretrained('./fine_tuned_gpt2')




Step,Training Loss
10,2.9404
20,1.9202
30,1.8369
40,1.4553
50,1.4669
60,1.3882
70,1.3153
80,1.4851
90,1.1772
100,1.2602


Epoch 1.0 - Loss: 1.0419
Epoch 2.0 - Loss: 1.0017
Epoch 3.0 - Loss: 0.7883
Epoch 4.0 - Loss: 0.6206
Epoch 5.0 - Loss: 0.5922


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Reload the saved tokenizer and reuse the end-of-sequence token for padding.
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Reload the fine-tuned model, move it to the chosen device and switch it to
# evaluation mode.
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
model = model.to(device)
model.eval()

# Function to generate responses
def generate_response(prompt, max_length=50):
    """Sample one continuation of ``prompt`` from the fine-tuned model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to condition the generation on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Let the tokenizer produce the attention mask together with the input
    # ids.  Deriving the mask from `inputs != pad_token_id` is wrong here
    # because the pad token is the EOS token: a genuine EOS token inside the
    # prompt would be masked out as if it were padding.
    encoded = tokenizer(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            # Pass the pad id explicitly (the same value generate() would fall
            # back to) so it stops warning about it on every call.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated response
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Deliberately vague prompts used to probe the fine-tuned model.
unclear_prompts = [
    "What's the deal?",
    "Can you tell me more about that?",
    "Explain the thing.",
    "I need help with my question.",
    "Why is it like this?",
]

# For each prompt, print the prompt, the generated text and a 50-dash rule,
# one per line.
for prompt in unclear_prompts:
    response = generate_response(prompt)
    print(
        f"Prompt: '{prompt}'",
        f"Response: '{response}'",
        "-" * 50,
        sep="\n",
    )


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: 'What's the deal?'
Response: 'What's the deal? Which specific product or service are you referring to? What specific terms or instructions are you referring to? What's the deal with? Which specific product or service are you referring to? What specific terms or instructions are you referring to'
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: 'Can you tell me more about that?'
Response: 'Can you tell me more about that? What specific part of the project are you interested in? What specific aspects? What specific components are you interested in? What specific features are you interested in? What specific criteria should be used for this evaluation? What'
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: 'Explain the thing.'
Response: 'Explain the thing. What specific details should be included in the presentation? What format do you prefer for the presentation? What materials or topics should be included? What format do you prefer for the presentation? What format do you prefer for the presentation?'
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: 'I need help with my question.'
Response: 'I need help with my question. What type of questions are you looking to ask for help with? What type of questions are you referring to? What resources are you looking to fill? What project or service do you need help with? What project or'
--------------------------------------------------
Prompt: 'Why is it like this?'
Response: 'Why is it like this? Could you clarify the project timeline? What specific timelines are you referring to? What specific areas need to be reworked? What specific processes are you referring to? What specific changes are you referring to? What specific aspects of'
--------------------------------------------------
