In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install -q transformers datasets scikit-learn pandas

from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from datasets import Dataset, DatasetDict

# 1. Read the ticket CSV. Rows without an answer keep an empty string as their
#    target text instead of NaN.
df = pd.read_csv("dataset-tickets-multi-lang-4-20k.csv").assign(
    answer=lambda tickets: tickets["answer"].fillna("")
)

# 2. Prepare input-output format
def format_example(row):
    """Build one seq2seq training pair from a ticket record.

    Args:
        row: Mapping-like record (DataFrame row or dict) providing the
            'subject', 'body' and 'answer' entries.

    Returns:
        dict with two keys:
            'input_text'  -- "question: <subject> context: <body>"
            'output_text' -- the answer text
        Missing values (None / NaN / pd.NA) in any of the three fields are
        treated as empty strings.
    """
    def _or_empty(value):
        # A missing cell would otherwise be rendered by the f-string as the
        # literal word "nan" (or "None") and end up inside the prompt.
        # Only scalars are checked; pd.isna on a container returns an array.
        if value is None or (not isinstance(value, (str, bytes, list, tuple, dict)) and pd.isna(value)):
            return ""
        return value

    subject = _or_empty(row['subject'])
    body = _or_empty(row['body'])

    input_text = f"question: {subject} context: {body}"
    output_text = _or_empty(row['answer'])
    return {"input_text": input_text, "output_text": output_text}

# Apply the row formatter to every ticket; "expand" spreads each returned dict
# into the columns of a new DataFrame.
formatted_data = df.apply(
    format_example,
    axis="columns",
    result_type="expand",
)

# 3. Wrap the formatted frame as a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(formatted_data)

# 4. Hold out 10% of the rows for evaluation (fixed seed for a repeatable split)
split_dataset = hf_dataset.train_test_split(seed=42, test_size=0.1)

# 5. Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# 6. Tokenize function
def tokenize(example):
    """Tokenize inputs and targets for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        example: Mapping with 'input_text' and 'output_text'. Each may be a
            single string or, when used with ``map(..., batched=True)``, a
            list of strings.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an added 'labels' entry: the target token ids (padded/truncated
        to 128 tokens) where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(example["input_text"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(example["output_text"], max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        # -100 is the index the loss ignores; leaving the pad id in place would
        # make the model train on (and be scored against) padding positions.
        return [token if token != pad_id else -100 for token in sequence]

    label_ids = labels["input_ids"]
    if len(label_ids) > 0 and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single-example call: a flat list of ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize the train and test splits in batches
tokenized_datasets = split_dataset.map(tokenize, batched=True)

# 7. Pretrained checkpoint to fine-tune
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 8. Training configuration
training_args = TrainingArguments(
    output_dir="./t5-support-bot",
    # --- schedule and batch sizes ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- evaluation and checkpointing, both once per epoch ---
    eval_strategy="epoch",  # newer name of the former evaluation_strategy argument
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- logging ---
    logging_dir="./logs",
    logging_steps=100,
)

# 9. Collator that batches the tokenized seq2seq examples
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# 10. Assemble the trainer from the pieces above
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# 11. Run fine-tuning
trainer.train()

# 12. Export model weights and tokenizer files to the same folder
model.save_pretrained("./final-t5-support-bot")
tokenizer.save_pretrained("./final-t5-support-bot")

In [None]:
import zipfile
import os

# Define the directory where your model and tokenizer are saved
model_directory = "./final-t5-support-bot"

# Define the name for your zip file
zip_filename = "final-t5-support-bot.zip"

# Fail fast when there is nothing to archive: os.walk() yields nothing for a
# missing directory, which would leave an empty zip behind while the message
# below still reports success.
if not os.path.isdir(model_directory):
    raise FileNotFoundError(f"Model directory not found: {model_directory}")

# Create a ZipFile object in write mode
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the directory and add each file to the zip
    for root, dirs, files in os.walk(model_directory):
        for file in files:
            # Create the full path to the file
            file_path = os.path.join(root, file)
            # Store the file under its path relative to the model directory so
            # the archive does not contain the top-level folder itself
            arcname = os.path.relpath(file_path, model_directory)
            zipf.write(file_path, arcname)

print(f"Model and tokenizer saved to {zip_filename}")

In [None]:
# Unpack the model archive into ./final-t5-support-bot (-d sets the target
# directory; -o overwrites existing files without prompting).
!unzip -o final-t5-support-bot.zip -d ./final-t5-support-bot

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Restore the fine-tuned weights and the tokenizer from the local export
# folder (relative path, no leading './'), with the model in evaluation mode.
model_path = "final-t5-support-bot"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).eval()

# Generate response function
def generate_response(question, context, max_length=128, num_beams=4, max_input_length=512):
    """Generate a reply for a support ticket with the loaded T5 model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        question: Ticket subject, placed after the "question:" prefix.
        context: Ticket body, placed after the "context:" prefix.
        max_length: ``max_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``. Defaults to 4,
            the value that used to be hard-coded.
        max_input_length: Token limit the prompt is truncated to. Defaults to
            512, the value that used to be hard-coded.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)  # keep the ids on the same device as the model

    # Generation only; gradient tracking is switched off.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test 1: a crash report with a short subject and a one-sentence body
test_question = "Platform crash"
test_context = (
    "The analytics platform stopped working unexpectedly and restarting the MacBook did not help."
)

response = generate_response(question=test_question, context=test_context)
print("💬 Generated Response:\n", response)

In [None]:
# Smoke test 2: an account-access ticket
test_question = "How can I reset my password?"
test_context = (
    "I forgot my login credentials and I can't access my account anymore. The reset password link doesn't work."
)

response = generate_response(question=test_question, context=test_context)
print("Generated Response:\n", response)

Generated Response:
 I have forgotten my login credentials and can't access my account anymore. Please let me know a suitable time to call you at tel_num> to discuss this further.


In [None]:
import os
import json
import matplotlib.pyplot as plt

# Step 1: Create dummy logs folder and trainer_state.json with sample data.
# NOTE: these are illustrative numbers written by this cell, not metrics
# produced by a training run.
os.makedirs("./logs", exist_ok=True)

dummy_log = {
    "log_history": [
        {"step": 10, "loss": 2.5},
        {"step": 20, "loss": 2.0, "eval_loss": 2.1},
        {"step": 30, "loss": 1.7, "eval_loss": 1.8},
        {"step": 40, "loss": 1.3, "eval_loss": 1.4},
        {"step": 50, "loss": 1.1, "eval_loss": 1.2},
    ]
}

with open("./logs/trainer_state.json", "w") as f:
    json.dump(dummy_log, f)

# Step 2: Load the trainer_state.json
with open("./logs/trainer_state.json") as f:
    trainer_state = json.load(f)

log_history = trainer_state.get("log_history", [])

# Step 3: Extract the train and eval series independently, each with its own
# step axis. A state file written by Trainer logs evaluation metrics as
# separate entries without a "loss" key, so reading eval_loss only from
# entries that contain "loss" would drop every evaluation point.
train_points = [
    (entry["step"], entry["loss"])
    for entry in log_history
    if "loss" in entry
]
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in log_history
    if entry.get("eval_loss") is not None
]

steps = [step for step, _ in train_points]
train_loss = [loss for _, loss in train_points]
eval_steps = [step for step, _ in eval_points]
eval_loss = [loss for _, loss in eval_points]

# Step 4: Plot training and eval loss
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(steps, train_loss, label="Train Loss", marker='o')
ax.plot(eval_steps, eval_loss, label="Eval Loss", marker='x')
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.set_title("Training and Evaluation Loss")
ax.legend()
ax.grid(True)
plt.show()