In [1]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

dataset_path = "/content/Dataset.csv"  # Change to your dataset file path
df = pd.read_csv(dataset_path)

df = df.dropna()
df["context"] = df["context"].astype(str)
df["response"] = df["response"].astype(str)

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = ["question: " + str(q) if isinstance(q, str) else "question: empty" for q in examples["context"]]
    targets = [str(a) if isinstance(a, str) else "unknown" for a in examples["response"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess_function, batched=True)

dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

model = T5ForConditionalGeneration.from_pretrained("t5-small")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./t5_finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["context"] = df["context"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["response"] = df["response"].astype(str)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access

Map:   0%|          | 0/68109 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mhirudikaanupama3[0m ([33mhirudikaanupama3-robert-gordon-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
0,0.3163,0.301104
1,0.3172,0.294861
2,0.3117,0.291665
3,0.3066,0.289748


Epoch,Training Loss,Validation Loss
0,0.3163,0.301104
1,0.3172,0.294861
2,0.3117,0.291665
3,0.3066,0.289748
4,0.2968,0.28918


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=9575, training_loss=0.32046077060948464, metrics={'train_runtime': 5694.4868, 'train_samples_per_second': 53.822, 'train_steps_per_second': 1.681, 'total_flos': 2.0738694923157504e+16, 'train_loss': 0.32046077060948464, 'epoch': 4.999608508417069})

In [2]:
model.save_pretrained("./t5_finetuned_model")
tokenizer.save_pretrained("./t5_finetuned_model")



('./t5_finetuned_model/tokenizer_config.json',
 './t5_finetuned_model/special_tokens_map.json',
 './t5_finetuned_model/spiece.model',
 './t5_finetuned_model/added_tokens.json')

In [4]:
from google.colab import files
import shutil

# Compress the folder
shutil.make_archive("t5_finetuned_model", 'zip', "./t5_finetuned_model")

# Download the zipped model
files.download("t5_finetuned_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
import torch
from torch.nn.functional import softmax

def generate_answer(question, num_return_sequences=1):
    # Stronger prompt engineering for focused guidance
    input_text = (
        f"Provide an empathetic and practical response to: {question}. "
        "Offer three actionable coping strategies and ensure the response is structured, clear, and free of repetition."
    )

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

    outputs = model.generate(
        input_ids,
        max_length=200,
        num_beams=10,
        early_stopping=True,
        repetition_penalty=2.5,  # Stronger to eliminate redundancy
        length_penalty=1.8,
        temperature=0.5,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=num_return_sequences,
        return_dict_in_generate=True
    )

    responses = []
    for i in range(num_return_sequences):
        response_text = tokenizer.decode(outputs.sequences[i], skip_special_tokens=True)

        # Remove unwanted patterns like self-introduction or vague phrases
        filtered_phrases = [
            "i am a mental health coach", "i am a certified", "i think",
            "it's important to know that", "the best way to do this is",
            "consider talking with a professional"
        ]

        if any(phrase in response_text.lower() for phrase in filtered_phrases):
            continue

        responses.append(response_text)

    # Return the most relevant response
    best_response = responses[0] if responses else tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    return best_response, 1.0  # Confidence score removed due to invalid computation

# Example Usage
question = "I manage depression"
generated_answer, confidence_score = generate_answer(question, num_return_sequences=3)

print(f"Generated Answer: {generated_answer}")


Generated Answer: i'm sorry to hear that you are feeling this way. it sounds like you have a good understanding of what is going on in your life and how you feel about it. the best way to deal with depression is to seek help from a local mental health professional, as well as a local mental health professional. they may also be able to assist you in dealing with depression.
