💻 Step 1: Importing Libraries.

In [45]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import pipeline

💻 Step 2: Load and Split the Data

In [46]:
# 1. Loading the Dataset
# The "csv" builder reads the local file and exposes every row under a single
# 'train' split, which is why the split step below indexes ['train'].
print("Downloading EmpatheticDialogues dataset...")
raw_dataset = load_dataset("csv",data_files="emotion-emotion_69k.csv")

# 2. Split it (90% for training, 10% for testing)
# This creates a new structure with 'train' and 'test' inside.
# A fixed seed keeps the train/validation membership identical across kernel
# restarts, consistent with the seed=42 used when sub-sampling later on.
dataset = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)

# 3. Rename 'test' to 'validation' (Standard naming for Hugging Face)
dataset['validation'] = dataset.pop('test')

print("âœ… Data loaded and split successfully!")
print(dataset)

Downloading EmpatheticDialogues dataset...
âœ… Data loaded and split successfully!
DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6'],
        num_rows: 58172
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6'],
        num_rows: 6464
    })
})


In [47]:
# Bare expression: the notebook displays the DatasetDict repr (split names,
# column names and row counts) as this cell's output.
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6'],
        num_rows: 58172
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6'],
        num_rows: 6464
    })
})

💻 Step 3: Check Column Names (Crucial!)

In [48]:
# Inspect the training split: which columns exist, and what one raw record
# looks like, before deciding which fields feed the model.
train_split = dataset['train']

print("Column Names:", train_split.column_names)
print("Example Row:", train_split[0])

Column Names: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6']
Example Row: {'Unnamed: 0': 49773, 'Situation': 'I was a little shocked when a storm came through recently a it hailed and damaged my vehicle a little! Luckily I have insurance at least..', 'emotion': 'surprised', 'empathetic_dialogues': "Customer :That's awful! What kind of car do you have?\nAgent :", 'labels': 'a nissan altima', 'Unnamed: 5': None, 'Unnamed: 6': None}


In [49]:
# (rows, columns) of the validation split followed by the training split.
dataset['validation'].shape, dataset['train'].shape

((6464, 7), (58172, 7))

💻 Step 4: Load Model & Format Data

In [None]:
# AutoTokenizer / AutoModelForCausalLM are already imported in the Step 1
# import cell, so they are not re-imported here.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding so that padding="max_length"
# in the tokenization step has a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

# --- FORMATTING & TOKENIZING DATA ---
def format_data(example):
    """Turn one dataset row into a single training string.

    Args:
        example: Mapping with an 'empathetic_dialogues' entry (the dialogue
            history, using "Customer"/"Agent" speaker tags) and a 'labels'
            entry (the reply text).

    Returns:
        ``{"text": "<history> <reply>"}`` where the speaker tags in the
        history have been rewritten to "User:" / "Bot:".

    Missing values (``None``) in either field are treated as empty strings.
    Without this, a missing reply would be interpolated as the literal text
    "None", and a missing history would raise AttributeError on ``.replace``.
    """
    # 1. Cleaning up the History (Change "Customer" to "User")
    # This is the "Input" part of the stream
    history = example['empathetic_dialogues']
    if history is None:
        history = ""

    # Both the spaced ("Customer :") and unspaced ("Customer:") spellings are
    # normalised; the spaced form is replaced first, as in the original order.
    tag_rewrites = (
        ('Customer :', 'User:'),
        ('Customer:', 'User:'),
        ('Agent :', 'Bot:'),
        ('Agent:', 'Bot:'),
    )
    for old_tag, new_tag in tag_rewrites:
        history = history.replace(old_tag, new_tag)

    # 2. Get the Response (Changing "labels" to the target text)
    # This is the "Output" part of the stream
    response = example['labels']
    if response is None:
        response = ""

    # 3. COMBINE them into one string for the model to learn
    return {"text": f"{history} {response}"}

# Applying formatting
# Runs format_data on every row of every split; each row gains a "text" column
# alongside the original columns.
formatted_dataset = dataset.map(format_data)

# Tokenizing
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all rows
    come out the same length.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize in batches and drop every pre-existing column (including the raw
# text 'labels' column read by format_data), leaving only the tokenizer outputs.
source_columns = formatted_dataset["train"].column_names
tokenized_datasets = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=source_columns,
)

# Select small subset for training
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_validation = tokenized_datasets["validation"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(5000))
eval_dataset = shuffled_validation.select(range(100))

print("âœ… Data combined and ready!")

In [None]:
# Training configuration: checkpoints go to ./mental_health_bot every 500
# optimizer steps; report_to="none" turns off external experiment trackers.
training_args = TrainingArguments(
    output_dir="./mental_health_bot",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    report_to="none"
)

# mlm=False selects the causal-LM objective (next-token prediction) rather
# than masked-language-modelling.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

print("ðŸš€ Starting training...")
trainer.train()
print("ðŸŽ‰ Training complete!")

# eval_dataset is handed to the Trainer above, but no evaluation strategy is
# configured, so nothing would ever score it. Run one explicit evaluation pass
# so the held-out subset is actually used.
eval_metrics = trainer.evaluate()
print("Validation metrics:", eval_metrics)

ðŸš€ Starting training...




Step,Training Loss


In [None]:
# Build the text-generation pipeline on whichever device the model already
# lives on. Hard-coding device=0 requires a CUDA GPU and fails on CPU-only
# machines; model.device works for both.
my_bot = pipeline("text-generation", model=model, tokenizer=tokenizer, device=model.device)

def ask_bot(input_text):
    """Generate a single bot reply for one user message.

    Args:
        input_text: The user's message (plain text).

    Returns:
        The bot's reply as a stripped string: the text generated directly
        after the prompt, cut off at the first point where the model starts
        writing another "User:" or "Bot:" turn.
    """
    # Same "User: ...\nBot:" layout that format_data produces for training.
    prompt = f"User: {input_text}\nBot:"

    response = my_bot(
        prompt,

        # Generate at most 30 new *tokens* beyond the prompt (tokens are
        # sub-word pieces, so this is usually fewer than 30 words).
        max_new_tokens=30,

        num_return_sequences=1,
        do_sample=True,
        top_k=50,

        # Temperature below 1.0 sharpens the sampling distribution, giving
        # more focused, less random output.
        temperature=0.6,

        # Values above 1.0 penalise tokens that have already appeared, which
        # discourages loops and babbling.
        repetition_penalty=1.2,

        # The tokenizer's pad token was set to EOS when it was loaded, so pass
        # the same id here as the padding token for generation. This is the
        # padding id, not a stop condition.
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = response[0]['generated_text']

    # Keep only what the model wrote after the prompt. Splitting the full text
    # on "Bot:" and taking the last piece would return a hallucinated later
    # turn whenever the model writes extra "User:"/"Bot:" lines.
    if generated_text.startswith(prompt):
        reply = generated_text[len(prompt):]
    else:
        reply = generated_text.split("Bot:", 1)[-1]

    # Stop at the first sign of a new speaker turn.
    for speaker_tag in ("User:", "Bot:"):
        reply = reply.split(speaker_tag, 1)[0]

    return reply.strip()

# Smoke test: send one sample message through the bot and print its reply.
# Sampling is enabled in ask_bot, so the reply can differ between runs.
print(ask_bot("I am feeling really tensed."))