In [None]:
!pip install transformers datasets --quiet
!git clone https://github.com/LCS2-IIITD/SPARTA_WSDM2022.git

Cloning into 'SPARTA_WSDM2022'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 48 (delta 7), reused 34 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (48/48), 45.61 KiB | 4.15 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import TrainingArguments


In [None]:
!pip install -U transformers




In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [None]:
# ✅ STEP 1: Load HOPE Therapy Data
hope_path = "/content/SPARTA_WSDM2022/HOPE_data/HOPE_therapy_session_transcripts"
# os.listdir() returns entries in arbitrary order; sorting keeps the order of the
# extracted pairs (and the transcript left in `df` after the loop) stable across runs.
files = sorted(f for f in os.listdir(hope_path) if f.endswith(".csv"))

hope_pairs = []
for file in files:
    df = pd.read_csv(os.path.join(hope_path, file))

    # Map the Type column to readable roles:
    # in the HOPE dataset, 'T' is Therapist and 'P' is Patient/Client.
    df['Speaker'] = df['Type'].map({'T': 'Therapist', 'P': 'Client'})
    df['Content'] = df['Utterance']  # Rename for consistency

    # Pull the two columns out once so the scan below is purely positional and
    # does not depend on the frame's index labels.
    speakers = df['Speaker'].tolist()
    contents = df['Content'].tolist()

    # Keep every client utterance that is immediately followed by a therapist utterance.
    for i in range(1, len(speakers)):
        if speakers[i - 1] == "Client" and speakers[i] == "Therapist":
            hope_pairs.append({
                "prompt": f"Client: {contents[i - 1]}",
                "response": f"Therapist: {contents[i]}",
                "source": "HOPE"
            })

print(f"Extracted {len(hope_pairs)} dialogue pairs from HOPE dataset")

Extracted 225 dialogue pairs from HOPE dataset


In [None]:
# Inspect the schema of `df`. At this point `df` is whatever transcript the HOPE loading
# loop read last, including the Speaker/Content columns that loop adds.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          48 non-null     object
 1   Type        48 non-null     object
 2   Utterance   48 non-null     object
 3   Dialog_Act  48 non-null     object
 4   Speaker     48 non-null     object
 5   Content     48 non-null     object
dtypes: object(6)
memory usage: 2.4+ KB


In [None]:
# ✅ STEP 2: Load EmpatheticDialogues
try:
    empathetic_ds = load_dataset("empathetic_dialogues")
    empathy_pairs = []

    # Rolling state: the utterance and conversation id of the previously visited row.
    prev_conv_id = None
    context = ""

    for row in empathetic_ds['train']:
        # A row becomes a "response" when it is not the opening turn and it belongs
        # to the same conversation as the row just before it.
        same_conversation = row['conv_id'] == prev_conv_id
        if row['utterance_idx'] > 0 and same_conversation:
            pair = {
                "prompt": f"Client: {context}",
                "response": f"Therapist: {row['utterance']}",
                # The dataset's `context` column is stored under the "emotion" key.
                "emotion": row['context'],
                "source": "EmpatheticDialogues"
            }
            empathy_pairs.append(pair)

        # Slide the window: this row is the context for the next one.
        prev_conv_id, context = row['conv_id'], row['utterance']

    print(f"Extracted {len(empathy_pairs)} dialogue pairs from EmpatheticDialogues dataset")
except Exception as e:
    # Best effort: report the failure and continue the notebook without this source.
    print(f"Error loading EmpatheticDialogues: {e}")
    empathy_pairs = []

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76673 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10943 [00:00<?, ? examples/s]

Error loading EmpatheticDialogues: Loading a dataset cached in a LocalFileSystem is not supported.


In [None]:
# ✅ STEP 3: Load CounselChat
try:
    !wget !wget https://raw.githubusercontent.com/nbertagnolli/counsel-chat/master/data/counselchat-data.csv
    cc_df = pd.read_csv("counselchat-data.csv")
    counsel_pairs = []

    for _, row in cc_df.iterrows():
        if pd.notnull(row['questionText']) and pd.notnull(row['answerText']):
            counsel_pairs.append({
                "prompt": f"Client: {row['questionText']}",
                "response": f"Therapist: {row['answerText'].replace('<p>','')}",
                "source": "CounselChat"
            })

    print(f"Extracted {len(counsel_pairs)} dialogue pairs from CounselChat dataset")
except Exception as e:
    print(f"Error loading CounselChat: {e}")
    counsel_pairs = []

--2025-05-12 21:30:44--  http://!wget/
Resolving !wget (!wget)... failed: Name or service not known.
wget: unable to resolve host address ‘!wget’
--2025-05-12 21:30:44--  https://raw.githubusercontent.com/nbertagnolli/counsel-chat/master/data/counselchat-data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3019672 (2.9M) [text/plain]
Saving to: ‘counselchat-data.csv’


2025-05-12 21:30:46 (79.9 MB/s) - ‘counselchat-data.csv’ saved [3019672/3019672]

FINISHED --2025-05-12 21:30:46--
Total wall clock time: 1.1s
Downloaded: 1 files, 2.9M in 0.04s (79.9 MB/s)
Extracted 1383 dialogue pairs from CounselChat dataset


In [None]:
# ✅ STEP 4: Merge All Dialogues
dialogue_data = hope_pairs + empathy_pairs + counsel_pairs
print(f"Total dialogue pairs: {len(dialogue_data)}")

# Optional spot check: print one randomly drawn pair for each source that has data.
print("\nSample data from each source:")
for source in ["HOPE", "EmpatheticDialogues", "CounselChat"]:
    samples = [d for d in dialogue_data if d.get("source") == source]
    if not samples:
        # Nothing was loaded from this source, so there is nothing to show.
        continue

    print(f"\n{source} sample:")
    sample = np.random.choice(samples)
    print(f"Prompt: {sample['prompt']}")
    print(f"Response: {sample['response']}")

Total dialogue pairs: 1608

Sample data from each source:

HOPE sample:
Prompt: Client: I mean, that's what scares me. That's, that's exactly the thought that I'm thinking when I'm about to take the test. And if I don't pass that, it's just gonna have like this domino effect of failure.
Response: Therapist: Okay, so it could be a core belief, but it's also seems like you're still on that thought. Like as you get ready to take that test. You're thinking I'm just destined to be a failure. 

CounselChat sample:
Prompt: Client: Whenever I run into a situation that makes me upset or angry, I tend to start cursing and badly offending the person I am confronting. I say mean things to let my anger out. Whenever people tell me stuff about my relationship (like starting rumors or saying negative things about me or my relationship) I lash out not just them but at my boyfriend. I feel like I keep causing drama due to my personality. I want to be a better person and learn to let things not get to m

In [None]:
# ✅ STEP 5: Create Emotion Classification Dataset for BERT
# Extract emotion information where available, or label based on content analysis
def extract_emotion_label(row):
    """Return the row's "emotion" value, or "neutral" when it is absent or falsy."""
    label = row["emotion"] if "emotion" in row else None
    return label if label else "neutral"

# Attach an `emotion_label` to every pair (this mutates the dicts in `dialogue_data`).
for item in dialogue_data:
    item["emotion_label"] = extract_emotion_label(item)

# Create emotion classification dataset
emotion_dataset = Dataset.from_list(dialogue_data)
# A fixed seed makes the 90/10 train/validation split identical on every run.
train_val_emotion = emotion_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# ✅ STEP 6: Train BERT for Emotion Understanding
# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define preprocessing function for BERT
def preprocess_function(examples):
    """Tokenize the "prompt" field, truncating/padding each sequence to 128 tokens."""
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
    return bert_tokenizer(examples["prompt"], **tokenizer_options)

print(train_val_emotion["train"].column_names)

# Process datasets for BERT
tokenized_train_emotion = train_val_emotion["train"].map(preprocess_function, batched=True)
tokenized_val_emotion = train_val_emotion["test"].map(preprocess_function, batched=True)

# Get unique emotion labels and create label mapping.
# The labels are sorted before numbering: iterating a set of strings gives an order that
# can differ between interpreter runs, which would assign each emotion a different id
# every time the notebook is re-executed.
unique_emotions = {item["emotion_label"] for item in dialogue_data}
emotion_labels = sorted(unique_emotions)
label2id = {label: i for i, label in enumerate(emotion_labels)}
id2label = {i: label for i, label in enumerate(emotion_labels)}

# Add numeric labels
def add_numeric_labels(example):
    """Write the integer id of the example's emotion label to "label" (0 if unmapped)."""
    emotion = example["emotion_label"]
    example["label"] = label2id[emotion] if emotion in label2id else 0
    return example

tokenized_train_emotion = tokenized_train_emotion.map(add_numeric_labels)
tokenized_val_emotion = tokenized_val_emotion.map(add_numeric_labels)

# Initialize BERT for sequence classification; the classification head is sized to the
# number of distinct emotion labels and carries the label <-> id mappings in its config.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(emotion_labels),
    id2label=id2label,
    label2id=label2id
)

# Training arguments for BERT
bert_training_args = TrainingArguments(
    output_dir="./emotion-analysis-model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="steps",
    eval_steps=200,
    logging_dir="./logs-bert",
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # fp16 mixed precision is a GPU feature: request it only when CUDA is present so
    # this cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Initialize BERT trainer
bert_trainer = Trainer(
    model=bert_model,
    args=bert_training_args,
    train_dataset=tokenized_train_emotion,
    eval_dataset=tokenized_val_emotion
)

# Train BERT model (commented out to avoid accidental execution)
# bert_trainer.train()

# Save the BERT model
# bert_model.save_pretrained("./emotion-analysis-final")
# bert_tokenizer.save_pretrained("./emotion-analysis-final")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['prompt', 'response', 'source', 'emotion_label']


Map:   0%|          | 0/1447 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

Map:   0%|          | 0/1447 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ✅ STEP 7: Prepare for GPT-2 Fine-Tuning
# Format data for training
for d in dialogue_data:
    # One training string per pair: the client prompt, a newline, then the therapist
    # response. Note that no emotion label is written into this text.
    d['text'] = f"{d['prompt']}\n{d['response']}"

# Create dataset object for GPT-2
gpt_dataset = Dataset.from_list(dialogue_data)
# A fixed seed makes the 90/10 train/validation split identical on every run.
train_val_gpt = gpt_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_gpt['train']
val_dataset = train_val_gpt['test']

# Tokenize for GPT-2
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token  # GPT-2 doesn't have a pad token by default

def tokenize_gpt(examples):
    """Tokenize the "text" field, truncating/padding each sequence to 512 tokens."""
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
        "return_tensors": "pt",
    }
    return gpt_tokenizer(examples["text"], **encoding_options)

# Drop every original column so only the tokenizer outputs remain. The names are taken
# from the datasets themselves, so this stays correct if the pair dicts gain extra fields.
tokenized_train_dataset = train_dataset.map(tokenize_gpt, batched=True,
                                            remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(tokenize_gpt, batched=True,
                                        remove_columns=val_dataset.column_names)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1447 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

In [None]:
# ✅ STEP 8: Configure GPT-2 Training
# Initialize GPT-2 model
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")

# Data collator for GPT-2 (mlm=False selects causal rather than masked language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt_tokenizer,
    mlm=False
)

# Training arguments for GPT-2
gpt_training_args = TrainingArguments(
    output_dir="./mental-health-chatbot-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=500,
    logging_dir="./logs-gpt",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    learning_rate=5e-5,
    # fp16 mixed precision is a GPU feature: request it only when CUDA is present so
    # this cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Initialize GPT-2 trainer
gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

# Train GPT-2 model (commented out to avoid accidental execution)
# gpt_trainer.train()

# Save the GPT-2 model
# gpt_model.save_pretrained("./mental-health-chatbot-final")
# gpt_tokenizer.save_pretrained("./mental-health-chatbot-final")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# ✅ STEP 8b: Second GPT-2 setup under the plain names `tokenizer` / `model` / `trainer`
# This repeats the STEP 8 configuration with a freshly loaded "gpt2" checkpoint. The
# evaluation cell at the end of the notebook calls `model` and `tokenizer`, so these
# names must exist before it runs.
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse end-of-sequence

# Initialize model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Data collator (rebinds `data_collator`; mlm=False selects causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./mental-health-chatbot-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    learning_rate=5e-5,
    # fp16 mixed precision is a GPU feature: request it only when CUDA is present so
    # this cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

In [None]:
# ✅ STEP 9: Combined Response Generation System
class MentalHealthChatbot:
    """Combine an emotion classifier with a causal language model to answer a user.

    The classifier labels the user's message; that label is written into the prompt
    which the language model then completes as the therapist's turn.
    """

    def __init__(self, bert_model, bert_tokenizer, gpt_model, gpt_tokenizer):
        """Keep the models and tokenizers, move both models to the GPU when one is
        available (CPU otherwise), and put them in evaluation mode."""
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.gpt_model = gpt_model
        self.gpt_tokenizer = gpt_tokenizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move models to correct device
        self.bert_model.to(self.device)
        self.gpt_model.to(self.device)

        # Set models to evaluation mode
        self.bert_model.eval()
        self.gpt_model.eval()

    def analyze_emotion(self, text):
        """Classify `text` with the emotion model.

        Returns:
            (emotion, confidence): the label of the highest-scoring class (looked up in
            the model config's `id2label`, "neutral" if the id is missing) and the
            softmax probability of that class.
        """
        inputs = self.bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.bert_model(**inputs)
            emotion_scores = outputs.logits[0]  # single input -> first row of the batch
            emotion_id = torch.argmax(emotion_scores).item()
            emotion = self.bert_model.config.id2label.get(emotion_id, "neutral")
            confidence = torch.softmax(emotion_scores, dim=0)[emotion_id].item()

        return emotion, confidence

    def generate_response(self, user_input, max_length=150):
        """Generate a reply to `user_input`.

        Args:
            user_input: the user's message.
            max_length: maximum number of tokens to generate after the prompt.

        Returns:
            dict with "response" (generated text), "emotion_detected" and "confidence"
            (both from `analyze_emotion`).
        """
        # First, analyze emotion with BERT
        emotion, confidence = self.analyze_emotion(user_input)

        # Construct enhanced prompt with emotion context
        enhanced_prompt = f"Client (feeling {emotion}): {user_input}\nTherapist:"

        # Generate response with GPT-2
        inputs = self.gpt_tokenizer(enhanced_prompt, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        output_sequences = self.gpt_model.generate(
            **inputs,
            # Budget counts generated tokens only, independent of the prompt length.
            max_new_tokens=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.gpt_tokenizer.eos_token_id
        )

        # The output starts with the prompt tokens. Decode only what follows them:
        # searching the decoded text for "Therapist:" would cut at the wrong place
        # whenever the user's own message contains that marker.
        generated_tokens = output_sequences[0][prompt_length:]
        therapist_response = self.gpt_tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        ).strip()

        response_data = {
            "response": therapist_response,
            "emotion_detected": emotion,
            "confidence": confidence
        }

        return response_data

# Example usage (assuming models are loaded):
# chatbot = MentalHealthChatbot(bert_model, bert_tokenizer, gpt_model, gpt_tokenizer)
# response = chatbot.generate_response("I've been feeling really down lately and nothing seems to help.")
# print(f"Detected emotion: {response['emotion_detected']} (confidence: {response['confidence']:.2f})")
# print(f"Response: {response['response']}")

In [None]:
def load_chatbot_models(bert_dir="./emotion-analysis-final",
                        gpt_dir="./mental-health-chatbot-final"):
    """Load the saved models and tokenizers and wrap them in a MentalHealthChatbot.

    Args:
        bert_dir: directory holding the saved emotion classifier and its tokenizer.
        gpt_dir: directory holding the saved causal language model and its tokenizer.
        The defaults are the directories named in the notebook's (commented-out)
        `save_pretrained` calls.

    Returns:
        A MentalHealthChatbot built from the four loaded components.
    """
    # Load BERT components
    bert_model = AutoModelForSequenceClassification.from_pretrained(bert_dir)
    bert_tokenizer = BertTokenizer.from_pretrained(bert_dir)

    # Load GPT-2 components
    gpt_model = AutoModelForCausalLM.from_pretrained(gpt_dir)
    gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_dir)

    # Create chatbot instance
    return MentalHealthChatbot(bert_model, bert_tokenizer, gpt_model, gpt_tokenizer)

In [None]:
import torch
import math

def calculate_perplexity(model, tokenizer, text):
    """Return exp(loss) of `model` on `text`, using the input ids as their own labels.

    Args:
        model: callable accepting the tokenizer's outputs plus `labels` and returning
            an object with a scalar `.loss` tensor.
        tokenizer: callable turning `text` into a mapping of PyTorch tensors that
            contains "input_ids".
        text: the string to score.
    """
    # Truncate so an over-long text cannot exceed what the tokenizer allows.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Put the inputs where the model lives; otherwise a model on the GPU is fed CPU
    # tensors and the forward pass fails with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return math.exp(outputs.loss.item())

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(reference, hypothesis):
    """Sentence-level BLEU of `hypothesis` against a single `reference`.

    Both strings are split on whitespace; NLTK's smoothing `method4` is applied.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    smoothing = SmoothingFunction().method4
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing,
    )


In [None]:
!pip install rouge-score
from rouge_score import rouge_scorer

def calculate_rouge(reference, hypothesis):
    """Score `hypothesis` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L (stemming on).

    Returns whatever `RougeScorer.score` returns, keyed by 'rouge1', 'rouge2', 'rougeL'.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference, hypothesis)

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4b3d0100aa7164b4b9eff27cabe082e5a073256319df77748875db64e6ddce23
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from transformers import pipeline

# Name the checkpoint and revision explicitly. These are the values the pipeline falls
# back to (and warns about) when none are given, so results stay the same but no longer
# depend on the library's default.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

def get_sentiment_score(text):
    """Return the pipeline's first prediction for `text`.

    The result is a dict such as {'label': 'POSITIVE', 'score': 0.98}.
    """
    return sentiment_analyzer(text)[0]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from collections import Counter

def distinct_n_gram(responses, n=2):
    """Distinct-n: unique n-grams divided by total n-grams across all `responses`.

    Each response is split on whitespace. Returns 0 when no n-gram can be formed.
    """
    seen = set()
    total = 0
    for response in responses:
        words = response.split()
        # Number of length-n windows in this response (none when n < 1 or too short).
        window_count = max(len(words) - n + 1, 0) if n > 0 else 0
        for start in range(window_count):
            seen.add(tuple(words[start:start + n]))
            total += 1

    unique = len(seen)
    return unique / total if total > 0 else 0


In [None]:
import time

def measure_inference_time(model, tokenizer, prompt):
    """Return the seconds taken to tokenize `prompt` and run `model.generate` on it.

    Args:
        model: object with a `generate(**inputs)` method; if it exposes a `device`
            attribute the inputs are moved there first.
        tokenizer: callable turning `prompt` into a mapping of PyTorch tensors.
        prompt: the text to generate from.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump if the
    # system clock is adjusted while the measurement is running.
    start = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt")

    # Feed the model tensors on its own device (a GPU model cannot take CPU inputs).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    _ = model.generate(**inputs)
    return time.perf_counter() - start


In [None]:
import psutil
import os
import gc
import torch

def get_memory_usage():
    """Report the memory used by this process.

    Returns:
        dict with
        - 'cpu_memory_MB': resident set size of the current process, in MiB;
        - 'gpu_memory_MB': peak CUDA memory allocated since the last reset, in MiB,
          or None when CUDA is not available.

    Side effects: resets the CUDA peak-memory statistics (so the next call reports a
    fresh peak) and runs a garbage-collection pass after the readings are taken.
    """
    process = psutil.Process(os.getpid())
    cpu_mem = process.memory_info().rss / 1024 ** 2  # in MB
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.max_memory_allocated() / 1024 ** 2
        # reset_max_memory_allocated() is deprecated in favour of this call.
        torch.cuda.reset_peak_memory_stats()
    else:
        gpu_mem = None
    gc.collect()
    return {'cpu_memory_MB': cpu_mem, 'gpu_memory_MB': gpu_mem}


In [None]:
from tqdm import tqdm

# Evaluate on the held-out GPT-2 validation split (`val_dataset`), whose rows still carry
# the raw "prompt" and "response" strings. `model` and `tokenizer` are the objects
# created in the second GPT-2 setup cell.
input_prompts = [item["prompt"] for item in val_dataset]
reference_responses = [item["response"] for item in val_dataset]

MAX_NEW_TOKENS = 50  # generation budget per prompt

generated_responses = []   # continuation only: compared against the references
generated_full_texts = []  # prompt + continuation: scored for perplexity

# Generate model responses
model.eval()
for prompt in tqdm(input_prompts, desc="Generating responses"):
    # The training text is "<prompt>\n<response>", so the prompt is ended with the same
    # newline. It is truncated so prompt + new tokens stay within the tokenizer's limit.
    inputs = tokenizer(
        prompt + "\n",
        return_tensors="pt",
        truncation=True,
        max_length=tokenizer.model_max_length - MAX_NEW_TOKENS,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    # The output begins with the prompt tokens; keeping them would count the echoed
    # prompt as part of every generated response in BLEU/ROUGE/distinct-n.
    continuation = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True).strip()
    generated_responses.append(continuation)
    generated_full_texts.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# Initialize accumulators
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
perplexities = []

for ref, gen, full_text in zip(reference_responses, generated_responses, generated_full_texts):
    bleu_scores.append(calculate_bleu(ref, gen))

    rouge = calculate_rouge(ref, gen)
    for key in rouge_scores:
        rouge_scores[key].append(rouge[key].fmeasure)

    # Perplexity is taken over the whole decoded sequence, which is never empty.
    perplexities.append(calculate_perplexity(model, tokenizer, full_text))

# Compute distinct-n
distinct_1 = distinct_n_gram(generated_responses, n=1)
distinct_2 = distinct_n_gram(generated_responses, n=2)

# Print average results
print("\n--- Evaluation Metrics ---")
print(f"Avg BLEU Score: {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"Avg ROUGE-1: {sum(rouge_scores['rouge1'])/len(rouge_scores['rouge1']):.4f}")
print(f"Avg ROUGE-2: {sum(rouge_scores['rouge2'])/len(rouge_scores['rouge2']):.4f}")
print(f"Avg ROUGE-L: {sum(rouge_scores['rougeL'])/len(rouge_scores['rougeL']):.4f}")
print(f"Avg Perplexity: {sum(perplexities)/len(perplexities):.4f}")
print(f"Distinct-1: {distinct_1:.4f}")
print(f"Distinct-2: {distinct_2:.4f}")


NameError: name 'response_data' is not defined