In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Expose only GPU index 0 to CUDA-aware libraries via the environment.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: show the GPU(s) visible to this session and their current usage.
!nvidia-smi


In [None]:
# Shell escape: list the contents of the current working directory.
!ls


In [None]:
!pip install -q \
  transformers \
  datasets \
  peft \
  bitsandbytes \
  accelerate \
  pandas

In [None]:
import pandas as pd
from datasets import Dataset


In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. When an HF_TOKEN environment variable
# is set, use it so a non-interactive "Run All" does not depend on the prompt;
# when it is unset (or empty) this is the same as the bare `login()` call and
# falls back to the interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)


In [None]:
# Load the raw CounselChat table from the attached Kaggle dataset.
counsel_df = pd.read_csv("/kaggle/input/empathyy/counselchat.csv")


In [None]:
# Preview the first rows to confirm the column layout.
counsel_df.head()


In [None]:
# Load the raw Empathetic Dialogues table (Parquet) from the same Kaggle dataset.
emp_df = pd.read_parquet("/kaggle/input/empathyy/emp_dialogues.parquet")


In [None]:
# Preview the first rows to confirm the column layout.
emp_df.head()


In [None]:
# Report the raw row count of each source before any cleaning.
print("CounselChat rows:", len(counsel_df))
print("Empathetic Dialogues rows:", len(emp_df))


In [None]:
# Keep only the question/answer columns, drop rows missing either one, and
# rename them to the shared "user"/"assistant" schema used downstream.
counsel_clean = (
    counsel_df[["questionText", "answerText"]]
    .dropna()
    .rename(columns={
        "questionText": "user",
        "answerText": "assistant"
    })
)

# Trim whitespace
counsel_clean["user"] = counsel_clean["user"].str.strip()
counsel_clean["assistant"] = counsel_clean["assistant"].str.strip()

# dropna() only removes NaN: a whitespace-only cell survives it and becomes ""
# after stripping. Drop those rows so no empty turn reaches training.
has_text = (counsel_clean["user"].str.len() > 0) & (counsel_clean["assistant"].str.len() > 0)
counsel_clean = counsel_clean[has_text]

print(counsel_clean.head())
print("CounselChat cleaned rows:", len(counsel_clean))


In [None]:
import json

def extract_pairs(row):
    """Extract adjacent (user, assistant) turn pairs from one dialogue row.

    Args:
        row: Mapping-like record providing "conversations" (a sequence of
            {"role", "content"} turns, or a JSON string encoding one),
            "emotion" and "situation".

    Returns:
        A list of dicts with keys "user", "assistant", "emotion" and
        "situation" -- one per user turn that is immediately followed by an
        assistant turn, where both stripped texts are longer than 5
        characters. The list is empty when the row has no usable conversation.
    """
    convo = row["conversations"]

    # Ensure it's parsed JSON
    if isinstance(convo, str):
        convo = json.loads(convo)

    # A missing conversation (None, or a NaN float) has no length; treat it as
    # "no pairs" instead of letting len() raise. Deliberately not `if not convo`,
    # which is ambiguous for array-like values.
    if convo is None or not hasattr(convo, "__len__"):
        return []

    pairs = []
    for i in range(len(convo) - 1):
        first, second = convo[i], convo[i + 1]
        if first.get("role") != "user" or second.get("role") != "assistant":
            continue

        user_raw = first.get("content")
        assistant_raw = second.get("content")
        # Skip turns with missing / non-text content rather than failing on .strip().
        if not isinstance(user_raw, str) or not isinstance(assistant_raw, str):
            continue

        user_text = user_raw.strip()
        assistant_text = assistant_raw.strip()

        # Very short turns carry no usable signal; require more than 5 chars each.
        if len(user_text) > 5 and len(assistant_text) > 5:
            pairs.append({
                "user": user_text,
                "assistant": assistant_text,
                "emotion": row["emotion"],
                "situation": row["situation"]
            })
    return pairs


In [None]:
# Flatten the per-row pair lists into one list of records, then build the frame.
emp_pairs = [
    pair
    for _, dialogue_row in emp_df.iterrows()
    for pair in extract_pairs(dialogue_row)
]

emp_clean = pd.DataFrame(emp_pairs)

print(emp_clean.head())
print("Extracted Empathetic Dialogue pairs:", len(emp_clean))


In [None]:
def trim_text(df, max_user=300, max_assistant=400):
    """Return a copy of ``df`` with its text columns cut to a character budget.

    Args:
        df: DataFrame with string columns "user" and "assistant".
        max_user: Maximum number of characters kept in "user".
        max_assistant: Maximum number of characters kept in "assistant".

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    limits = {"user": max_user, "assistant": max_assistant}
    trimmed = df.copy()
    for column, limit in limits.items():
        trimmed[column] = trimmed[column].str[:limit]
    return trimmed

# Apply the default character limits (300 for user, 400 for assistant) to both sources.
counsel_clean = trim_text(counsel_clean)
emp_clean = trim_text(emp_clean)


In [None]:
# Distribution of emotion labels across all extracted pairs.
emp_clean["emotion"].value_counts()


In [None]:
def add_emotion_context(row):
    """Prefix a row's user text with a sentence stating its emotion label.

    Args:
        row: Mapping-like record with "emotion" and "user" entries.

    Returns:
        "The user feels <emotion>. <user>". The user text is returned unchanged
        when the emotion is missing/blank (so no "The user feels nan." is
        produced) or when the text already starts with this exact prefix (so
        applying the function twice does not stack prefixes).
    """
    emotion = row["emotion"]
    user_text = row["user"]

    # No usable label: leave the text as-is rather than injecting "nan"/"None".
    if pd.isna(emotion) or not str(emotion).strip():
        return user_text

    prefix = f"The user feels {emotion}. "
    # Already prefixed (e.g. the cell was re-run): keep the result stable.
    if isinstance(user_text, str) and user_text.startswith(prefix):
        return user_text

    return f"{prefix}{user_text}"

# Overwrite the "user" column with the emotion-prefixed text, computed row by row.
emp_clean["user"] = emp_clean.apply(add_emotion_context, axis=1)


In [None]:
# Target number of Empathetic Dialogues pairs to mix into the training set.
EMP_SAMPLE_SIZE = 10000

# Sampling without replacement raises ValueError when n exceeds the number of
# rows, so cap the request at what is actually available.
emp_sampled = emp_clean.sample(
    n=min(EMP_SAMPLE_SIZE, len(emp_clean)),
    random_state=42
)

print("CounselChat:", len(counsel_clean))
print("EmpDialogues sampled:", len(emp_sampled))


In [None]:
# Stack the CounselChat pairs and the sampled Empathetic Dialogues pairs into
# one frame, keeping only the two shared columns and renumbering the index.
combined_df = pd.concat(
    [counsel_clean[["user", "assistant"]],
     emp_sampled[["user", "assistant"]]],
    ignore_index=True
)

print("Total combined rows:", len(combined_df))
# Show five random rows as a spot check of the merged data.
combined_df.sample(5)


In [None]:
# Distribution of emotion labels within the sampled subset.
emp_sampled["emotion"].value_counts()


In [None]:
def to_chat_format(row):
    """Wrap one (user, assistant) row as a two-turn chat record.

    Args:
        row: Mapping-like record with "user" and "assistant" text entries.

    Returns:
        {"messages": [...]} holding a user turn followed by an assistant turn,
        each shaped as {"role": ..., "content": ...}.
    """
    turns = [
        {"role": speaker, "content": row[speaker]}
        for speaker in ("user", "assistant")
    ]
    return {"messages": turns}

# Convert every combined row into a chat record and collect them in a plain list.
chat_data = combined_df.apply(to_chat_format, axis=1).tolist()

# Spot-check the first record and report how many were produced.
print(chat_data[0])
print("Total chat samples:", len(chat_data))


In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from the list of chat records.
hf_dataset = Dataset.from_list(chat_data)
# Bare expression: display the dataset summary as the cell output.
hf_dataset


In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model


In [None]:
from transformers import AutoTokenizer
import numpy as np

# Tokenizer used for the length statistics in this cell; a later cell rebinds
# `tokenizer` by loading the same checkpoint name through `model_id`.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

def count_tokens(example):
    """Count the tokens in the concatenated message contents of one example.

    The contents are joined with no separator and without role labels, then
    encoded with the module-level ``tokenizer``.

    Args:
        example: Record with a "messages" list of {"role", "content"} dicts.

    Returns:
        {"token_count": <number of input ids produced by the tokenizer>}.
    """
    joined = "".join(turn["content"] for turn in example["messages"])
    encoded = tokenizer(joined)
    return {"token_count": len(encoded["input_ids"])}

# Add a token_count column to every example, then summarise it to judge a
# suitable maximum sequence length.
token_counts = hf_dataset.map(count_tokens)
tokens = token_counts["token_count"]

print("Max tokens:", np.max(tokens))
print("Mean tokens:", int(np.mean(tokens)))


In [None]:
model_id = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the tokenizer ships without a pad token. Aliasing
# pad to EOS unconditionally makes the language-modeling collator (which
# ignores label positions equal to pad_token_id) also ignore real EOS tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Quantization options are passed through BitsAndBytesConfig; the bare
    # `load_in_4bit=True` keyword is the deprecated spelling of the same request.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map={"": 0},          # place the whole model on GPU 0
    torch_dtype=torch.float16,
    # The attention backend is selected at load time; assigning
    # `model.config.attn_implementation` after loading does not switch it.
    attn_implementation="eager",
)

# Required for training: the generation KV cache is not used in the training loop.
model.config.use_cache = False




In [None]:
from peft import prepare_model_for_kbit_training

# The base model is loaded in 4-bit, so run PEFT's preparation step for k-bit
# training before attaching adapters (it readies the quantized model for
# gradient-based fine-tuning).
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    r=16,                                  # adapter rank
    lora_alpha=32,                         # scaling factor (alpha / r = 2)
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
# Report how many parameters are trainable versus frozen.
model.print_trainable_parameters()


In [None]:
def flatten_chat(example):
    """Render a chat record as plain "User:" / "Assistant:" lines.

    Args:
        example: Record with a "messages" list of {"role", "content"} dicts.

    Returns:
        {"text": ...} where each message becomes one newline-terminated line.
        Messages with role "user" are labelled "User"; every other role is
        labelled "Assistant".
    """
    rendered = []
    for turn in example["messages"]:
        speaker = "User" if turn["role"] == "user" else "Assistant"
        rendered.append(f"{speaker}: {turn['content']}\n")
    return {"text": "".join(rendered)}


In [None]:
# Replace the structured "messages" column with a single flattened "text"
# column (all original columns are dropped after mapping).
train_dataset = hf_dataset.map(
    flatten_chat,
    remove_columns=hf_dataset.column_names
)


In [None]:
def tokenize_fn(example):
    """Tokenize the "text" field with the module-level ``tokenizer``.

    Sequences are truncated to 512 tokens and left unpadded (padding is
    deferred to the data collator at batch time).

    Args:
        example: Record (or batch of records) exposing a "text" entry.

    Returns:
        The tokenizer's encoding for ``example["text"]``.
    """
    encode_options = dict(truncation=True, max_length=512, padding=False)
    return tokenizer(example["text"], **encode_options)

# Tokenize in batches and drop the raw "text" column so only the tokenizer's
# output columns remain.
tokenized_dataset = train_dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)


In [None]:
# Batch collator for language modeling; mlm=False selects the causal
# (next-token) objective instead of masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [None]:
# Training hyperparameters. Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 8 = 16.
training_args = TrainingArguments(
    output_dir="./gemma-empathy",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=2,
    logging_steps=50,
    save_steps=200,          # checkpoint interval in steps (chosen as the safer setting)
    save_total_limit=3,
    report_to="none",
    optim="adamw_torch",
)


In [None]:
# Wire the LoRA-wrapped model, the tokenized dataset and the collator into a
# Trainer. No evaluation dataset is passed, so only training loss is available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)


In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()


In [None]:
# Persist the fine-tuned (PEFT-wrapped) model and the tokenizer side by side
# in the Kaggle working directory.
model.save_pretrained("/kaggle/working/gemma-empathy-lora")
tokenizer.save_pretrained("/kaggle/working/gemma-empathy-lora")


In [None]:
from transformers import pipeline

# Switch from training to inference settings before sampling:
# - eval() turns off dropout (the LoRA adapters were configured with
#   lora_dropout=0.05), so it does not perturb generation;
# - use_cache was disabled for training; re-enable it so generation can reuse
#   the key/value cache between decoding steps.
model.eval()
model.config.use_cache = True

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)

# Same "User:/Assistant:" layout that flatten_chat produced for training.
prompt = "User: I feel overwhelmed and anxious lately.\nAssistant:"
print(pipe(prompt)[0]["generated_text"])


In [None]:
# Export the merged user/assistant pairs (without the index column) for reuse.
combined_df.to_csv("/kaggle/working/combined_empathy_dataset.csv", index=False)


In [None]:
# Export the chat-format dataset as CSV as well.
# NOTE(review): the "messages" column holds nested lists of dicts, which CSV
# presumably stores as stringified text -- confirm it reloads as intended, or
# consider a JSON-based export instead.
hf_dataset.to_csv("/kaggle/working/combined_hf_dataset.csv", index=False)
