In [None]:
import pandas as pd
import numpy as np

# Load and Preprocess the Dataset

In [None]:
# Read the dataset from a CSV file in the working directory and preview its first rows.
df = pd.read_csv("Depression_Dataset.csv")
df.head()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# List the distinct values present in the `label` column.
df['label'].unique()

In [None]:
# Show how many rows carry each label value (class-balance check).
df['label'].value_counts()

# Load Tokenizer from Hugging Face

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the same checkpoint that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased')

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Relies on the notebook-level ``tokenizer`` and returns whatever the
    tokenizer call produces.
    """
    raw_texts = examples["text"]
    return tokenizer(raw_texts, truncation=True)

# Train / Validation / Test Split

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the DataFrame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# First cut: hold out 30% of the rows as the test split.
train_valid_test = dataset.train_test_split(test_size=0.3, seed=42)
train_valid, test_dataset = train_valid_test['train'], train_valid_test['test']

# Second cut: 20% of the remaining rows become the validation split.
train_valid_split = train_valid.train_test_split(test_size=0.2, seed=42)
train_dataset, valid_dataset = train_valid_split['train'], train_valid_split['test']

# Bundle the three splits under the names used by the rest of the notebook.
dataset_dict = DatasetDict(
    {'train': train_dataset, 'validation': valid_dataset, 'test': test_dataset}
)

# Tokenize every split, feeding the preprocessing function batches of rows.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)


# Setup Data Collator

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load Pretrained Model for Sequence Classification

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load the pretrained checkpoint with a 2-label sequence-classification head.
# NOTE(review): the Trainer in this notebook is constructed with `model_init=` rather than
# `model=`, so this particular instance is not the one the Trainer fine-tunes.
model = AutoModelForSequenceClassification.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased", num_labels=2)

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Load the four metric objects that compute_metrics relies on, in a fixed order.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
        each taken from the matching notebook-level metric object.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = logits.argmax(axis=-1)

    scorers = (
        ("accuracy", accuracy_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    return {
        metric_name: scorer.compute(predictions=predicted_classes, references=labels)[metric_name]
        for metric_name, scorer in scorers
    }


In [None]:
def model_init():
    """Build a new classifier from the pretrained checkpoint.

    Takes no arguments and returns a freshly loaded sequence-classification
    model with two output labels.
    """
    from transformers import AutoModelForSequenceClassification

    checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"
    fresh_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return fresh_model

# Define Training Arguments & Initialize Trainer

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    seed=42,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging, evaluation and checkpoint cadence.
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)

# The model is supplied through `model_init`, so the Trainer builds it itself.
# NOTE(review): newer transformers releases reportedly prefer `processing_class=`
# over `tokenizer=` - confirm against the installed version before changing.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train Model

In [None]:
# Run fine-tuning, then write the resulting model to ./final_model.
trainer.train()
trainer.save_model("./final_model")

# Evaluate Model

In [None]:
# Evaluate without passing a dataset; the Trainer was constructed with the
# validation split as its `eval_dataset`.
val_metrics = trainer.evaluate()
val_metrics

In [None]:
# Score the held-out test split. A distinct metric prefix keeps these numbers
# from being recorded under the same `eval_*` keys as the validation runs, so
# anything that later reads the Trainer's log history cannot mistake them for
# validation results.
test_metrics = trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")
test_metrics

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import numpy as np
import matplotlib.pyplot as plt

# Display names for the two classes, in the order passed to the plot and the report.
class_names = ["Neg", "Pos"]

# Run the trained model over the test split.
test_output = trainer.predict(tokenized_dataset['test'])
y_true = test_output.label_ids
# Reduce the raw predictions to one class index per example.
y_pred = np.argmax(test_output.predictions, axis=1)

# Confusion matrix rendered as a heat map.
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# Per-class precision / recall / F1, printed to four decimal places.
report = classification_report(y_true, y_pred, target_names=class_names, digits=4)
print(report)


# Model Loss & Accuracy

In [None]:
import matplotlib.pyplot as plt

# Split the Trainer's log history into training-loss points and validation points.
train_loss, train_steps = [], []
eval_loss, eval_steps = [], []
# Accuracy gets its own step list so x and y always have matching lengths,
# even if some evaluation record carries no accuracy value.
eval_accuracy, eval_accuracy_steps = [], []
seen_eval_steps = set()

for log in trainer.state.log_history:
    if "eval_loss" in log:
        step = log["step"]
        # Calls to trainer.evaluate(...) made after training are appended to the
        # same history at the final step. Keep only the first evaluation record
        # per step so those later calls are not drawn as extra validation points.
        if step in seen_eval_steps:
            continue
        seen_eval_steps.add(step)
        eval_loss.append(log["eval_loss"])
        eval_steps.append(step)
        if "eval_accuracy" in log:
            eval_accuracy.append(log["eval_accuracy"])
            eval_accuracy_steps.append(step)
    elif "loss" in log:
        train_loss.append(log["loss"])
        train_steps.append(log["step"])

# Training vs. validation loss over optimisation steps.
plt.figure(figsize=(10, 4))
plt.plot(train_steps, train_loss, label="Train Loss")
plt.plot(eval_steps, eval_loss, label="Validation Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid(True)
plt.show()

# Validation accuracy, drawn only when at least one value was logged.
if eval_accuracy:
    plt.figure(figsize=(10, 4))
    plt.plot(eval_accuracy_steps, eval_accuracy, marker="o", label="Validation Accuracy", color="green")
    plt.xlabel("Steps")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(True)
    plt.show()


# Testing Model Generalization

In [None]:
import torch

# Hand-written sentences used as a quick sanity check of the classifier.
texts = [
    "ไม่อยากตื่นเจอวันพรุ่งนี้",
    "ฉันกำลังรักษาโรคซึมเศร้า",
    "วันนี้รู้สึกดีมาก มีความสุข",
    "ไม่มีแรงทำอะไรเลย เหนื่อยจนทนไม่ไหว",
    "ชีวิตนี้ไม่มีความหมาย",
    "เราหวังจะมีเจ้าชายออกมาจากตะเกียงหรอ",
    "หลังจากได้รับการรักษามาสักระยะ",
]

# The Trainer was built with `model_init`, so the fine-tuned weights live on
# `trainer.model`; the notebook-level `model` is a separate instance that was
# never trained and must not be used for predictions.
finetuned_model = trainer.model
finetuned_model.eval()

# Inputs have to sit on the same device as the model's parameters.
device = next(finetuned_model.parameters()).device

inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = finetuned_model(**inputs)
    # Bring the probabilities back to the CPU for plain-Python reporting.
    probs = torch.softmax(outputs.logits, dim=1).cpu()
    preds = torch.argmax(probs, dim=1)

for i, text in enumerate(texts):
    label = preds[i].item()
    # Confidence is the probability assigned to the predicted class.
    confidence = probs[i][label].item()
    result = "เป็นโรคซึมเศร้า" if label == 1 else "ไม่เป็นโรคซึมเศร้า"
    print(f"'{text}' → {result} (ความมั่นใจ = {confidence:.2f})")
