In [1]:
import multiprocessing
import os
import random

import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Python's ``random``, NumPy and PyTorch are all seeded with ``seed``.
    When CUDA is available, the CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking turned off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [3]:
# Use the "spawn" start method for any processes created through
# `multiprocessing`; force=True replaces a start method that was already set,
# which keeps this cell safe to re-run.
# NOTE(review): the TrainingArguments in this notebook use
# dataloader_num_workers=0, so this presumably only matters if worker
# processes are enabled later — confirm it is still needed.
multiprocessing.set_start_method("spawn", force=True)

In [4]:
# Authenticate with Weights & Biases without embedding the secret in the
# notebook. The API key is read from the WANDB_API_KEY environment variable
# (e.g. populated from the platform's secret store before this cell runs).
# When the variable is unset, `key=None` lets wandb fall back to its own
# credential lookup (existing netrc entry or an interactive prompt).
# NOTE: the key that used to be hard-coded on this line has been exposed with
# the notebook and should be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchallesvu[0m ([33mchallesvu-ton-duc-thang-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

In [6]:
# Load the training and validation splits from the Kaggle input directory.
train_data, val_data = (
    load_data(csv_path)
    for csv_path in (
        '/kaggle/input/belong-irf/train.csv',
        '/kaggle/input/belong-irf/val.csv',
    )
)

In [7]:
class sDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str`` before tokenization).
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keywords.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        raw_text, target = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            str(raw_text),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch dimension; squeeze it away so
        # that the DataLoader can stack individual samples itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

In [8]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint; the
# classifier gets a two-class head.
# NOTE(review): the run name and export directory used elsewhere in this
# notebook say "mental-bert", but the checkpoint loaded here is plain
# bert-base-uncased — confirm which one is intended.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Prefer the GPU when one is visible, otherwise stay on the CPU, then move the
# model's parameters to that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
def encode_labels(labels):
    """Normalize raw labels to the strings ``"Yes"`` / ``"No"``.

    Only values equal to ``"Yes"`` map to ``"Yes"``; every other value
    (including differently-cased spellings or missing values) maps to
    ``"No"``. Returns a new list with one entry per input label.
    """
    normalized = []
    for label in labels:
        normalized.append("Yes" if label == "Yes" else "No")
    return normalized

# Pull the raw posts and the normalized "Yes"/"No" labels out of each split.
train_texts, val_texts = (
    frame['post'].tolist() for frame in (train_data, val_data)
)
train_labels, val_labels = (
    encode_labels(frame['is_belong'].tolist()) for frame in (train_data, val_data)
)

In [11]:
# Class-name -> class-index mapping used for every split.
label_map = {"No": 0, "Yes": 1}

# Convert the string labels to integer class ids.
train_indices = list(map(label_map.__getitem__, train_labels))
val_indices = list(map(label_map.__getitem__, val_labels))

# Wrap each split in a dataset that tokenizes lazily, padding/truncating to
# 512 tokens.
train_dataset = sDataset(
    texts=train_texts, labels=train_indices, tokenizer=tokenizer, max_len=512
)
val_dataset = sDataset(
    texts=val_texts, labels=val_indices, tokenizer=tokenizer, max_len=512
)

In [12]:
training_args = TrainingArguments(
    # Where artifacts go and how the run is labelled.
    run_name="mental-bert-finetune",
    output_dir="./results",
    logging_dir="./logs",
    # Optimization hyper-parameters.
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    # and restoring the one with the best validation accuracy when training
    # finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging / runtime behaviour.
    logging_steps=10,
    disable_tqdm=False,
    dataloader_num_workers=0,
)



In [13]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        the last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    acc = accuracy_score(y_true, y_pred)

    metrics = {'accuracy': acc}
    metrics.update({'f1': f1, 'precision': precision, 'recall': recall})
    return metrics

In [14]:
# Stop training once the monitored metric has failed to improve for three
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [15]:
# Fine-tune the model. Per `training_args`, evaluation and checkpointing run
# once per epoch, and the early-stopping callback registered on the trainer
# (patience 3) may end the run before all 20 epochs complete.
trainer.train()

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241226_172358-ur21xbng[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmental-bert-finetune[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/challesvu-ton-duc-thang-university/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/challesvu-ton-duc-thang-university/huggingface/runs/ur21xbng[0m


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5506,0.509787,0.770791,0.809444,0.764331,0.860215
2,0.386,0.425243,0.807302,0.837607,0.800654,0.878136
3,0.248,0.457206,0.809331,0.840136,0.799353,0.885305
4,0.1506,0.575464,0.809331,0.844371,0.784615,0.913978
5,0.1504,0.736754,0.807302,0.840336,0.791139,0.896057
6,0.0376,0.789108,0.813387,0.841379,0.810631,0.874552
7,0.0125,0.957282,0.807302,0.83816,0.798701,0.88172
8,0.0045,1.081335,0.805274,0.826087,0.835165,0.817204
9,0.015,1.11565,0.813387,0.845118,0.796825,0.899642




TrainOutput(global_step=558, training_loss=0.17882471565677915, metrics={'train_runtime': 1127.4234, 'train_samples_per_second': 34.982, 'train_steps_per_second': 1.1, 'total_flos': 4669695010529280.0, 'train_loss': 0.17882471565677915, 'epoch': 9.0})

In [16]:
# Export the model weights/config and the tokenizer files side by side so the
# directory can be reloaded with `from_pretrained`.
export_dir = "mental-bert-finetune-irf"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('mental-bert-finetune-irf/tokenizer_config.json',
 'mental-bert-finetune-irf/special_tokens_map.json',
 'mental-bert-finetune-irf/vocab.txt',
 'mental-bert-finetune-irf/added_tokens.json')

## Evaluate

In [17]:
# Load the held-out test split from the same Kaggle dataset directory as the
# train/val CSVs.
test_data = load_data('/kaggle/input/belong-irf/test.csv')

In [18]:
# Raw posts and normalized "Yes"/"No" labels for the test split.
test_texts, test_labels = (
    test_data['post'].tolist(),
    encode_labels(test_data['is_belong'].tolist()),
)

In [19]:
# Map the string labels to class ids and wrap the test split in the same
# lazily-tokenizing dataset used for training (512-token sequences).
test_indices = list(map(label_map.__getitem__, test_labels))
test_dataset = sDataset(
    texts=test_texts, labels=test_indices, tokenizer=tokenizer, max_len=512
)

In [20]:
# Batch the test set 16 samples at a time; shuffle=False keeps the samples in
# dataset order.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [21]:
# Score the fine-tuned model on the held-out test set.
# Put the model in evaluation mode (turns off its dropout layers).
model.eval()

all_preds = []
all_labels = []

# Gradients are not needed for inference; skipping them saves memory and time.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Labels are only used for scoring on the host, so they are left where
        # the DataLoader produced them instead of being copied to the
        # accelerator and straight back.
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        # Accumulate plain Python ints rather than NumPy scalars.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8231
Precision: 0.8169
Recall: 0.8769
F1 Score: 0.8458


In [22]:
# Run the fine-tuned classifier on a single free-text example.
texts = ["All my life i've been going through shit (only 17 years old) and when things started to get better i crashed. I can't get myself to get out of bed no matter how much i try, my family understands but do still not approve since my grades dropped from all A's to E-C. It has been like this for 1-2 years now and none of my friends understands how It's like, I can't really blame them either since I don't like talking about it and i've always been taught to be a man and keep this stuff to myself. They just see a lazy fuck who is too irresponsible to go too school, same with my teachers. Idk if typing here is going to help at all but if anyone has some tips/advice on how to get motivated again i would be super happy."]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
# Move every tokenizer output tensor to the same device as the model.
inputs = {key: val.to(device) for key, val in inputs.items()}

# Switch to evaluation mode here rather than relying on another cell having
# done so: if this cell runs right after training, dropout would otherwise
# still be active and the prediction could vary between runs.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    # Predicted class id per input text (0 = "No", 1 = "Yes" per label_map).
    predictions = torch.argmax(outputs.logits, dim=-1)

print("Predictions:", predictions.cpu().numpy())

Predictions: [1]
