In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import ElectraTokenizerFast,Trainer,TrainingArguments
from transformers import ElectraForSequenceClassification,Pipeline
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
import torch
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [2]:
RANDOM_SEED = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions)
    precision = precision_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}

In [None]:
PATH = 'google/electra-small-discriminator'
tokenizer = ElectraTokenizerFast.from_pretrained(PATH)

In [5]:
class Suicide_Dataset(Dataset):
    def __init__(self,tokenizer:PreTrainedTokenizerBase,train_data,train_label):
        self.data = train_data.to_numpy()
        self.label = torch.tensor(train_label.to_numpy())
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,id):
        sentence = self.data[id]
        target = self.label[id]
        encoding = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return {
            'input_ids': encoding['input_ids'].squeeze(0), # Squeeze to remove the extra dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target
        }

In [None]:
df1 = pd.read_csv("../../../data/Suicide_Detection.csv")
df1["class"] = df1["class"].apply(lambda x: 1 if x == "suicide" else 0)
label = df1["class"]
data = df1["text"]
train_data,test_data,train_label,test_label = train_test_split(data,label,test_size=0.2,random_state=RANDOM_SEED)
train_dataset = Suicide_Dataset(tokenizer,train_data,train_label)
test_dataset = Suicide_Dataset(tokenizer,test_data,test_label)

In [7]:
len(train_dataset)

185659

In [8]:
discriminator = ElectraForSequenceClassification.from_pretrained(PATH)
discriminator.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [9]:
for name, param in discriminator.named_parameters():
    print(f"Layer: {name} | Requires Grad: {param.requires_grad}")

Layer: electra.embeddings.word_embeddings.weight | Requires Grad: True
Layer: electra.embeddings.position_embeddings.weight | Requires Grad: True
Layer: electra.embeddings.token_type_embeddings.weight | Requires Grad: True
Layer: electra.embeddings.LayerNorm.weight | Requires Grad: True
Layer: electra.embeddings.LayerNorm.bias | Requires Grad: True
Layer: electra.embeddings_project.weight | Requires Grad: True
Layer: electra.embeddings_project.bias | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.query.weight | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.query.bias | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.key.weight | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.key.bias | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.value.weight | Requires Grad: True
Layer: electra.encoder.layer.0.attention.self.value.bias | Requires Grad: True
Layer: electra.encoder.layer.0.attention.output.d

In [10]:
training_args = TrainingArguments(
    output_dir="./finetuned-electra",
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    overwrite_output_dir=True
)

trainer = Trainer(
    model=discriminator,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train(resume_from_checkpoint=True)

In [11]:
trainer.evaluate()

  0%|          | 0/2901 [00:00<?, ?it/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msylvis[0m ([33msylvis-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 0.12574569880962372,
 'eval_accuracy': 0.9762576753204782,
 'eval_recall': 0.9797215496368039,
 'eval_precision': 0.9728232869654817,
 'eval_f1': 0.976260232658337,
 'eval_runtime': 222.8289,
 'eval_samples_per_second': 208.299,
 'eval_steps_per_second': 13.019}

In [None]:
trainer.save_model("./finetuned-electra")
tokenizer.save_pretrained("./finetuned-electra")