In [1]:
pip uninstall -y tensorflow

Found existing installation: tensorflow 2.16.1
Uninstalling tensorflow-2.16.1:
  Successfully uninstalled tensorflow-2.16.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow-cpu

Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-cpu)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow-cpu)
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow-cpu)
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading tensorflow_cpu-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (230.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.0/230.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading keras-3.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading ml_dtypes-0.4.1-cp310-

In [3]:
import numpy as np
import pandas as pd
import random
import torch
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizer,BertConfig
from transformers import BertForMaskedLM

In [4]:
# Load the raw CSV; later cells read its 'text' and 'class' columns.
DATA_PATH = "/kaggle/input/new-suicide/Suicide_Detection.csv"
df = pd.read_csv(DATA_PATH)

# Pretrain Bert_API 

* Prepare dataset

In [13]:
# Balanced subset for MLM pretraining: rows 20000-29999 (by position) of each
# class, with the 'suicide' rows placed before the 'non-suicide' rows.
suicide_rows = df[df['class'] == 'suicide'].iloc[20000:30000]
non_suicide_rows = df[df['class'] == 'non-suicide'].iloc[20000:30000]
dataset = pd.concat([suicide_rows, non_suicide_rows], axis=0)

In [14]:
# mlm task dataset
def prepare_data_for_mlm(df, tokenizer, max_length= 300):
  """Tokenize the ``text`` column of ``df`` for masked-language-model training.

  Args:
    df: DataFrame with a ``text`` column.
    tokenizer: Callable applied to the list of texts; it receives
      ``max_length``, ``padding='max_length'`` and ``truncation=True``.
    max_length: Length every sequence is padded/truncated to.

  Returns:
    Whatever ``tokenizer`` returns for the batch of texts.
  """
  return tokenizer(
      df['text'].tolist(),
      max_length=max_length,
      padding='max_length',
      truncation=True,
  )

# Load the tokenizer stored with the base checkpoint, then encode the
# pretraining subset to fixed-length sequences.
tokenizer = BertTokenizer.from_pretrained(
    '/kaggle/input/pretrained-v3'
)
train_inputs = prepare_data_for_mlm(df=dataset, tokenizer=tokenizer)

# Create a PyTorch Dataset for MLM
class MLMDataset(torch.utils.data.Dataset):
    """Map-style dataset over already-tokenized encodings.

    ``encodings`` must support ``.items()`` (field name -> per-example rows)
    and expose the input-id rows as the ``input_ids`` attribute.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of input ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encoding field."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

# Wrap the tokenized pretraining texts in a map-style dataset.
train_dataset = MLMDataset(train_inputs)

# Create a DataCollator for MLM (tokens are selected for masking with probability 0.15)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

# Create a DataLoader for MLM.
# shuffle=True: `dataset` is the concatenation of all 'suicide' rows followed by
# all 'non-suicide' rows, so without shuffling the batches would be served in
# that class order on every epoch.
train_dataloader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, collate_fn=data_collator
)

* Load the model for pretraining

In [15]:
# BERT with a masked-language-modeling head. The stored config is overridden
# so that the encoder is built with a single hidden layer.
config = BertConfig.from_pretrained(
    "/kaggle/input/pretrained-v3",
    num_hidden_layers=1,
)
model = BertForMaskedLM.from_pretrained(
    "/kaggle/input/pretrained-v3",
    config=config,
)

In [16]:
# Display the module tree of the loaded model to inspect its architecture.
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

* training

In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and per-epoch loss history.
epochs = 11
# NOTE(review): lr=0.002 is kept from the original run; it is large for
# updating a pretrained transformer — confirm it is intentional.
optimizer = Adam(model.parameters(), lr=0.002)
store_loss = {'loss MLM': []}

# Training
model.train()
for epoch in range(epochs):
    train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        # Move every tensor of the collated batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Checkpoint after the last epoch. The condition is derived from `epochs`
    # (instead of the literal 10) so that changing `epochs` cannot silently
    # skip the save or trigger it in the middle of training.
    if epoch == epochs - 1:
        model.save_pretrained("suicide-bert-pretrained")
        tokenizer.save_pretrained("suicide-bert-pretrained")

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = train_loss / len(train_dataloader)
    store_loss['loss MLM'].append(avg_train_loss)
    print(f"Epoch {epoch + 1}: Pretrain Loss: {avg_train_loss}")

Training Epoch 1: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 1: Pretrain Loss: 3.517213663734948


Training Epoch 2: 100%|██████████| 313/313 [05:55<00:00,  1.13s/it]


Epoch 2: Pretrain Loss: 3.400194068305409


Training Epoch 3: 100%|██████████| 313/313 [05:54<00:00,  1.13s/it]


Epoch 3: Pretrain Loss: 3.322795727763313


Training Epoch 4: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 4: Pretrain Loss: 3.2341557059425137


Training Epoch 5: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 5: Pretrain Loss: 3.183718991355774


Training Epoch 6: 100%|██████████| 313/313 [05:56<00:00,  1.14s/it]


Epoch 6: Pretrain Loss: 3.1404536791121997


Training Epoch 7: 100%|██████████| 313/313 [05:56<00:00,  1.14s/it]


Epoch 7: Pretrain Loss: 3.102560271851171


Training Epoch 8: 100%|██████████| 313/313 [05:56<00:00,  1.14s/it]


Epoch 8: Pretrain Loss: 3.0554085501466695


Training Epoch 9: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 9: Pretrain Loss: 3.01453208009275


Training Epoch 10: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 10: Pretrain Loss: 2.9875987177839676


Training Epoch 11: 100%|██████████| 313/313 [05:55<00:00,  1.14s/it]


Epoch 11: Pretrain Loss: 2.9521815761590537


* Save the pretrained model

In [18]:
# Write the MLM-pretrained weights and the tokenizer files to one directory.
pretrain_output_dir = "suicide-bert-pretrained-1"
model.save_pretrained(pretrain_output_dir)
tokenizer.save_pretrained(pretrain_output_dir)

('suicide-bert-pretrained-1/tokenizer_config.json',
 'suicide-bert-pretrained-1/special_tokens_map.json',
 'suicide-bert-pretrained-1/vocab.txt',
 'suicide-bert-pretrained-1/added_tokens.json')

# Fine-tune

* prepare data

In [6]:
# Balanced subset: the first 5000 rows (by position) of each class, with the
# 'suicide' rows placed before the 'non-suicide' rows.
# NOTE(review): the cells that follow build `texts`/`labels` from `df`, not
# from this `dataset` — confirm whether this subset was meant to be used.
suicide_rows = df[df['class'] == 'suicide'].iloc[:5000]
non_suicide_rows = df[df['class'] == 'non-suicide'].iloc[:5000]
dataset = pd.concat([suicide_rows, non_suicide_rows], axis=0)

In [5]:
# Raw texts and their string class labels, taken from the full `df`.
texts, labels = df['text'].tolist(), df['class'].tolist()

In [11]:
from sklearn.model_selection import train_test_split

# Load the tokenizer stored with the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("/kaggle/input/pretrained-v3")

# Hold out 40% of the examples for validation. A fixed `random_state` makes
# the split — and therefore the reported validation metrics — reproducible
# across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.4, random_state=42
)

# Encode each split; padding=True pads to the longest sequence of the list
# passed in, and truncation=True cuts sequences that exceed the tokenizer's limit.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [12]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Map-style dataset pairing tokenized encodings with binary labels.

    String labels are converted once at construction time: ``'suicide'``
    becomes 1 and every other value becomes 0.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = [int(label == 'suicide') for label in labels]

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: one tensor per encoding field plus ``labels``."""
        item = {}
        for field, rows in self.encodings.items():
            item[field] = torch.tensor(rows[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap both splits so each item carries its tensors and its 0/1 label.
finetune_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

# Training batches are reshuffled every epoch.
finetune_loader = DataLoader(finetune_dataset, batch_size=64, shuffle=True)
# Validation is served in a fixed order: shuffling adds nothing to evaluation,
# and a stable batch composition keeps the mean of per-batch losses
# deterministic from one evaluation pass to the next.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

* Load the pretrained model for fine-tuning

In [13]:
from transformers import BertForSequenceClassification

# Load the checkpoint with a sequence-classification head: one encoder layer
# and two output labels.
config = BertConfig.from_pretrained(
    "/kaggle/input/pretrained-v3",
    num_hidden_layers=1,
    num_labels=2,
)
model = BertForSequenceClassification.from_pretrained(
    "/kaggle/input/pretrained-v3",
    config=config,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/pretrained-v3 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* training

In [None]:
epochs = 10
optimizer = Adam(model.parameters(), lr=2e-3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Best validation accuracy seen so far. The weights that achieve it are saved
# separately, because the last epoch is not necessarily the best one.
best_accuracy = -1.0

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(finetune_loader, desc=f"Training Epoch {epoch + 1}"):
        # Move every tensor of the batch (inputs and labels) to the device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_train_loss = train_loss / len(finetune_loader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_train_loss:.4f}")

    # ---- evaluation pass (no gradients) ----
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()

            # Predicted class is the index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == batch['labels']).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct / len(val_dataset)
    print(f"Epoch {epoch + 1} - Validation Loss: {avg_val_loss:.4f} - Accuracy: {accuracy:.4f}")

    # Keep a copy of the best-scoring weights so a later, overfitted epoch
    # does not become the only model that is available afterwards.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        model.save_pretrained("bert-classify-model-best")
        tokenizer.save_pretrained("bert-classify-model-best")

Training Epoch 1: 100%|██████████| 2176/2176 [17:06<00:00,  2.12it/s]


Epoch 1 - Average Training Loss: 0.1482
Epoch 1 - Validation Loss: 0.1171 - Accuracy: 0.9565


Training Epoch 2: 100%|██████████| 2176/2176 [17:04<00:00,  2.12it/s]


Epoch 2 - Average Training Loss: 0.1007
Epoch 2 - Validation Loss: 0.1169 - Accuracy: 0.9570


Training Epoch 3: 100%|██████████| 2176/2176 [17:08<00:00,  2.12it/s]


Epoch 3 - Average Training Loss: 0.0819
Epoch 3 - Validation Loss: 0.1347 - Accuracy: 0.9569


Training Epoch 4: 100%|██████████| 2176/2176 [17:08<00:00,  2.12it/s]


Epoch 4 - Average Training Loss: 0.0671
Epoch 4 - Validation Loss: 0.1365 - Accuracy: 0.9568


Training Epoch 5: 100%|██████████| 2176/2176 [17:12<00:00,  2.11it/s]


Epoch 5 - Average Training Loss: 0.0520
Epoch 5 - Validation Loss: 0.1861 - Accuracy: 0.9527


Training Epoch 6: 100%|██████████| 2176/2176 [17:12<00:00,  2.11it/s]


Epoch 6 - Average Training Loss: 0.0432
Epoch 6 - Validation Loss: 0.2072 - Accuracy: 0.9533


Training Epoch 7: 100%|██████████| 2176/2176 [17:12<00:00,  2.11it/s]


Epoch 7 - Average Training Loss: 0.0351
Epoch 7 - Validation Loss: 0.1870 - Accuracy: 0.9544


Training Epoch 8: 100%|██████████| 2176/2176 [17:14<00:00,  2.10it/s]


Epoch 8 - Average Training Loss: 0.0293
Epoch 8 - Validation Loss: 0.2151 - Accuracy: 0.9519


Training Epoch 9: 100%|██████████| 2176/2176 [17:16<00:00,  2.10it/s]


Epoch 9 - Average Training Loss: 0.0272
Epoch 9 - Validation Loss: 0.2011 - Accuracy: 0.9516


Training Epoch 10: 100%|██████████| 2176/2176 [17:18<00:00,  2.09it/s]


Epoch 10 - Average Training Loss: 0.0210
Epoch 10 - Validation Loss: 0.1936 - Accuracy: 0.9496


* save model

In [15]:
# Write the fine-tuned classifier weights and the tokenizer files to one directory.
classifier_output_dir = "bert-classify-model"
model.save_pretrained(classifier_output_dir)
tokenizer.save_pretrained(classifier_output_dir)

('bert-classify-model/tokenizer_config.json',
 'bert-classify-model/special_tokens_map.json',
 'bert-classify-model/vocab.txt',
 'bert-classify-model/added_tokens.json')