In [1]:
%pip install torch sklearn transformers pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

In [3]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased', cache_dir='./bert')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Tokenize the dataset
def tokenize_text(text):
    return tokenizer(text, padding='max_length', max_length=128, truncation=True, return_tensors="pt")

In [6]:
df = pd.read_csv('final.csv')

In [7]:
df['input_ids'] = df['pattern'].apply(lambda x: tokenize_text(x)['input_ids'][0])
df['attention_mask'] = df['pattern'].apply(lambda x: tokenize_text(x)['attention_mask'][0])

# Convert labels to numerical values
labels = df['tag'].unique()
label_dict = {label: idx for idx, label in enumerate(labels)}
df['label'] = df['tag'].map(label_dict)

In [8]:
df.head()

Unnamed: 0,tag,pattern,input_ids,attention_mask,label
0,tindakan_stunting,Gimana cara mengatasi kalau anak terdeteksi st...,"[tensor(101), tensor(21025), tensor(24805), te...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0
1,tindakan_stunting,Apa langkah pertama yang harus diambil kalau a...,"[tensor(101), tensor(9706), tensor(2050), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0
2,tindakan_stunting,Apa yang harus dilakukan orang tua kalau anakn...,"[tensor(101), tensor(9706), tensor(2050), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0
3,tindakan_stunting,Bagaimana cara membantu anak yang sudah terlan...,"[tensor(101), tensor(4524), tensor(4886), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0
4,tindakan_stunting,Apa aja tindakan yang bisa dilakukan kalau ana...,"[tensor(101), tensor(9706), tensor(2050), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...",0


In [9]:
# Freeze BERT model parameters to avoid training them
for param in bert.parameters():
    param.requires_grad = False

In [10]:
from torch.utils.data import DataLoader, TensorDataset

In [19]:
# Define the seq2seq model with BERT fine-tuning
class Seq2SeqClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim, output_dim):
        super(Seq2SeqClassifier, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(bert_model.config.hidden_size, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        hidden = self.relu(self.fc(pooled_output))
        hidden = self.dropout(hidden)
        output = self.out(hidden)
        return output

In [28]:
# Hyperparameters
hidden_dim = 128
output_dim = 36
learning_rate = 2e-5
batch_size = 16
epochs = 10

In [29]:
# Create the model, loss function, and optimizer
model = Seq2SeqClassifier(bert, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [30]:
# Create data loaders
input_ids = torch.stack(df['input_ids'].tolist())
attention_masks = torch.stack(df['attention_mask'].tolist())
labels = torch.tensor(df['label'].tolist())

In [31]:
dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [27]:
# Verify the number of unique labels
labels = df['tag'].unique()
label_dict = {label: idx for idx, label in enumerate(labels)}
df['label'] = df['tag'].map(label_dict)

# Number of unique labels
num_labels = len(labels)
print(f"Number of unique labels: {num_labels} {output_dim}")

Number of unique labels: 36 4


In [33]:
# Training loop with fine-tuning
for epoch in range(128):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / len(df)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10, Loss: 3.5697, Accuracy: 0.0575
Epoch 2/10, Loss: 3.5676, Accuracy: 0.0481
Epoch 3/10, Loss: 3.5666, Accuracy: 0.0497
Epoch 4/10, Loss: 3.5620, Accuracy: 0.0512
Epoch 5/10, Loss: 3.5707, Accuracy: 0.0606
Epoch 6/10, Loss: 3.5568, Accuracy: 0.0419
Epoch 7/10, Loss: 3.5632, Accuracy: 0.0575
Epoch 8/10, Loss: 3.5696, Accuracy: 0.0497
Epoch 9/10, Loss: 3.5647, Accuracy: 0.0481
Epoch 10/10, Loss: 3.5589, Accuracy: 0.0543
Epoch 11/10, Loss: 3.5607, Accuracy: 0.0512
Epoch 12/10, Loss: 3.5570, Accuracy: 0.0543
Epoch 13/10, Loss: 3.5597, Accuracy: 0.0637
Epoch 14/10, Loss: 3.5528, Accuracy: 0.0466
Epoch 15/10, Loss: 3.5588, Accuracy: 0.0435
Epoch 16/10, Loss: 3.5531, Accuracy: 0.0683
Epoch 17/10, Loss: 3.5564, Accuracy: 0.0637
Epoch 18/10, Loss: 3.5522, Accuracy: 0.0512
Epoch 19/10, Loss: 3.5529, Accuracy: 0.0481
Epoch 20/10, Loss: 3.5608, Accuracy: 0.0512
Epoch 21/10, Loss: 3.5575, Accuracy: 0.0637
Epoch 22/10, Loss: 3.5487, Accuracy: 0.0637
Epoch 23/10, Loss: 3.5512, Accuracy: 0.06

In [18]:
def evaluate_model(model, data_loader):
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels).item()

    accuracy = correct_predictions / len(data_loader.dataset)
    return accuracy

# Create a DataLoader for the evaluation
eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Calculate the accuracy
final_accuracy = evaluate_model(model, eval_loader)
print(f"Final Accuracy: {final_accuracy:.4f}")

Final Accuracy: 0.0543
