In [None]:
%pip install torch pandas numpy transformers sklearn

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.functional import cross_entropy

In [None]:
# Read the example CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='example_question.csv')
df.head(n=5)

In [None]:
# Encode labels
# The encoder is only constructed here; it is fitted when the tag column is
# encoded further down, and reused later to map predictions back to tag names.
label_encoder = LabelEncoder()

In [None]:
# Keep an untouched copy of the original labels the first time this cell runs.
# Fitting always reads from that copy, so re-running the cell does not re-fit
# the encoder on already-encoded integers (which would make inverse_transform
# return integers instead of the original tag names).
if 'tag_raw' not in df.columns:
    df['tag_raw'] = df['tag']
df['tag'] = label_encoder.fit_transform(df['tag_raw'])
df.head()

In [None]:
# Tokenize the text
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name BERTTagger loads its encoder from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_text(text, max_length=32):
    """Tokenize one text into fixed-length model inputs.

    Args:
        text: The raw string to tokenize.
        max_length: Number of token positions to pad/truncate to. Defaults to
            32, the same length ``predict_tag`` uses at inference time, so
            training and inference see identically shaped inputs. Without an
            explicit length, ``padding='max_length'`` pads every example to
            the tokenizer's model maximum.

    Returns:
        An ``(input_ids, attention_mask)`` tuple of 1-D tensors for ``text``.
    """
    tokens = tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    # The tokenizer returns a batch of size one; drop the batch dimension.
    return tokens['input_ids'][0], tokens['attention_mask'][0]

In [None]:
# Tokenize every pattern once, then split the resulting
# (input_ids, attention_mask) pairs into their own DataFrame columns
tokenized = df['patterns'].apply(tokenize_text)
df['input_ids'] = tokenized.apply(lambda pair: pair[0])
df['attention_mask'] = tokenized.apply(lambda pair: pair[1])

In [None]:
# Convert token columns to lists for proper indexing in DataLoader.
# Values that are no longer tensors are passed through unchanged, so re-running
# this cell (without re-tokenizing first) does not fail on a missing .tolist().
df['input_ids'] = df['input_ids'].apply(lambda x: x.tolist() if torch.is_tensor(x) else x)
df['attention_mask'] = df['attention_mask'].apply(lambda x: x.tolist() if torch.is_tensor(x) else x)

In [None]:
class QuestionTagDataset(Dataset):
    """Map-style dataset that serves rows of a tokenized DataFrame as tensors.

    Each row is expected to provide 'input_ids', 'attention_mask' and 'tag'
    entries; they are returned as long tensors under the keys 'input_ids',
    'attention_mask' and 'labels'.
    """

    def __init__(self, dataframe):
        # The frame is stored as-is; rows are converted lazily on access.
        self.dataframe = dataframe

    def __len__(self):
        """Number of examples, i.e. rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the example at positional index ``idx`` as a dict of tensors."""
        row = self.dataframe.iloc[idx]

        def as_long(value):
            return torch.tensor(value, dtype=torch.long)

        return {
            'input_ids': as_long(row['input_ids']),
            'attention_mask': as_long(row['attention_mask']),
            'labels': as_long(row['tag']),
        }

In [None]:
# Wrap the prepared DataFrame in the Dataset and batch it two examples at a
# time, reshuffling the order on every pass
dataset = QuestionTagDataset(dataframe=df)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [None]:
class BERTTagger(nn.Module):
    """Pretrained BERT encoder followed by one linear layer that scores each tag."""

    def __init__(self, num_labels):
        """Load the encoder and size the classifier head.

        Args:
            num_labels: Number of output classes (width of the returned logits).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased', cache_dir='./bert')
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch of token ids."""
        # Element 1 of the encoder output is the pooled sequence representation,
        # which is what the classifier head consumes.
        pooled_output = self.bert(input_ids, attention_mask=attention_mask)[1]
        return self.classifier(pooled_output)
    
# One output logit per class the label encoder has seen
num_labels = len(label_encoder.classes_)
model = BERTTagger(num_labels)

# Step 4: Train the Model
# Use PyTorch's own AdamW: the AdamW exported by transformers is deprecated
# upstream in favour of torch.optim.AdamW. eps and weight_decay are pinned to
# the defaults of the transformers implementation so the update rule matches.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
def train(model, dataloader, epochs=3, optimizer=None):
    """Fine-tune ``model`` with cross-entropy loss, printing the mean loss per epoch.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` entries.
        epochs: Number of full passes over ``dataloader``.
        optimizer: Optimizer that updates ``model``'s parameters. When omitted,
            the notebook-level ``optimizer`` variable is used, which is what
            this function relied on implicitly before the parameter existed.

    Raises:
        ValueError: If no optimizer is available, or if ``dataloader`` yields
            no batches (previously a ZeroDivisionError while averaging the loss).
    """
    if optimizer is None:
        # Backward-compatible fallback to the module-level optimizer.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise ValueError('no optimizer given and no global `optimizer` is defined')

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        for batch in dataloader:
            optimizer.zero_grad()
            outputs = model(batch['input_ids'], batch['attention_mask'])
            loss = cross_entropy(outputs, batch['labels'])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # Count batches directly so the average never divides by zero.
            num_batches += 1
        if num_batches == 0:
            raise ValueError('dataloader yielded no batches; nothing to train on')
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_batches}')
        
# Fine-tune with the default number of epochs
train(model=model, dataloader=dataloader)

In [None]:
def evaluate(model, dataloader):
    """Print the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and left in that mode afterwards.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits
            with one row per example.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` entries.

    Raises:
        ValueError: If ``dataloader`` yields no examples (previously a
            ZeroDivisionError while computing the percentage).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            labels = batch['labels']
            outputs = model(batch['input_ids'], batch['attention_mask'])
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        raise ValueError('dataloader yielded no examples; accuracy is undefined')
    print(f'Accuracy: {correct / total * 100}%')
    
# NOTE(review): `dataloader` is the same loader the model was trained on, so
# this prints training-set accuracy rather than held-out accuracy.
evaluate(model=model, dataloader=dataloader)

In [None]:
def predict_tag(input_text, true_tag, model, tokenizer, label_encoder):
    """Predict the tag of ``input_text`` and score it against ``true_tag``.

    Args:
        input_text: Text to classify.
        true_tag: Expected tag, expressed in the label encoder's original labels.
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        label_encoder: Object providing ``transform`` and ``inverse_transform``.

    Returns:
        A ``(predicted_tag, accuracy)`` tuple. ``accuracy`` is the count of
        predictions equal to the encoded ``true_tag`` as a float, i.e. ``1.0``
        or ``0.0`` when a single text is classified.
    """
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        padding='max_length',
        max_length=32,
        truncation=True,
    )
    ids, mask = encoded['input_ids'], encoded['attention_mask']

    model.eval()
    with torch.no_grad():
        logits = model(ids, mask)
        # Index of the highest-scoring class for each row of logits
        predicted = torch.max(logits, 1).indices
        predicted_tag = label_encoder.inverse_transform(predicted.numpy())[0]

    # Score the prediction against the encoded ground-truth label
    target_id = label_encoder.transform([true_tag])[0]
    expected = torch.tensor([target_id], dtype=torch.long)
    matches = (predicted == expected).sum().item()

    return predicted_tag, float(matches)

In [None]:
# Example usage: classify one question and compare the result with its known tag
input_text = "Apa itu stunting?"
true_tag = "definisi_stunting"  # This should be the actual tag for the given input_text
predicted_tag, accuracy = predict_tag(
    input_text=input_text,
    true_tag=true_tag,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)
print(
    f"Input Text: {input_text}",
    f"Predicted Tag: {predicted_tag}",
    f"Accuracy: {accuracy * 100:.2f}%",
    sep="\n",
)