# **Global Needs**

In [None]:
import torch

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# **Class Weights**

In [None]:
import pandas as pd
# Load the preprocessed tweet/label table; the file is decoded explicitly as ISO-8859-1.
preprocessed_paslon = "preprocessed_paslon.csv"
data = pd.read_csv(
    preprocessed_paslon,
    delimiter=",",
    encoding='ISO-8859-1',
)

# Preview the first rows
print(data.head())

                                               tweet  label
0  batin komen lawak pasti suruh ma atas kalo ga ...    1.0
1  contoh buzzer goblok kasih bukti data fakta vi...    1.0
2  pak anies asal beda perintah entah benar salah...    1.0
3  pak munafik anies ngakunya resah soal orang or...    1.0
4  desak hrs tentang tum coba lihat rekam jejak g...    1.0


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [None]:
# 'balanced' class weights for labels 0 and 1, derived from the label column
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=data['label'].to_numpy(),
)


# float32 tensor on the active device, ready to be passed to a PyTorch loss
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)

# Show the resulting weights
print("Class Weights Tensor:", class_weights_tensor)

Class Weights Tensor: tensor([2.0005, 0.6666], device='cuda:0')


In [None]:
# Testing only
# Hyperparameters read by the later cells: leRate by the optimizer, baSize by every
# DataLoader. weLoss is not referenced by any other visible cell (the loss is built
# from class_weights_tensor directly).

leRate = 3e-5          # Learning rate (higher, to start the optimization more aggressively)
baSize = 16            # Batch size (kept fixed to maintain stability)
weLoss = class_weights_tensor    # Weighted loss (based on the class distribution)

# **Fine-Tuning IndoBERT (Teacher Model) with Class Weights**

In [None]:
!pip install datasets

from transformers import AutoModelForSequenceClassification
from datasets import Dataset



In [None]:
# Pretrained IndoBERT checkpoint with a 2-label sequence-classification head
base_model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p2',
    num_labels=2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import json
# Read the pre-tokenized splits for the base model from disk
with open('base_tokenized_data.json', 'r') as json_file:
    base_tokenized_data = json.load(json_file)

# Only the train and validation splits are turned into Dataset objects here
train_dataset_base, val_dataset_base = (
    Dataset.from_dict(base_tokenized_data[split_name])
    for split_name in ('train', 'val')
)

In [None]:
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss

def collate_fn(batch):
    """Assemble a list of per-example dicts into one dict of batched tensors.

    Each example must provide 'input_ids', 'attention_mask' and 'labels'.
    The first two are converted per example and stacked along a new leading
    dimension; the labels are gathered into a single int64 tensor.
    """
    def _stacked(field):
        # One tensor per example, then stacked along a new first dimension.
        rows = [torch.tensor(example[field]) for example in batch]
        return torch.stack(rows)

    return {
        'input_ids': _stacked('input_ids'),
        'attention_mask': _stacked('attention_mask'),
        'labels': torch.tensor([example['labels'] for example in batch], dtype=torch.long),
    }

# Training batches: reshuffled on every pass and assembled by collate_fn
train_loader = DataLoader(train_dataset_base, batch_size=baSize, shuffle=True, collate_fn=collate_fn)
print(train_loader)

# Validation batches: same batching, fixed order (shuffling is unnecessary here)
val_loader = DataLoader(val_dataset_base, batch_size=baSize, shuffle=False, collate_fn=collate_fn)
print(val_loader)


# Cross-entropy loss weighted per class by class_weights_tensor
criterion = CrossEntropyLoss(weight=class_weights_tensor)

# AdamW over every parameter of the base model
optimizer = torch.optim.AdamW(params=base_model.parameters(), lr=leRate)

<torch.utils.data.dataloader.DataLoader object at 0x7c5540d4ec10>
<torch.utils.data.dataloader.DataLoader object at 0x7c546f778290>


In [None]:
# Imported once here instead of on every epoch inside the loop.
from sklearn.metrics import classification_report

# Early-stopping parameters
patience = 3             # epochs without improvement tolerated before stopping
min_delta = 0.001        # minimum decrease of the mean validation loss that counts as improvement
best_val_loss = float('inf')
patience_counter = 0
best_teacher_model = None
# CPU snapshot of the weights from the best epoch. Keeping only a reference to
# base_model would silently track the *latest* weights, because the optimizer keeps
# updating that same object in the following epochs.
best_state_dict = None

base_model.to(device)

# Training loop (at most 50 epochs, usually cut short by early stopping)
for epoch in range(50):
    total_train_loss = 0
    base_model.train()

    # Training
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class-weighted loss
        loss = criterion(logits, labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation
    base_model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = base_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Accumulate the validation loss
            loss = criterion(logits, labels)
            total_val_loss += loss.item()

            # Keep predictions and labels for the per-class report
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Per-batch means: these are the values reported AND used for early stopping,
    # so min_delta is applied on the same scale as the printed validation loss.
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    # Validation metrics (label 0 is reported as 'Positive', label 1 as 'Negative')
    val_metrics = classification_report(all_labels, all_preds, target_names=['Positive', 'Negative'])

    # Loss and metrics output
    print(f"Epoch {epoch + 1}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print("Validation Metrics:")
    print(val_metrics)

    # Early-stopping check
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print(f"Validation loss improved to {best_val_loss:.4f}. Saving model...")
        # Persist the checkpoint to disk
        save_dir = f"best_model_epoch_{epoch + 1}"
        base_model.save_pretrained(save_dir)
        # Snapshot the weights (cloned to CPU) so later epochs cannot overwrite them
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in base_model.state_dict().items()
        }
    else:
        patience_counter += 1
        print(f"No improvement in validation loss for {patience_counter} epoch(s).")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Restore the best-epoch weights so best_teacher_model really is the best model,
# not whatever the last (possibly overfitted) epoch left behind.
if best_state_dict is not None:
    base_model.load_state_dict(best_state_dict)
    best_teacher_model = base_model

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Epoch 1
Training Loss: 0.4873
Validation Loss: 0.3911
Validation Metrics:
              precision    recall  f1-score   support

    Positive       0.59      0.84      0.70       278
    Negative       0.95      0.83      0.88       926

    accuracy                           0.83      1204
   macro avg       0.77      0.83      0.79      1204
weighted avg       0.86      0.83      0.84      1204

Validation loss improved to 29.7213. Saving model...
Epoch 2
Training Loss: 0.3255
Validation Loss: 0.4194
Validation Metrics:
              precision    recall  f1-score   support

    Positive       0.74      0.75      0.75       278
    Negative       0.93      0.92      0.92       926

    accuracy                           0.88      1204
   macro avg       0.83      0.84      0.83      1204
weighted avg       0.88      0.88      0.88      1204

No improvement in validation loss for 1 epoch(s).
Epoch 3
Training Loss: 0.2159
Validation Loss: 0.3978
Validation Metrics:
              precisi

# **Testing/Evaluation (run separately)**

In [None]:
# Build the test split from the 'test' entry of the tokenized JSON loaded earlier.
test_dataset_base = Dataset.from_dict(base_tokenized_data['test'])

In [None]:
from sklearn.metrics import classification_report
def evaluate_model(model, dataset, collate_fn):
    """Run `model` over `dataset` and return a text classification report.

    Args:
        model: sequence-classification model whose forward pass accepts
            `input_ids` and `attention_mask` and returns an object with `.logits`.
        dataset: dataset handed to a DataLoader (batch size taken from the
            notebook-level `baSize`).
        collate_fn: callable turning a list of examples into a dict with
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        The string produced by sklearn's `classification_report`, with label 0
        reported as 'Positive' and label 1 as 'Negative'.
    """
    model.to(device)
    model.eval()
    data_loader = DataLoader(dataset, batch_size=baSize, collate_fn=collate_fn)
    all_preds, all_labels = [], []

    # Gradients are never needed during evaluation.
    with torch.no_grad():
        for batch in data_loader:
            # as_tensor passes existing tensors through unchanged (no copy-construct
            # warning) while still accepting list output from a custom collate_fn.
            input_ids = torch.as_tensor(batch['input_ids']).to(device)
            attention_mask = torch.as_tensor(batch['attention_mask']).to(device)
            labels = torch.as_tensor(batch['labels']).to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return classification_report(all_labels, all_preds, target_names=['Positive', 'Negative'])

In [None]:
# Report test-split metrics for the model referenced by best_teacher_model.
# That name is assigned in the training cell, so the training cell must have been
# run in this kernel before this one.
print("Evaluasi Teacher  Model (Testing Set):")
print(evaluate_model(best_teacher_model, test_dataset_base, collate_fn))

Evaluasi Teacher  Model (Testing Set):


  input_ids = torch.tensor(batch['input_ids']).to(device)
  attention_mask = torch.tensor(batch['attention_mask']).to(device)
  labels = torch.tensor(batch['labels']).to(device)


              precision    recall  f1-score   support

    Positive       0.73      0.84      0.78       313
    Negative       0.94      0.89      0.91       891

    accuracy                           0.88      1204
   macro avg       0.83      0.87      0.85      1204
weighted avg       0.89      0.88      0.88      1204

