# **Global Needs**

In [None]:
import torch

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
leRate = 3e-5          # Learning rate for the AdamW optimizer (set higher to start the optimisation more aggressively)
baSize = 16            # Batch size for every DataLoader (kept fixed to preserve stability)
inp_temperature = 2        # Knowledge-distillation temperature: teacher and student logits are divided by it (kept fairly low so smoothing is not too aggressive)
inp_alpha = 0.5            # Weight of the hard-label cross-entropy term; (1 - alpha) weights the teacher soft-target term, so 0.5 balances both equally

In [None]:
!pip install -q gdown
import gdown
import os

# Pastikan folder target ada
os.makedirs("base_model_indobert_binary", exist_ok=True)

# File ID kamu (GANTI dengan ID asli)
tokenized_id = ""
config_id = ""
model_id = ""

# Unduh lite_tokenized_data.json ke direktori utama
gdown.download(f"https://drive.google.com/uc?id={tokenized_id}", "lite_tokenized_data.json", quiet=False)

# Unduh config.json ke dalam folder model
gdown.download(f"https://drive.google.com/uc?id={config_id}", "base_model_indobert_binary/config.json", quiet=False)

# Unduh pytorch_model.bin ke dalam folder model
gdown.download(f"https://drive.google.com/uc?id={model_id}", "base_model_indobert_binary/model.safetensors", quiet=False, use_cookies=True)


# **Knowledge Distillation**

In [None]:
from transformers import AutoModelForSequenceClassification
from torch.nn import KLDivLoss, CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F

In [None]:
# Teacher: the fine-tuned checkpoint stored in the local 'base_model_indobert_binary' folder.
# Student: the public IndoBERT-lite checkpoint with a fresh 2-class classification head.
# NOTE(review): both models are later fed the same input_ids, which is only valid if
# the two checkpoints share a tokenizer/vocabulary — confirm.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    'base_model_indobert_binary',
)
student_model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-lite-base-p2',
    num_labels=2,
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-lite-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The teacher is inference-only: put it in eval mode and turn off gradient
# tracking on every one of its parameters.
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad_(False)

In [None]:
# Distillation settings and loss modules (read as globals by distillation_loss)
temperature = inp_temperature  # Divides teacher and student logits before the softmax
alpha = inp_alpha  # Weight of the hard-label term; (1 - alpha) weights the soft-target term
kl_loss = KLDivLoss(reduction="batchmean")  # Soft-target criterion; fed student log-probabilities and teacher probabilities
ce_loss = CrossEntropyLoss()  # Hard-label criterion, applied to the unscaled student logits

def distillation_loss(student_logits, teacher_logits, labels):
    """Combine hard-label and teacher-guided losses for knowledge distillation.

    Reads the module-level ``temperature``, ``alpha``, ``kl_loss`` and ``ce_loss``.

    Args:
        student_logits: Raw class scores produced by the student.
        teacher_logits: Raw class scores produced by the teacher.
        labels: Ground-truth class indices for the batch.

    Returns:
        ``alpha * hard_loss + (1 - alpha) * soft_loss`` as a scalar tensor.
    """
    # Supervised term: cross-entropy between unscaled student logits and labels.
    hard_loss = ce_loss(student_logits, labels)

    # Distillation term: both distributions are softened by the temperature over
    # the last (class) dimension; the student side is given as log-probabilities.
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)

    # The KL term is multiplied by temperature squared, as in the original code.
    soft_loss = (temperature ** 2) * kl_loss(student_log_probs, teacher_probs)

    return alpha * hard_loss + (1 - alpha) * soft_loss

In [None]:
import json
!pip install datasets
from datasets import Dataset
# load tokenized data for lite model (train and validation only)
with open('lite_tokenized_data.json', 'r') as f:
    lite_tokenized_data = json.load(f)

train_dataset_lite = Dataset.from_dict(lite_tokenized_data['train'])
val_dataset_lite = Dataset.from_dict(lite_tokenized_data['val'])



In [None]:
# Optimizer: AdamW over the student's parameters only, using the configured learning rate.
optimizer = AdamW(params=student_model.parameters(), lr=leRate)

def collate_fn(batch):
    """Turn a list of tokenized examples into one dict of batched tensors.

    Args:
        batch: Sequence of mappings, each with 'input_ids', 'attention_mask'
            and 'labels' entries.

    Returns:
        Dict with stacked 'input_ids' and 'attention_mask' tensors and a
        ``torch.long`` 'labels' tensor, in that key order.
    """
    def _stack_field(key):
        # One tensor per example, stacked along a new leading batch dimension.
        return torch.stack([torch.tensor(example[key]) for example in batch])

    input_ids = _stack_field('input_ids')
    attention_mask = _stack_field('attention_mask')
    labels = torch.tensor([example['labels'] for example in batch], dtype=torch.long)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

# Training loader shuffles its examples; the validation loader keeps dataset order.
# Both use the configured batch size and the shared collate_fn.
train_loader = DataLoader(train_dataset_lite, batch_size=baSize, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset_lite, batch_size=baSize, shuffle=False, collate_fn=collate_fn)

# Echo both loader objects, one per line (train first, then validation).
print(train_loader, val_loader, sep='\n')


<torch.utils.data.dataloader.DataLoader object at 0x7c19981f98d0>
<torch.utils.data.dataloader.DataLoader object at 0x7c19aeec6bd0>


In [None]:
from sklearn.metrics import classification_report

# Distillation training with early stopping on the validation distillation loss.
student_model.to(device)
teacher_model.to(device)

best_val_loss = float('inf')
patience = 3          # Epochs without validation improvement tolerated before stopping
patience_counter = 0
student_model_withKD = None
# CPU copy of the student's weights taken at the best validation loss so far.
# A plain `student_model_withKD = student_model` would only alias the live model,
# so later (worse) epochs would silently overwrite the "best" weights.
best_student_state = None

for epoch in range(50):
    total_train_loss = 0
    student_model.train()

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The teacher only provides targets, so no graph is built for it.
        with torch.no_grad():
            teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits

        student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = distillation_loss(student_logits, teacher_logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    # Validation
    total_val_loss = 0
    student_model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits
            student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = distillation_loss(student_logits, teacher_logits, labels)
            total_val_loss += loss.item()

            preds = torch.argmax(student_logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Per-batch averages of the distillation loss.
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    # Output metrics
    print(f"\nEpoch {epoch + 1}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print("Classification Report:")
    # NOTE(review): assumes label 0 = 'Positive' and label 1 = 'Negative' — confirm against the dataset encoding.
    print(classification_report(all_labels, all_preds, target_names=['Positive', 'Negative']))

    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Snapshot the weights; clone() guarantees an independent copy even
        # when the model already lives on the CPU.
        best_student_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in student_model.state_dict().items()
        }
    else:
        patience_counter += 1
        print(f"Early stopping patience counter: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Restore the best snapshot so the model handed to later cells is the one with
# the lowest validation loss, not whatever the last epoch produced.
# If no epoch ever improved (e.g. NaN losses), student_model_withKD stays None.
if best_student_state is not None:
    student_model.load_state_dict(best_student_state)
    student_model_withKD = student_model



Epoch 1
Training Loss: 0.2726
Validation Loss: 0.2103
Classification Report:
              precision    recall  f1-score   support

    Positive       0.72      0.74      0.73       278
    Negative       0.92      0.91      0.92       926

    accuracy                           0.87      1204
   macro avg       0.82      0.83      0.82      1204
weighted avg       0.88      0.87      0.87      1204


Epoch 2
Training Loss: 0.1983
Validation Loss: 0.2119
Classification Report:
              precision    recall  f1-score   support

    Positive       0.75      0.69      0.72       278
    Negative       0.91      0.93      0.92       926

    accuracy                           0.87      1204
   macro avg       0.83      0.81      0.82      1204
weighted avg       0.87      0.87      0.87      1204

Early stopping patience counter: 1/3

Epoch 3
Training Loss: 0.1665
Validation Loss: 0.2041
Classification Report:
              precision    recall  f1-score   support

    Positive       0

In [None]:
# Persist the student model kept by the training cell under 'lite_model_kd'.
student_model_withKD.save_pretrained('lite_model_kd')

# ***Testing/Evaluation***

In [None]:
# Build the held-out test split from the 'test' entry of the already-loaded tokenized JSON.
test_dataset_lite = Dataset.from_dict(lite_tokenized_data['test'])

In [None]:
def evaluate_model(model, dataset, collate_fn):
    """Run ``model`` over ``dataset`` and return a text classification report.

    The model is moved to the module-level ``device`` and switched to eval mode
    as a side effect. Batches are built with the module-level ``baSize``.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        dataset: Dataset handed to ``DataLoader``.
        collate_fn: Callable that turns a list of examples into a dict with
            'input_ids', 'attention_mask' and 'labels' entries.

    Returns:
        The string produced by ``classification_report`` for all predictions.
    """
    model.to(device)
    model.eval()
    data_loader = DataLoader(dataset, batch_size=baSize, collate_fn=collate_fn)
    all_preds, all_labels = [], []

    # Inference only: no gradients are needed anywhere in the loop.
    with torch.no_grad():
        for batch in data_loader:
            # as_tensor reuses values that are already tensors (no re-copy and no
            # "copy construct from a tensor" UserWarning) while still accepting
            # plain lists from other collate functions.
            input_ids = torch.as_tensor(batch['input_ids'], device=device)
            attention_mask = torch.as_tensor(batch['attention_mask'], device=device)
            labels = torch.as_tensor(batch['labels'], device=device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # NOTE(review): assumes label 0 = 'Positive' and label 1 = 'Negative' — confirm against the dataset encoding.
    return classification_report(all_labels, all_preds, target_names=['Positive', 'Negative'])

In [None]:
# Report the distilled student's metrics on the test split (header first, then the report).
print("Evaluasi student model with KD (Testing Set):")
print(evaluate_model(student_model_withKD, test_dataset_lite, collate_fn))

Evaluasi student model with KD (Testing Set):


  input_ids = torch.tensor(batch['input_ids']).to(device)
  attention_mask = torch.tensor(batch['attention_mask']).to(device)
  labels = torch.tensor(batch['labels']).to(device)


              precision    recall  f1-score   support

    Positive       0.81      0.76      0.78       313
    Negative       0.92      0.94      0.93       891

    accuracy                           0.89      1204
   macro avg       0.86      0.85      0.86      1204
weighted avg       0.89      0.89      0.89      1204

