## 1. Import Library

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    f1_score, precision_score, recall_score, precision_recall_fscore_support
)
from sklearn.preprocessing import LabelEncoder

# PyTorch dan Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Plot styling and pandas display configuration
pd.set_option('display.max_colwidth', 200)
plt.rcParams.update({'figure.figsize': (10, 6)})
sns.set_style('whitegrid')

# Report the runtime environment so the output records what was used
torch_version = torch.__version__
cuda_available = torch.cuda.is_available()
print(f"PyTorch version: {torch_version}")
print(f"CUDA available: {cuda_available}")
print("Libraries loaded successfully!")

## 2. Load Dataset

In [None]:
# Read the labeled dataset from CSV
df = pd.read_csv("dataset_labeled.csv")

# Basic shape and schema
n_rows = len(df)
column_names = list(df.columns)
print(f"Dataset dimuat: {n_rows} baris")
print(f"Kolom: {column_names}")

# Class balance, first as absolute counts and then as percentages
sentiment_col = df['sentiment']
print("\nDistribusi sentimen:")
print(sentiment_col.value_counts())
print("\nPersentase:")
print(sentiment_col.value_counts(normalize=True) * 100)

# Preview the first rows (rich display as the cell's last expression)
df.head()

## 3. Persiapan Data

In [None]:
# Drop rows whose comment or label is missing. A missing comment would end up
# in X_train.tolist() as a float NaN and be handed to the tokenizer, and a
# missing label would mix floats with strings inside the label encoder.
required_cols = ['comment', 'sentiment']
df_model = df.dropna(subset=required_cols)
n_dropped = len(df) - len(df_model)
if n_dropped:
    print(f"Baris dengan nilai kosong dibuang: {n_dropped}")

# Separate features and labels; cast comments to str so only strings are tokenized
X = df_model['comment'].astype(str)
y = df_model['sentiment']

# Fit the label encoder on the full label column so every class gets an id
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data: 80% training, 20% testing, stratified on the sentiment label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Integer-encoded labels for each split (the string labels are kept for reporting)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
print(f"\nLabel mapping:")
print({i: label for i, label in enumerate(label_encoder.classes_)})

## 4. Load IndoBERT Model dan Tokenizer

In [None]:
# Load the IndoBERT tokenizer and sequence-classification model (PyTorch)
model_name = "indobenchmark/indobert-base-p1"
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label mappings, following the class order of the fitted label encoder
class_names = list(label_encoder.classes_)
num_classes = len(class_names)
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in enumerate(class_names)}

# Instantiate the classifier with one output per sentiment class
model_config = dict(num_labels=num_classes, id2label=id2label, label2id=label2id)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_config)

# Pick the device (CUDA when available, otherwise CPU) and move the model there
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
bert_model.to(device)

print(f"Model loaded: {model_name}")
print(f"Device: {device}")
print(f"Number of labels: {num_classes}")
print(f"Labels: {list(id2label.values())}")

## 5. Tokenisasi Data

In [None]:
# Tokenize the training and testing texts with one shared set of options
tokenize_kwargs = dict(
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)

train_encodings = bert_tokenizer(X_train.tolist(), **tokenize_kwargs)
test_encodings = bert_tokenizer(X_test.tolist(), **tokenize_kwargs)

# Sanity-check the produced fields and tensor shapes
train_shape = train_encodings['input_ids'].shape
test_shape = test_encodings['input_ids'].shape
print(f"Training encodings keys: {train_encodings.keys()}")
print(f"Training input shape: {train_shape}")
print(f"Testing input shape: {test_shape}")

## 6. Create PyTorch Dataset

In [None]:
# Create PyTorch Dataset
class SentimentDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping-like object (anything with ``.items()``) from field
            name to an indexable collection holding one entry per sample.
        labels: Indexable, sized sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a ``'labels'`` tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Force int64: class-index targets must be long for the cross-entropy
        # loss, but the dtype would otherwise follow the label container
        # (e.g. int32 arrays or float columns).
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each tokenized split together with its encoded labels
train_dataset, test_dataset = (
    SentimentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train_encoded),
        (test_encodings, y_test_encoded),
    )
)

n_train_items = len(train_dataset)
n_test_items = len(test_dataset)
print(f"Train dataset size: {n_train_items}")
print(f"Test dataset size: {n_test_items}")
print("PyTorch datasets created")

## 7. Setup Training Arguments

In [None]:
# Setup Training Arguments
import inspect

# `do_eval=True` alone does not schedule evaluation during training; an
# evaluation strategy must be set, otherwise the validation subset and
# compute_metrics are never used by trainer.train().
# The keyword was renamed from `evaluation_strategy` to `eval_strategy` in
# newer transformers releases, so use whichever this install accepts.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    save_strategy='epoch',
    load_best_model_at_end=False,
    # Evaluate on the validation subset at the end of every epoch
    **{_eval_strategy_key: 'epoch'},
)

print("Training arguments configured")

## 8. Fine-tuning IndoBERT

In [None]:
# Metric computation function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Index 0/1/2 of the result are precision, recall and F1; support is unused
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    overall_accuracy = accuracy_score(y_true, y_hat)

    return {
        'accuracy': overall_accuracy,
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

# Split the training data into train and validation subsets (80-20).
# This runs before the Trainer exists, so nothing has seeded torch yet; a
# dedicated seeded generator keeps the split identical across
# Restart & Run All (42 matches the random_state used for the train/test split).
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Assemble the Trainer from the model, arguments, data subsets and metric function
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_subset,
    train_dataset=train_subset,
    args=training_args,
    model=bert_model,
)

rule = "=" * 60
print(rule)
print("Memulai Fine-tuning IndoBERT...")
print(rule)

# Run fine-tuning
trainer.train()

print(f"\n{rule}")
print("Fine-tuning IndoBERT selesai!")
print(rule)

## 9. Evaluasi pada Test Set

In [None]:
# Evaluate the fine-tuned IndoBERT on the held-out test set
separator = "=" * 60
print(f"\n{separator}")
print("INDOBERT - EVALUASI PADA TEST SET")
print(separator)

# Predict with the trainer, then map predicted class ids back to label names
predictions = trainer.predict(test_dataset)
y_pred_bert_encoded = predictions.predictions.argmax(-1)
y_pred_bert = label_encoder.inverse_transform(y_pred_bert_encoded)

print(classification_report(y_test, y_pred_bert))
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_bert):.4f}")

# Weighted-average metrics, printed in the same fixed order
weighted_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)
for metric_label, metric_fn in weighted_metrics:
    metric_value = metric_fn(y_test, y_pred_bert, average='weighted')
    print(f"{metric_label} (Weighted): {metric_value:.4f}")
print(separator)

## 10. Confusion Matrix

In [None]:
# Confusion matrix for IndoBERT.
# Pin the row/column order to the encoder's classes: without `labels=` the
# matrix only covers labels present in y_test / y_pred_bert, so a missing class
# would change its size and misalign it with the tick labels below.
class_labels = label_encoder.classes_
cm_bert = confusion_matrix(y_test, y_pred_bert, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm_bert, annot=True, fmt='d', cmap='YlOrBr',
            xticklabels=class_labels, yticklabels=class_labels,
            cbar_kws={'label': 'Count'}, ax=ax)
ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12, fontweight='bold')
ax.set_title(f'Confusion Matrix - IndoBERT\nAccuracy: {accuracy_score(y_test, y_pred_bert):.4f}',
             fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 11. Simpan Model

In [None]:
import os

# Make sure the parent output directory exists
os.makedirs('models', exist_ok=True)

# Save the fine-tuned model and its tokenizer into the same directory
save_dir = 'models/indobert_finetuned'
for artifact in (bert_model, bert_tokenizer):
    artifact.save_pretrained(save_dir)

divider = "=" * 60
print(f"\n{divider}")
print("MODEL INDOBERT BERHASIL DISIMPAN!")
print(divider)
print("Lokasi: models/indobert_finetuned/")
print(divider)

## 12. Download Model dari Colab (Opsional)

In [None]:
# Compress the saved model into a ZIP and, when running on Colab, download it.
# `os` is imported here as well so this cell does not depend on an earlier
# cell's import.
import os
import shutil

# google.colab only exists inside Colab; guard the import so this optional
# cell does not abort a full run in any other environment.
try:
    from google.colab import files
except ImportError:
    files = None

# Compress the model folder
shutil.make_archive('indobert_finetuned', 'zip', 'models/indobert_finetuned')

print("Model berhasil dikompres ke: indobert_finetuned.zip")
print("Ukuran file:", round(os.path.getsize('indobert_finetuned.zip') / (1024*1024), 2), "MB")

if files is None:
    # Not on Colab: the archive stays on local disk, nothing to download
    print("\nBukan lingkungan Google Colab - unduhan otomatis dilewati.")
else:
    print("\nMengunduh file...")

    # Download file
    files.download('indobert_finetuned.zip')

    print("\n" + "=" * 60)
    print("MODEL BERHASIL DIUNDUH!")
    print("=" * 60)
    print("File: indobert_finetuned.zip")
    print("Extract file ZIP tersebut di folder 'models/' project Anda")
    print("=" * 60)