<a href="https://colab.research.google.com/github/HarisGunawanRomadon/model-transformer/blob/main/model_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn pandas accelerate

In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from google.colab import drive
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import random

# ==========================================
# 1. MOUNT GOOGLE DRIVE
# ==========================================

# Mount Google Drive unless it is already usable. The check targets the
# "MyDrive" folder instead of the bare mount point: the mount-point directory
# may exist while nothing is mounted on it (e.g. after an unmount), and in
# that case the dataset paths used later in this script would not resolve.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Device: {device}")

# ==========================================
# 2. CONFIG & SYNONYM DICTIONARY (FIXED)
# ==========================================

# Pretrained checkpoint; the tokenizer is loaded from the same name so that
# it matches the model that is fine-tuned further down.
model_name = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Label string -> class id, and the inverse mapping used to decode predictions.
label_map = {'positif': 2, 'netral': 1, 'negatif': 0}
id2label = {v: k for k, v in label_map.items()}

# Synonym dictionary used for data augmentation.
# Each key is a lowercase Indonesian word; its value lists the words that may
# be substituted for it. Some replacements are multi-word phrases
# (e.g. 'tanpa kendala', 'tidak bagus').
# NOTE(review): a few entries swap negation words ('tidak' <-> 'kurang',
# 'bukan'); confirm these substitutions never change a sample's sentiment label.
synonyms_dict = {
    # POSITIVE
    'bagus': ['baik', 'oke', 'mantap', 'memuaskan', 'keren', 'top', 'prima'],
    'baik': ['bagus', 'oke', 'menyenangkan', 'ramah', 'sopan'],
    'senang': ['bahagia', 'gembira', 'suka', 'nyaman', 'betah', 'riang', 'happy'],
    'puas': ['senang', 'lega', 'bangga', 'takjub'],
    'enak': ['lezat', 'sedap', 'mantap', 'nikmat', 'gurih', 'nyummy'],
    'suka': ['senang', 'tertarik', 'berminat', 'cinta', 'gemar', 'demen'],
    'cepat': ['kilat', 'ngebut', 'lancar', 'sigap'],
    'membantu': ['berguna', 'bermanfaat', 'menolong', 'memudahkan'],
    'keren': ['bagus', 'hebat', 'mantap', 'kece'],
    'nyaman': ['enak', 'tenang', 'betah', 'adem'],
    'lancar': ['mulus', 'aman', 'tanpa kendala'],
    'produktif': ['efektif', 'menghasilkan', 'berguna'],
    'memuaskan': ['bagus', 'menyenangkan', 'oke'],

    # NEGATIVE
    'jelek': ['buruk', 'parah', 'mengecewakan', 'kurang', 'ancur', 'payah'],
    'buruk': ['jelek', 'parah', 'negatif', 'tidak bagus', 'rusak'],
    'sedih': ['kecewa', 'kesal', 'bad mood', 'murung', 'duka'],
    'kecewa': ['sedih', 'tidak puas', 'kesal', 'menyesal', 'gondok'],
    'marah': ['kesal', 'emosi', 'ngamuk', 'benci'],
    'benci': ['tidak suka', 'muak', 'kesal', 'anti'],
    'lambat': ['lelet', 'lama', 'lemot', 'pelan'],
    'sulit': ['susah', 'ribet', 'sukar', 'berat', 'pusing'],
    'mahal': ['tinggi', 'boros', 'pricy'],
    'rusak': ['error', 'bermasalah', 'bug', 'gangguan', 'macet'],

    # GENERAL (intensifiers, negation, domain nouns)
    'sangat': ['amat', 'benar-benar', 'sungguh', 'banget', 'sekali'],
    'tidak': ['kurang', 'belum', 'gak', 'tak', 'bukan'],
    'kurang': ['tidak', 'belum', 'sedikit'],
    'pelayanan': ['service', 'layanan', 'staff'],
    'aplikasi': ['apk', 'app', 'software', 'sistem'],
    'makanan': ['masakan', 'kuliner', 'hidangan', 'menu']
}

def aggressive_augmentation(text, n_aug=3, synonyms=None):
    """
    Generate up to ``n_aug`` distinct synonym-substituted variants of ``text``.

    Every word of ``text`` that has an entry in the synonym table is replaced
    by a randomly chosen synonym. All occurrences of the same word receive the
    same replacement within one variant. Matching is case-insensitive and
    works on whole words only, so "suka" is replaced but "sukarela" is not.

    The substitution is done in a single pass over the original text. A
    freshly inserted synonym is therefore never replaced a second time, which
    would happen with sequential per-word replacement whenever the inserted
    synonym is itself a word of the sentence (e.g. "bagus" <-> "baik").

    Args:
        text: Sentence to augment.
        n_aug: Maximum number of variants to return. Up to ``n_aug * 2``
            random attempts are made, because attempts can collide.
        synonyms: Optional mapping ``word -> list of replacements``. Defaults
            to the module-level ``synonyms_dict``.

    Returns:
        A list of distinct variants, each different from ``text``. The list is
        empty when no word of ``text`` has a usable synonym entry or when
        ``n_aug`` is not positive. The order of the list is unspecified.
    """
    if synonyms is None:
        synonyms = synonyms_dict

    # Replaceable words in order of first appearance, de-duplicated.
    # Words whose synonym list is empty are skipped (nothing to choose from).
    tokens = re.findall(r'\w+', text.lower())
    replaceable = list(dict.fromkeys(w for w in tokens if synonyms.get(w)))
    if not replaceable:
        return []

    variations = set()

    # Try more often than n_aug: random choices may repeat an earlier variant.
    for _ in range(n_aug * 2):
        chosen = {word: random.choice(synonyms[word]) for word in replaceable}

        def _swap(match, chosen=chosen):
            # A maximal \w+ run is exactly what \bword\b would match, so this
            # keeps the whole-word semantics while touching each token once.
            token = match.group(0)
            return chosen.get(token.lower(), token)

        candidate = re.sub(r'\w+', _swap, text)

        # A variant identical to the input would only duplicate the original.
        if candidate != text:
            variations.add(candidate)

        if len(variations) >= n_aug:
            break

    return list(variations)

def clean_text(text):
    """
    Normalize a raw text value for tokenization.

    Steps: lowercase, remove URLs (tokens starting with "http" or "www"),
    replace every character other than ASCII letters, digits, whitespace and
    the punctuation ``. , ? !`` with a space, then collapse whitespace runs
    and trim both ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, as pandas produces for
            empty cells) yield an empty string instead of the literal words
            "none" / "nan".

    Returns:
        The cleaned, lowercased string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    cleaned = str(text).lower()
    cleaned = re.sub(r'http\S+|www\S+', '', cleaned)
    # Keep the punctuation that carries sentiment cues; drop everything else.
    cleaned = re.sub(r'[^a-zA-Z0-9\s.,?!]', ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

# ==========================================
# 3. PREPARE DATA
# ==========================================

path_train = '/content/drive/MyDrive/STMIK IKMI Cirebon/Semester 6/Deep Learning Lanjut/data/dataset_sentimen_300.csv'
path_test  = '/content/drive/MyDrive/STMIK IKMI Cirebon/Semester 6/Deep Learning Lanjut/data/dataset_tanpa_label_400_mix.csv'

print("\nüì• Membaca Dataset...")
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

# When a file has no 'text' column, fall back to positional columns:
# first column = text, second column (training file only) = label.
if 'text' not in df_train.columns:
    df_train.rename(columns={df_train.columns[0]: 'text', df_train.columns[1]: 'label'}, inplace=True)
if 'text' not in df_test.columns:
    df_test.rename(columns={df_test.columns[0]: 'text'}, inplace=True)

# Clean the text, normalize the label strings, and drop rows whose label is
# not one of the keys of label_map (those map to NaN).
df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)
df_train['label'] = df_train['label'].str.lower().str.strip()
df_train['label_id'] = df_train['label'].map(label_map)
df_train = df_train.dropna(subset=['label_id'])

# 1. SPLIT DATA (on the original, un-augmented samples so that no augmented
#    copy of a training sentence can leak into the validation set)
X_raw = df_train['text'].tolist()
# .map() yields floats whenever an unmapped label produced NaN; class ids
# must be integers for the classification head.
y_raw = df_train['label_id'].astype(int).tolist()

X_train, X_val, y_train, y_val = train_test_split(
    X_raw, y_raw, test_size=0.20, random_state=42, stratify=y_raw
)

print(f"üì¶ Split Awal: Train={len(X_train)} | Val={len(X_val)}")

# 2. AUGMENTATION + OVERSAMPLING (BOOSTING DATA)
print("üîÑ Memulai Augmentasi & Oversampling...")
X_train_final = []
y_train_final = []

# Seed the augmentation's random choices, matching the seed used for the
# split and for training, so that the synonym picks are repeatable.
random.seed(42)

# Target: every training sample contributes exactly 1 original + 3 extras.
n_extra = 3
for text, label in zip(X_train, y_train):
    augs = aggressive_augmentation(text, n_aug=n_extra)

    # SAFETY NET: when augmentation yields fewer than n_extra variants
    # (including none at all), top up with copies of the original so every
    # sample carries the same weight in the training set.
    padding = [text] * (n_extra - len(augs))

    samples = [text] + list(augs) + padding
    X_train_final.extend(samples)
    y_train_final.extend([label] * len(samples))

print(f"‚úÖ Data Training SEBELUM Augmentasi: {len(X_train)}")
print(f"‚úÖ Data Training SETELAH Augmentasi: {len(X_train_final)} (Naik {len(X_train_final)/len(X_train):.1f}x)")

# Dataset Wrapper
class SentimentDataset(Dataset):
    """
    Torch dataset that tokenizes one text per lookup.

    Each item is a dict with 'input_ids' and 'attention_mask' (1-D tensors,
    padded/truncated to 128 tokens) and, when labels were supplied, a
    'labels' tensor of dtype long. Tokenization uses the module-level
    ``tokenizer``.
    """

    def __init__(self, texts, labels=None):
        # labels stays None for unlabeled data (e.g. the prediction set).
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoded = tokenizer(
            str(self.texts[item]),
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis; flatten removes it.
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[item], dtype=torch.long)
        return sample

# Wrap each split in a dataset; the test set carries no labels.
train_ds = SentimentDataset(X_train_final, y_train_final)
val_ds = SentimentDataset(X_val, y_val)
test_ds = SentimentDataset(df_test['text'].values)

# ==========================================
# 4. TRAINING (TUNED HYPERPARAMETERS)
# ==========================================

print("\nüîß Inisialisasi Model...")

# Dropout settings passed to the model config when loading the checkpoint.
dropout_overrides = {
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
}
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    **dropout_overrides,
)
model.to(device)

# Mixed precision is only enabled when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

# Hyperparameters chosen for a small (augmented) dataset.
training_args = TrainingArguments(
    output_dir='./results',
    # Schedule
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_ratio=0.1,               # warm up over the first 10% of steps
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; keep only the best checkpoint,
    # selected by the 'f1_macro' value returned from compute_metrics.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Misc
    logging_steps=50,
    fp16=use_fp16,
    seed=42,
    report_to="none",
)

def compute_metrics(pred):
    """
    Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class id).

    Returns:
        Dict with keys 'accuracy' and 'f1_macro'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # zero_division=0: a class that is never predicted scores 0 without a warning.
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': report['macro avg']['f1-score'],
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

print("\nüöÄ MULAI TRAINING...")
trainer.train()

# ==========================================
# 5. FINAL PREDICTION & CHECK
# ==========================================

print("\nüîÆ Prediksi Test Set...")
preds = trainer.predict(test_ds)

# Turn the raw scores into probabilities; the largest probability per row is
# reported as the confidence of the predicted class.
logits = preds.predictions
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
max_probs = probs.max(dim=-1).values.numpy()
pred_labels = [id2label[class_id] for class_id in logits.argmax(-1)]

# Confidence buckets: (0, 0.5] Low, (0.5, 0.7] Medium, (0.7, 1.0] High.
confidence_level = pd.cut(max_probs, bins=[0, 0.5, 0.7, 1.0], labels=['Low', 'Medium', 'High'])

df_result = df_test.copy()
df_result['prediksi'] = pred_labels
df_result['confidence'] = max_probs
df_result['level'] = confidence_level

# Save the predictions next to the input data.
out_path = '/content/drive/MyDrive/STMIK IKMI Cirebon/Semester 6/Deep Learning Lanjut/data/hasil_prediksi_FINAL_FIXED.csv'
df_result.to_csv(out_path, index=False)

print(f"‚úÖ Selesai! Disimpan di: {out_path}")
print(f"üìä Rata-rata Confidence: {max_probs.mean():.4f}")

# Spot-check a few rows by position.
print("\nüîç Cek Validitas Logika Model:")
check_indices = [0, 1, 4]
for i in check_indices:
    # Skip positions beyond the end of a short test file.
    if i >= len(df_result):
        continue
    row = df_result.iloc[i]
    print(f"Text: '{row['text'][:40]}...' -> {row['prediksi']} ({row['confidence']:.2%})")

# Save the fine-tuned model together with its tokenizer.
model_path = '/content/drive/MyDrive/STMIK IKMI Cirebon/Semester 6/Deep Learning Lanjut/model/sentiment_final_v2'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"üíæ Model tersimpan di {model_path}")