<a href="https://colab.research.google.com/github/Hkd225/sentiment-anlyze/blob/main/Sentiment_analyze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===================================================================
# 1. SETUP & IMPORTS
# ===================================================================
# Instalasi Library
# !pip install scikit-learn transformers datasets torch pandas numpy

import pandas as pd
import numpy as np
import json
from datetime import datetime

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, f1_score

# PyTorch
import torch
from torch.nn import CrossEntropyLoss

# HuggingFace Transformers & Datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from datasets import Dataset as HFDataset

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Menggunakan device: {device}")

# ===================================================================
# 2. DATA LOADING & PREPROCESSING (MODIFIED)
# ===================================================================

# Read the raw CSV from the working directory and report its dimensions.
raw_csv_path = 'indonesian-adjective-sentiment-raw.csv'
df = pd.read_csv(raw_csv_path)
print("\n--- Data Awal ---")
print(f"Dataset shape: {df.shape}")

### Fungsi untuk Labeling Sentiment Heuristik (DIREVISI) ###
# Keyword tables for the heuristic labeler. Every keyword is listed exactly
# once so that no keyword is weighted double just because it was repeated in
# the table.
_POSITIVE_KEYWORDS = (
    'baik', 'bagus', 'indah', 'senang', 'gembira', 'puas', 'suka', 'hebat',
    'unggul', 'positif', 'menguntungkan', 'bermanfaat', 'menyenangkan',
    'ceria', 'bahagia', 'nikmat', 'nyaman', 'mantap', 'luar biasa', 'cantik',
    'elok', 'molek', 'menarik', 'memuaskan', 'sempurna', 'optimal',
    'terbaik', 'sukses', 'berhasil', 'mengagumkan',
)

_NEGATIVE_KEYWORDS = (
    'buruk', 'jelek', 'tidak baik', 'susah', 'sulit', 'negatif', 'merugikan',
    'menyedihkan', 'mengecewakan', 'berbahaya', 'menyakitkan', 'parah',
    'rusak', 'gagal', 'celaka', 'sengsara', 'menderita', 'menyesal', 'malas',
    'bodoh', 'kotor', 'jahat', 'menjijikkan', 'menakutkan', 'mengerikan',
    'menjengkelkan',
)

# Negative phrases that embed a positive keyword (currently 'tidak baik',
# which contains 'baik'). They are blanked out before positive keywords are
# counted so the same words are not scored for both polarities.
_NEGATED_POSITIVE_PHRASES = tuple(
    phrase for phrase in _NEGATIVE_KEYWORDS
    if any(keyword in phrase for keyword in _POSITIVE_KEYWORDS)
)


def determine_sentiment_from_explanation(explanation):
    """Assign a sentiment label to an explanation by counting keywords.

    Matching is case-insensitive substring matching, so inflected forms that
    contain a keyword also match. Only the negative phrases listed in the
    keyword table are treated as negation; other negated forms are not
    detected.

    Args:
        explanation: The explanation text. Non-string values are converted
            with ``str``.

    Returns:
        ``'positive'`` or ``'negative'`` when one polarity has more keyword
        hits, ``'neutral'`` when no keyword matches at all, and ``np.nan``
        when the hit counts tie above zero or the explanation is missing
        (``None`` or a float NaN).
    """
    # A missing value would otherwise be stringified to 'none'/'nan' and be
    # labelled neutral; return NaN instead so the caller can discard the row.
    if explanation is None or (isinstance(explanation, float) and np.isnan(explanation)):
        return np.nan

    explanation_lower = str(explanation).lower()

    negative_count = sum(1 for keyword in _NEGATIVE_KEYWORDS if keyword in explanation_lower)

    # Count positive keywords on a copy with the negated phrases removed.
    positive_text = explanation_lower
    for phrase in _NEGATED_POSITIVE_PHRASES:
        positive_text = positive_text.replace(phrase, ' ')
    positive_count = sum(1 for keyword in _POSITIVE_KEYWORDS if keyword in positive_text)

    if positive_count > negative_count:
        return 'positive'
    if negative_count > positive_count:
        return 'negative'
    if positive_count == 0:
        # Equal counts of zero: no sentiment keyword at all.
        return 'neutral'
    # Equal, non-zero counts: ambiguous, signalled as NaN.
    return np.nan

def create_training_text(row):
    """Build the model input sentence from a row's word and explanation.

    Args:
        row: Mapping-like object indexable by ``'word'`` and ``'explanation'``.

    Returns:
        A sentence combining both values, each converted with ``str`` and
        stripped of surrounding whitespace.
    """
    word, explanation = (
        str(row[column]).strip() for column in ('word', 'explanation')
    )
    return f"Kata sifat '{word}' berarti {explanation}"

# Apply the heuristic labeling and build the text fed to the model.
df['sentiment'] = df['explanation'].apply(determine_sentiment_from_explanation)
df['training_text'] = df.apply(create_training_text, axis=1)

# Map sentiment names to numeric class ids.
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['label'] = df['sentiment'].map(sentiment_mapping)

# Drop rows without a label (this includes the rows the labeler marked as NaN).
# The explicit copy makes the filtered frame independent of the original, so
# the column assignment below does not raise pandas' SettingWithCopyWarning.
df = df.dropna(subset=['label']).copy()
df['label'] = df['label'].astype(int)

print("\n--- Distribusi Label Akhir ---")
print(df['sentiment'].value_counts())
print(f"Total sampel final: {len(df)}")

# ===================================================================
# 3. CLASS WEIGHTS & DATA SPLIT
# ===================================================================

# Compute class weights.
labels_unique = df['label'].unique()
labels_unique.sort()

# The weight vector always has one entry per class id in sentiment_mapping,
# because the loss needs one weight per model output class. A class with no
# samples keeps the neutral weight 1.0 instead of shrinking the vector.
num_classes = len(sentiment_mapping)
class_weights = np.ones(num_classes)

try:
    # "balanced" weights are inversely proportional to class frequency.
    class_weights[labels_unique] = compute_class_weight(
        'balanced',
        classes=labels_unique,
        y=df['label']
    )
except ValueError:
    print("Warning: Tidak bisa menghitung bobot kelas, menggunakan bobot default (uniform).")
    class_weights = np.ones(num_classes)

# Convert to a PyTorch tensor on the selected device.
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("\nBobot kelas yang dihitung (Baru):", weights.tolist())  # values shift because fewer rows end up neutral

# Stratified 70/15/15 split: 30% is held out, then halved into validation and test.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

print("\n--- Split Dataset ---")
print(f"Training: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

# ===================================================================
# 4. CUSTOM TRAINER DAN METRIK (Tidak Berubah)
# ===================================================================

class WeightedLossTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss.

    Args:
        class_weights: 1-D tensor with one weight per class, passed to
            ``CrossEntropyLoss(weight=...)``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The weights were created outside the trainer, so they can live on a
        # different device than the model output. Moving them to the logits'
        # device is a no-op when they already match.
        loss_fct = CrossEntropyLoss(weight=self.class_weights.to(logits.device))

        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.view(-1))

        return (loss, outputs) if return_outputs else loss

def compute_metrics(p):
    """Return accuracy and weighted F1 for an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (class scores, argmax taken over
            axis 1) and ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_weighted"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_weighted": f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }

# ===================================================================
# 5. MODEL SETUP & TOKENIZATION (Tidak Berubah)
# ===================================================================

# Checkpoint identifier; used for the tokenizer here and for the model further below.
model_name = "indolem/indobert-base-uncased"
print(f"\nLoading tokenizer & model: {model_name}")
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples for the model.

    Sequences are truncated to 128 tokens but not padded here. Padding is
    left to the data collator, which pads each batch to its own longest
    sequence, so short sentences are not inflated to 128 tokens.

    Args:
        examples: Batch mapping with a ``"training_text"`` entry.

    Returns:
        The tokenizer output for the batch (plain Python lists).
    """
    return tokenizer(
        examples["training_text"],
        truncation=True,
        max_length=128,
        return_tensors=None
    )

# Wrap each split in a HuggingFace dataset holding only text and label.
print("Mempersiapkan dataset...")
model_input_columns = ['training_text', 'label']
train_dataset = HFDataset.from_pandas(train_df[model_input_columns])
val_dataset = HFDataset.from_pandas(val_df[model_input_columns])
test_dataset = HFDataset.from_pandas(test_df[model_input_columns])

# Tokenize every split in batched mode.
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Keep only the columns listed here; every other column is removed per split.
columns_to_keep = ['input_ids', 'attention_mask', 'label']

surplus_train = [name for name in tokenized_train.column_names if name not in columns_to_keep]
tokenized_train = tokenized_train.remove_columns(surplus_train)

surplus_val = [name for name in tokenized_val.column_names if name not in columns_to_keep]
tokenized_val = tokenized_val.remove_columns(surplus_val)

surplus_test = [name for name in tokenized_test.column_names if name not in columns_to_keep]
tokenized_test = tokenized_test.remove_columns(surplus_test)

# Three-class classification head on top of the pretrained checkpoint.
print("Initializing standard model (IndoBERT)...")
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# Collator that pads batches with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ===================================================================
# 6. TRAINING (MODIFIED HYPERPARAMETERS)
# ===================================================================

# TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="./sentiment-model-output",
    overwrite_output_dir=True,
    num_train_epochs=12,  # increased
    per_device_train_batch_size=4,  # decreased
    per_device_eval_batch_size=8,
    warmup_steps=100,
    logging_steps=50,
    eval_steps=50,
    # NOTE(review): checkpoints are written every 200 steps while evaluation
    # runs every 50. Presumably only evaluations that coincide with a saved
    # checkpoint can be restored by load_best_model_at_end — confirm this is
    # intended.
    save_steps=200,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_weighted",
    greater_is_better=True,
    learning_rate=1e-5,  # decreased
    weight_decay=0.05,  # increased
    logging_dir="./logs",
    # The string "none" disables all logging integrations. Python None does
    # not: transformers 4.x treats it as "all installed integrations".
    report_to="none",
    dataloader_pin_memory=torch.cuda.is_available(),
    remove_unused_columns=False,
    # Cap the number of checkpoints kept on disk.
    save_total_limit=2
)

# Initialize the trainer with the weighted loss.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class` over `tokenizer` here — verify against the installed version.
trainer = WeightedLossTrainer(
    class_weights=weights,
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],  # increased
)

# TRAIN THE MODEL
print("\n" + "="*40)
print("=== MEMULAI TRAINING DENGAN WEIGHTED LOSS (PARAM BARU) ===")
print("="*40)
train_results = trainer.train()

# ===================================================================
# 7. EVALUASI DAN PENYIMPANAN (Tidak Berubah)
# ===================================================================

# EVALUATION
print("\n" + "="*40)
print("=== EVALUASI MODEL FINAL PADA TEST SET ===")
print("="*40)

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=-1)

# Accuracy on the held-out test split.
accuracy = accuracy_score(test_df['label'], preds)
print(f"Accuracy on test set: {accuracy:.4f}")

# Per-class report. `labels` is given explicitly so the three target names
# always line up with class ids 0/1/2, even when a class is absent from both
# the test labels and the predictions (which would otherwise raise).
print("\nClassification Report:")
print(classification_report(test_df['label'], preds,
                            labels=[0, 1, 2],
                            target_names=['negative', 'neutral', 'positive'],
                            zero_division=0))

# SAVE THE BEST MODEL
print("\n=== MENYIMPAN MODEL TERBAIK ===")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_save_path = f"./trained-sentiment-model-{timestamp}"

# Model and tokenizer go to the same timestamped directory.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"✓ Model disimpan di: {model_save_path}")

# ===================================================================
# 8. PREDICTOR CLASS & UJI COBA (Tidak Berubah)
# ===================================================================

class IndonesianSentimentAnalyzer:
    """Load a saved sequence-classification model and predict word sentiment.

    Args:
        model_path: Directory (or identifier) holding the saved model and
            tokenizer.
    """

    def __init__(self, model_path):
        # Remember the device once so prediction does not depend on a global.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.model.eval()

    def predict_word(self, word, explanation=""):
        """Predict the sentiment of a word, optionally with its explanation.

        The input sentence is built like the training text: the word and the
        explanation are converted with ``str`` and stripped of surrounding
        whitespace. A falsy or whitespace-only explanation is treated as
        absent.

        Args:
            word: The word to classify.
            explanation: Optional explanation of the word.

        Returns:
            Dict with ``'word'`` (the value passed in), ``'sentiment'``
            (``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"``),
            ``'confidence'`` (probability of the predicted class) and
            ``'probabilities'`` (one float per class).
        """
        word_text = str(word).strip()
        explanation_text = str(explanation).strip() if explanation else ""

        if explanation_text:
            text = f"Kata sifat '{word_text}' berarti {explanation_text}"
        else:
            text = f"Kata sifat '{word_text}'"

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Single input, so take the first (only) row of probabilities.
        probs = predictions.cpu().numpy()[0]
        predicted_class = int(np.argmax(probs))

        sentiment_map = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}

        return {
            'word': word,
            'sentiment': sentiment_map[predicted_class],
            'confidence': float(probs[predicted_class]),
            'probabilities': {
                'negative': float(probs[0]),
                'neutral': float(probs[1]),
                'positive': float(probs[2])
            }
        }

# Try out the saved model on a few hand-picked examples.
print("\n=== TEST MODEL YANG SUDAH DILATIH ===")
analyzer = IndonesianSentimentAnalyzer(model_save_path)

# (word, explanation) pairs passed to the analyzer one at a time.
test_words = [
    ("bagus", "sangat baik dan memuaskan"),
    ("jelek", "tidak baik dan buruk"),
    ("cepat", "bergerak dengan laju tinggi"),
    ("indah", "menyenangkan dipandang mata"),
    ("malas", "tidak mau bekerja atau berusaha"),
    ("sukses", "berhasil mencapai tujuan"),
]

print("\nHasil prediksi dengan model terlatih:")
for sample in test_words:
    result = analyzer.predict_word(*sample)
    print(f"✦ {result['word']:<8}: {result['sentiment']:<9} (confidence: {result['confidence']:.3f})")

# ===================================================================
# 9. EKSPOR UNTUK WEBSITE (Tidak Berubah)
# ===================================================================

print("\n=== MENYIMPAN MODEL UNTUK WEBSITE ===")

# Lookup tables keyed by word, plus metadata describing this training run.
website_model = {
    'word_to_sentiment': {},
    'word_to_confidence': {},
    'metadata': {
        'model_type': 'fine-tuned-bert-weighted',
        'timestamp': timestamp,
        'total_words': len(df),
        'test_accuracy': float(accuracy),
        'training_samples': len(train_df),
        'model_name': model_name
    }
}

print("Memprediksi sentimen untuk semua kata...")
total_rows = len(df)
# Progress is tracked with a positional counter. The DataFrame index label
# cannot be used: rows were dropped earlier, so labels are not contiguous and
# can exceed the row count.
for position, (_, row) in enumerate(df.iterrows()):
    if position % 1000 == 0 and position != 0:
        print(f"Processing {position}/{total_rows}...")

    result = analyzer.predict_word(row['word'], row['explanation'])
    website_model['word_to_sentiment'][row['word']] = result['sentiment']
    website_model['word_to_confidence'][row['word']] = result['confidence']

# Save as JSON (UTF-8, non-ASCII characters written as-is).
website_model_filename = f'sentiment_model_{timestamp}.json'
with open(website_model_filename, 'w', encoding='utf-8') as f:
    json.dump(website_model, f, indent=2, ensure_ascii=False)

print(f"✓ Model untuk website disimpan sebagai: {website_model_filename}")

# Download the file (when running in Google Colab)
# from google.colab import files
# try:
#     files.download(website_model_filename)
# except:
#     print("Download file diabaikan. Jalankan di Colab untuk mengaktifkan download.")

print("\n" + "="*60)
print("✅ TRAINING & EVALUASI SELESAI!")
print("="*60)