# In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy, EarlyStoppingCallback
import torch
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import KFold
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import numpy as np
import os

# Disable Weights & Biases logging for this run.
os.environ["WANDB_DISABLED"] = "true"

# Step 2: load the dataset and prepare it for training.
df = pd.read_csv("label_manual - Sheet1.csv")

# Only the comment column is renamed here ('komentar' -> 'text'); the label
# column is renamed to 'labels' further down, after it has been encoded.
df.rename(columns={'komentar': 'text'}, inplace=True)

print(f"Nilai kosong pada kolom text: {df['text'].isna().sum()}")
print(f"Nilai kosong pada kolom label: {df['label'].isna().sum()}")

# Rows without a text or without a label cannot be used.
df = df.dropna(subset=['text'])
df = df.dropna(subset=['label'])

print(f"DataFrame shape after dropping initial NA: {df.shape}")

# The keys must match the spelling/capitalisation used in the CSV.
label_map = {'Netral': 0, 'Positif': 1, 'Negatif': 2}

# Strip surrounding whitespace so values such as "Positif " still match.
raw_labels = df['label'].astype(str).str.strip()
df['label'] = raw_labels.map(label_map)

# Any value missing from label_map maps to NaN, which would make the integer
# cast below fail with an opaque error. Fail early and name the bad values.
unmapped_mask = df['label'].isna()
if unmapped_mask.any():
    unexpected_values = sorted(raw_labels[unmapped_mask].unique())
    raise ValueError(
        f"{int(unmapped_mask.sum())} row(s) have labels that are not in label_map "
        f"{list(label_map)}: {unexpected_values}"
    )

# Rename 'label' to 'labels' and make sure the column holds integers.
df.rename(columns={'label': 'labels'}, inplace=True)
df['labels'] = df['labels'].astype(int)

print(f"DataFrame shape after label mapping and renaming to 'labels': {df.shape}")

# Step 3: initialise the model name and tokenizer (the tokenizer is shared by all folds).
model_name = "cahya/roberta-base-indonesian-522M"
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Step 4: tokenization (run once over the whole initial dataset).
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch.

    Each entry is converted with ``str()``; ``None`` entries become an empty
    string. The module-level ``tokenizer`` is then called with truncation
    enabled and every sequence padded to a fixed length of 512.

    Args:
        examples: Mapping with a ``'text'`` key holding an iterable of values.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the cleaned texts.
    """
    cleaned = []
    for entry in examples['text']:
        cleaned.append("" if entry is None else str(entry))
    return tokenizer(
        cleaned,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Build a Hugging Face dataset from the DataFrame and tokenize it in batches.
full_dataset = Dataset.from_pandas(df)
full_dataset = full_dataset.map(tokenize_function, batched=True)

# Optional: show how one example was tokenized.
sample_idx = 0
sample_text = full_dataset[sample_idx]['text']
sample_tokens = full_dataset[sample_idx]

# Only the first 30 tokens are shown to keep the preview short.
token_ids = sample_tokens['input_ids']
token_words = tokenizer.convert_ids_to_tokens(token_ids[:30])

token_df = pd.DataFrame({
  'Position': range(len(token_words)),
  'Token': token_words,
  'ID': token_ids[:len(token_words)],
  'Attention Mask': sample_tokens['attention_mask'][:len(token_words)]
})

# Remove the 'Ġ' marker from the token strings so the table is easier to read.
token_df['Token'] = token_df['Token'].str.replace('Ġ', '', regex=False)

# The trailing "\n" was previously written as "\\n", which printed a literal
# backslash-n instead of a blank line.
print(f"\nText asli sebelum tokenisasi: {sample_text}\n")
print("Hasil tokenisasi:")
try:
    # `display` is only defined when running under IPython/Jupyter.
    display(token_df)
except NameError:
    # Plain Python run: fall back to a text rendering of the table.
    print(token_df.to_string(index=False))

# ==============================================================================
# CROSS-VALIDATION
# ==============================================================================

num_folds = 5  # number of folds; change as needed
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)  # fixed seed for reproducibility

# Share of every training fold that is held out for checkpoint selection and
# early stopping, so the test fold is never used to choose a model.
validation_fraction = 0.1

all_fold_metrics = []
all_predictions = []
all_true_labels = []
all_texts = []  # texts of every test fold, collected in the same order as the predictions

# Train and evaluate one model per fold.
for fold, (train_index, test_index) in enumerate(kf.split(full_dataset)):
    print(f"\n===== Training Fold {fold + 1}/{num_folds} =====")

    # Split the dataset for the current fold.
    train_dataset = full_dataset.select(train_index)
    test_dataset = full_dataset.select(test_index)

    # Carve a validation split out of the training fold. Previously the test
    # fold itself was passed as eval_dataset, so load_best_model_at_end and
    # EarlyStoppingCallback picked the checkpoint on the same data the fold
    # was scored on.
    inner_split = train_dataset.train_test_split(test_size=validation_fraction, seed=42)
    train_dataset = inner_split["train"]
    validation_dataset = inner_split["test"]

    # Step 6: load a fresh model for every fold so no fold starts from
    # weights that were trained on another fold.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

    # Step 7: training configuration and trainer.
    # Output and log directories are unique per fold.
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold + 1}",
        eval_strategy=IntervalStrategy.EPOCH,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,  # reduced number of epochs for testing
        logging_dir=f"./logs_fold_{fold + 1}",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_strategy=IntervalStrategy.EPOCH,
        report_to="none"  # explicitly disable reporting integrations
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Step 8: train the model.
    trainer.train()

    # Step 9: evaluate the current fold on its untouched test split.
    results = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Evaluation results for Fold {fold + 1}: {results}")
    all_fold_metrics.append(results)

    # Keep the predictions so aggregate metrics can be computed afterwards.
    predictions = trainer.predict(test_dataset)
    predicted_labels = predictions.predictions.argmax(-1)
    true_labels = test_dataset["labels"]

    # .tolist() stores plain Python ints instead of NumPy scalars.
    all_predictions.extend(predicted_labels.tolist())
    all_true_labels.extend(true_labels)
    all_texts.extend(test_dataset["text"])
# ==============================================================================
# AGGREGATION OF THE CROSS-VALIDATION RESULTS
# ==============================================================================

print("\n===== Aggregating Cross-Validation Results =====")

# Unweighted mean of the per-fold evaluation loss.
avg_eval_loss = np.mean([res['eval_loss'] for res in all_fold_metrics])
print(f"Average Eval Loss across {num_folds} folds: {avg_eval_loss:.4f}")

# Overall metrics computed on the pooled predictions of all folds.
y_true_overall = np.array(all_true_labels)
y_pred_overall = np.array(all_predictions)

accuracy_overall = accuracy_score(y_true_overall, y_pred_overall)
precision_macro_overall = precision_score(y_true_overall, y_pred_overall, average='macro')
precision_weighted_overall = precision_score(y_true_overall, y_pred_overall, average='weighted')
recall_macro_overall = recall_score(y_true_overall, y_pred_overall, average='macro')
recall_weighted_overall = recall_score(y_true_overall, y_pred_overall, average='weighted')
f1_macro_overall = f1_score(y_true_overall, y_pred_overall, average='macro')
f1_weighted_overall = f1_score(y_true_overall, y_pred_overall, average='weighted')

# Pass the class ids explicitly. Without `labels`, classification_report
# raises when a class is missing from both the true and predicted labels
# (its class count then differs from len(target_names)), and the confusion
# matrix silently changes shape.
class_ids = list(label_map.values())
class_names = list(label_map.keys())

report_overall = classification_report(
    y_true_overall,
    y_pred_overall,
    labels=class_ids,
    target_names=class_names,
    output_dict=True,
)
cm_overall = confusion_matrix(y_true_overall, y_pred_overall, labels=class_ids)

print(f"\nOverall Accuracy: {accuracy_overall:.4f}")
print(f"Overall Precision (macro): {precision_macro_overall:.4f}")
print(f"Overall Precision (weighted): {precision_weighted_overall:.4f}")
print(f"Overall Recall (macro): {recall_macro_overall:.4f}")
print(f"Overall Recall (weighted): {recall_weighted_overall:.4f}")
print(f"Overall F1 Score (macro): {f1_macro_overall:.4f}")
print(f"Overall F1 Score (weighted): {f1_weighted_overall:.4f}")

# Rows are true classes, columns are predicted classes, in label_map order.
print("\nOverall Confusion Matrix:")
print(cm_overall)

# Write the aggregated results to a new Excel workbook.
# Inverse lookup: class id -> class name.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

# One row per test example, pooled over all folds.
final_sentiment_results = pd.DataFrame({
    "text": all_texts,
    "true_label": all_true_labels,
    "predicted_label": all_predictions
})
# Add human-readable versions of both label columns.
for label_column in ("true_label", "predicted_label"):
    final_sentiment_results[f"{label_column}_text"] = (
        final_sentiment_results[label_column].map(reverse_label_map)
    )


with pd.ExcelWriter("sentiment_analysis_results_cross_validation.xlsx") as writer:
    # Sheet 1: every prediction.
    final_sentiment_results.to_excel(writer, sheet_name="All Predictions", index=False)

    # Sheet 2: overall metrics, one (name, value) pair per row.
    overall_metric_rows = [
        ('Overall Accuracy', accuracy_overall),
        ('Overall Precision (macro)', precision_macro_overall),
        ('Overall Precision (weighted)', precision_weighted_overall),
        ('Overall Recall (macro)', recall_macro_overall),
        ('Overall Recall (weighted)', recall_weighted_overall),
        ('Overall F1 Score (macro)', f1_macro_overall),
        ('Overall F1 Score (weighted)', f1_weighted_overall),
        ('Average Eval Loss', avg_eval_loss),
    ]
    overall_metrics_df = pd.DataFrame(overall_metric_rows, columns=['Metric', 'Value'])
    overall_metrics_df.to_excel(writer, sheet_name="Overall CV Metrics", index=False)

    # Sheet 3: per-class metrics; the summary entries of the report are skipped.
    summary_entries = ('accuracy', 'macro avg', 'weighted avg')
    class_metrics_overall = pd.DataFrame({
        cls: pd.Series(cls_metrics)
        for cls, cls_metrics in report_overall.items()
        if cls not in summary_entries
    })
    class_metrics_overall.to_excel(writer, sheet_name="Overall Class Metrics", index=True)

    # Sheet 4: the raw evaluation metrics of every fold, numbered from 1.
    fold_metrics_df = pd.DataFrame(all_fold_metrics)
    fold_numbers = range(1, num_folds + 1)
    fold_metrics_df.insert(0, 'Fold', fold_numbers)
    fold_metrics_df.to_excel(writer, sheet_name="Metrics Per Fold", index=False)


print("Cross-validation results saved to sentiment_analysis_results_cross_validation.xlsx")