In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm

# 1. Load Data

In [None]:
# Load combined_data.csv into the working dataframe `df`.
df = pd.read_csv(filepath_or_buffer='combined_data/combined_data.csv')

# 2. Load model dan tokenizer Indonesian RoBERTa untuk klasifikasi sentimen

In [None]:
# Checkpoint identifier passed to from_pretrained; the model and the tokenizer
# are both loaded from the same identifier so they match each other.
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pindahkan model ke GPU jika tersedia

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model weights to the selected device.
model.to(device)

# 3. Fungsi untuk prediksi sentimen

In [None]:
def predict_sentiment(text, max_length=128):
    """Predict the sentiment label of a single text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Text to classify. ``None`` and float NaN are treated as an empty
            string; any other non-string value is converted with ``str()``.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        tuple: ``(sentiment, confidence)`` where ``sentiment`` is the
        lower-cased label name taken from ``model.config.id2label`` and
        ``confidence`` is the softmax probability of that label.
    """
    if not isinstance(text, str):
        # Missing CSV cells arrive as None or float NaN (NaN != NaN), which is
        # not valid tokenizer input, so normalise them to an empty string.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    # Encode the text and move every tensor to the same device as the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest-scoring class and its softmax probability.
    probs = torch.nn.functional.softmax(logits, dim=1)
    predicted_class = torch.argmax(logits, dim=1).item()
    confidence = probs[0][predicted_class].item()

    # Take the label name from the checkpoint's own config instead of a
    # hard-coded table: the index order is defined by the checkpoint, and a
    # hand-written mapping can silently swap positive and negative.
    sentiment = str(model.config.id2label[predicted_class]).lower()

    return sentiment, confidence


# 4. Lakukan prediksi untuk seluruh dataset dalam batch untuk efisiensi

In [None]:
# Number of texts taken from the dataframe per loop iteration.
batch_size = 32
# Accumulators for the predicted labels and their confidence scores.
sentiments, confidences = [], []

In [None]:
# Reset the accumulators in this cell so that re-running it never appends a
# second copy of the predictions (which would break the column assignment).
sentiments = []
confidences = []

# Missing texts (NaN) are not valid tokenizer input; treat them as empty strings.
texts = df['cleaned_text'].fillna('').astype(str).tolist()

# Label names come from the checkpoint's own config rather than a hard-coded table.
id2label = model.config.id2label

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]

    # Tokenize the whole batch at once; padding aligns the sequences so the
    # model can process them in a single forward pass.
    inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Per-row highest softmax probability and the index of that class.
    probs = torch.nn.functional.softmax(logits, dim=1)
    batch_confidences, batch_classes = probs.max(dim=1)

    sentiments.extend(str(id2label[idx]).lower() for idx in batch_classes.tolist())
    confidences.extend(batch_confidences.tolist())


# 5. Tambahkan kolom sentimen dan confidence ke dataframe

In [None]:
# Attach the predicted labels and their confidence scores as new columns.
df['sentiment'], df['confidence'] = sentiments, confidences

# 6. Simpan dataframe dengan label sentimen

In [None]:
# Write the labeled dataframe to CSV without the index column.
df.to_csv(path_or_buf='combined_data/data_gabungan_twitter_tiktok_labeled.csv', index=False)

# 7. Tampilkan distribusi sentimen

In [None]:
# Number of rows per predicted sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print("Distribusi Sentimen:", sentiment_counts, sep="\n")
# The same counts expressed as a percentage of all rows in the dataframe.
print("\nPersentase Sentimen:", sentiment_counts / len(df) * 100, sep="\n")

# 8. Mengekstrak subset untuk validasi manual (20% data)

In [None]:
# Draw a reproducible 20% sample from every sentiment label for manual validation.
# GroupBy.sample is used instead of GroupBy.apply + DataFrame.sample: it keeps
# every column (including 'sentiment') and avoids the deprecated behaviour of
# apply operating on the grouping column. The fixed random_state keeps the
# draw deterministic, though the selected rows differ from the apply version.
validation_sample = df.groupby('sentiment').sample(frac=0.2, random_state=42)

In [None]:
# Write the validation subset to CSV without the index column, then report its size.
validation_sample.to_csv(path_or_buf='combined_data/validation_sample_20percent.csv', index=False)

print(f"\nJumlah data untuk validasi manual: {len(validation_sample)}")