In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the dataset; the CSV is decoded as ISO-8859-1 (Latin-1)
df = pd.read_csv('CIU_dataset.csv', encoding='ISO-8859-1')

# Hugging Face checkpoint that supplies both the classifier and its tokenizer
pretrained = "mdhugol/indonesia-bert-sentiment-classification"
model = AutoModelForSequenceClassification.from_pretrained(pretrained)
tokenizer = AutoTokenizer.from_pretrained(pretrained)

# Wrap the model and tokenizer in a callable sentiment-analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Map the pipeline's raw label names to the string codes returned by
# get_sentiment_label.
# NOTE(review): presumably '1' = positive, '0' = neutral, '-1' = negative --
# confirm against the model card.
label_index = {'LABEL_0': '1', 'LABEL_1': '0', 'LABEL_2': '-1'}

# Drop rows with NaN values in the "Tweet" column
df = df.dropna(subset=['Tweet'])

# Classify one text and return its sentiment code as a string
def get_sentiment_label(text):
    """Return the sentiment code for ``text``.

    The text is converted to ``str`` and cut into chunks of at most 512
    characters. Every chunk is classified, the confidence scores of the
    predicted labels are summed per label, and the label with the highest
    total wins. A text that fits in one chunk therefore gets exactly the
    label the pipeline predicts for it.

    Args:
        text: The value to classify; non-string values are stringified.

    Returns:
        The ``label_index`` code of the winning label, or ``'0'`` when the
        text is empty or contains only whitespace.
    """
    # Convert non-string values to string
    text = str(text)

    # Nothing to classify: without this guard an empty text yields no
    # segments and the lookup of a result would raise IndexError.
    if not text.strip():
        return '0'

    # Split the input text into segments of maximum length (in characters)
    max_length = 512
    segments = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    # Accumulate the confidence of each predicted label over all segments so
    # that a short trailing fragment cannot decide the result on its own.
    score_by_label = {}
    for segment in segments:
        # Segments are cut by characters, not tokens; truncation=True keeps
        # the encoded input within the model's maximum sequence length.
        prediction = sentiment_analysis(segment, truncation=True)[0]
        label = prediction['label']
        score_by_label[label] = score_by_label.get(label, 0.0) + prediction['score']

    best_label = max(score_by_label, key=score_by_label.get)
    return label_index[best_label]

# Apply sentiment analysis to each row in the "Tweet" column
df['Label'] = df['Tweet'].apply(get_sentiment_label)

# Save the labeled dataset to a new CSV file
output_path = 'labeled_CIU.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Report the path that was actually written; the message previously named
# a different file ('labeled_dataset.csv') than the one saved above.
print(f"Labeled dataset saved to '{output_path}'")


Labeled dataset saved to 'labeled_dataset.csv'
