# In [11]:
import pandas as pd
import numpy as np
import re
import string
import requests
import csv
from io import StringIO, BytesIO
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class TextPreprocessor:
    """Clean, normalize, tokenize and lexicon-label free text.

    Constructing an instance downloads the NLTK tokenizer / stop-word data,
    two sentiment lexicons (word -> integer weight) and a slang-normalization
    dictionary. The lexicons and the slang dictionary are best-effort: when a
    download or parse fails a message is printed and an empty mapping is used
    instead, so the pipeline still runs (with degraded results).
    """

    # Seconds to wait for each remote resource before giving up; without a
    # timeout ``requests.get`` can block indefinitely.
    REQUEST_TIMEOUT = 30

    SLANG_URL = "https://github.com/MichaelAdi434/Project-Analisis-Sentimen/raw/d21c7566deca33e2871f160f19728f39d5fd273d/kamuskatabaku.xlsx"
    LEXICON_POSITIVE_URL = 'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv'
    LEXICON_NEGATIVE_URL = 'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv'

    # Mentions, hashtags, the retweet marker, URLs and digit runs.
    # - ``_`` is part of the mention/hashtag character class so that
    #   "@user_name" is removed as a whole instead of leaving "name" behind.
    # - ``\b`` before ``RT`` keeps words that merely end in "RT" intact
    #   (e.g. "SMART phone").
    _NOISE_PATTERN = re.compile(
        r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+|\bRT\s|http\S+|[0-9]+'
    )
    # Built once; deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = self._get_stopwords()
        self.lexicon_positive, self.lexicon_negative = self._get_lexicons()
        self.kamus_slang = self._load_remote_slang()

    def _get_stopwords(self) -> set:
        """Return the union of NLTK's Indonesian and English stop words.

        Also triggers the NLTK downloads needed by ``word_tokenize``.
        """
        import nltk
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('punkt_tab')
        stop_words = set(stopwords.words('indonesian'))
        stop_words.update(stopwords.words('english'))
        return stop_words

    @staticmethod
    def _slang_table_to_dict(df_slang) -> dict:
        """Build a ``slang -> standard form`` dict from the first two columns.

        Rows with a missing value in either column are dropped and both sides
        are converted to stripped strings, so every value can safely be fed to
        ``str.join`` in ``handle_slang``.
        """
        if df_slang.shape[1] < 2:
            return {}
        pairs = df_slang.iloc[:, :2].dropna()
        return {
            str(slang).strip(): str(baku).strip()
            for slang, baku in zip(pairs.iloc[:, 0], pairs.iloc[:, 1])
        }

    def _load_remote_slang(self) -> dict:
        """Download the slang dictionary spreadsheet and return it as a dict.

        Returns an empty dict (after printing a message) when the download
        or the spreadsheet parsing fails.
        """
        try:
            response = requests.get(self.SLANG_URL, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 200:
                df_slang = pd.read_excel(BytesIO(response.content))
                return self._slang_table_to_dict(df_slang)
            print("Gagal mengambil kamus dari GitHub, menggunakan kamus kosong.")
            return {}
        except Exception as e:
            # Deliberately broad: besides network errors, ``read_excel`` can
            # fail in several ways (missing engine, corrupt file, ...), and
            # the slang dictionary is optional.
            print(f"Error saat memuat kamus: {e}")
            return {}

    @staticmethod
    def _parse_lexicon(csv_text: str) -> dict:
        """Parse ``word,weight`` CSV text into a ``word -> int`` dict.

        Rows that have fewer than two columns or a non-integer weight (for
        example a header row or a blank line) are skipped instead of
        aborting the whole load.
        """
        lexicon = {}
        for row in csv.reader(StringIO(csv_text)):
            if len(row) < 2:
                continue
            try:
                lexicon[row[0]] = int(row[1])
            except ValueError:
                continue
        return lexicon

    def _fetch_lexicon(self, url: str) -> dict:
        """Download one lexicon CSV; return ``{}`` if it cannot be fetched."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Error saat memuat lexicon: {e}")
            return {}
        if response.status_code != 200:
            print("Gagal mengambil lexicon, menggunakan lexicon kosong.")
            return {}
        return self._parse_lexicon(response.text)

    def _get_lexicons(self) -> tuple:
        """Return ``(positive_lexicon, negative_lexicon)`` as two dicts."""
        pos = self._fetch_lexicon(self.LEXICON_POSITIVE_URL)
        neg = self._fetch_lexicon(self.LEXICON_NEGATIVE_URL)
        return pos, neg

    def clean_text(self, text) -> str:
        """Strip mentions, hashtags, "RT", URLs, digits and punctuation.

        The input is converted with ``str()`` first; the result is
        lower-cased and stripped of leading/trailing whitespace.
        """
        text = self._NOISE_PATTERN.sub('', str(text))
        text = text.translate(self._PUNCTUATION_TABLE)
        return text.lower().strip()

    def handle_slang(self, text: str) -> str:
        """Replace each whitespace-separated word found in the slang dict."""
        lookup = self.kamus_slang.get
        return ' '.join(lookup(kata, kata) for kata in text.split())

    def process_tokens(self, text: str) -> list:
        """Tokenize ``text`` and drop stop words."""
        return [t for t in word_tokenize(text) if t not in self.stop_words]

    def get_sentiment(self, tokens) -> str:
        """Label ``tokens`` by the summed lexicon weights.

        The weights from both lexicons are added together (the negative
        lexicon is presumably stored with negative weights - verify against
        the data). A total of zero or more yields ``'positive'``, otherwise
        ``'negative'``.
        """
        # Single pass so that one-shot iterables are scored correctly too.
        score = sum(
            self.lexicon_positive.get(t, 0) + self.lexicon_negative.get(t, 0)
            for t in tokens
        )
        return 'positive' if score >= 0 else 'negative'

    def run_pipeline(self, df, text_column='content'):
        """Run the full preprocessing pipeline over ``df[text_column]``.

        Rows whose text is missing and fully duplicated rows are removed.
        The caller's DataFrame is not modified.

        Returns:
            A DataFrame with two columns: ``text_akhir`` (the space-joined
            tokens) and ``label`` (``'positive'`` / ``'negative'``).
        """
        # Explicit copy: the intermediate columns are added to our own frame,
        # never to a view of the caller's data (avoids SettingWithCopyWarning).
        df = df.dropna(subset=[text_column]).drop_duplicates().copy()
        df['clean_text'] = df[text_column].apply(self.clean_text)
        df['normalized'] = df['clean_text'].apply(self.handle_slang)
        df['tokens'] = df['normalized'].apply(self.process_tokens)
        df['text_akhir'] = df['tokens'].apply(' '.join)
        df['label'] = df['tokens'].apply(self.get_sentiment)

        print("Preprocessing selesai!")
        return df[['text_akhir', 'label']]

