# Inference

In [3]:
import json
import re

import nltk
nltk.download('punkt_tab')

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Location of the informal-to-formal (slang normalization) dictionary.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
INFORMAL_TO_FORMAL_PATH = "/home/atmatech/task/sentiment-nawatech/informal_to_formal.json"

# Load the informal-to-formal dictionary. The encoding is passed explicitly so
# that reading the JSON file does not depend on the platform's locale default.
with open(INFORMAL_TO_FORMAL_PATH, encoding="utf-8") as json_file:
    dictionaries = json.load(json_file)

# Load the Sastrawi stopword list, then take the words in `important_words`
# out of it so they are not dropped during stopword removal
# (presumably because they carry sentiment — confirm with the training setup).
factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()
important_words = {'bisa', 'tidak', 'lebih', 'baik', 'buruk', 'suka', 'benci', 'cinta', 'senang', 'marah', 'kesal', 'bagus', 'jelek'}
stopwords = [word for word in stopwords if word not in important_words]

# Initialize the Sastrawi stemmer.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def cleansing(text):
    """Clean and normalize a raw text into a string of stemmed tokens.

    The steps run in this order: remove ``<...>`` special tokens, drop the
    ``#`` of hashtags (keeping the word), remove links starting with ``http``,
    remove ``@mentions``, remove the ``RT`` marker, replace punctuation with
    spaces, replace newlines with spaces, lowercase, tokenize with NLTK, map
    tokens through the module-level ``dictionaries``, drop tokens found in the
    module-level ``stopwords``, join the tokens with single spaces and stem
    the result with the module-level ``stemmer``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned text, with leading and trailing whitespace removed.

    Raises:
        TypeError: If ``text`` is not a ``str``.
    """
    # Fail early with a clear message; without this check the first regex
    # substitution raises the same exception type with a less helpful message.
    if not isinstance(text, str):
        raise TypeError(
            f"cleansing() expects a str, got {type(text).__name__}"
        )

    # 1. Remove special tokens written as <...>
    def strip_special_tokens(text):
        return re.sub(r'<[^<>]+>', '', text)

    # 2. Remove the hashtag symbol (#) while keeping the word that follows it
    def strip_hashtags(text):
        return re.sub(r"#(\w+)", r"\1", text)

    # 3.a Remove links
    def strip_links(text):
        return re.sub(r'http\S+', '', text)

    # 3.b Remove mentions (@username)
    def strip_mention(text):
        return re.sub(r'@[A-Za-z0-9_]+', '', text)

    # 3.c Remove the RT (retweet) marker. The pattern is case-sensitive, so
    #     this step has to run before lowercasing.
    def strip_retweet(text):
        return re.sub(r'\bRT\b[\s]*', '', text)

    # 3.d Replace punctuation with a space
    def strip_punctuation(text):
        return re.sub(r'[^\w\s]', ' ', text)

    # 3.e Replace newline characters with a space
    def strip_newline(text):
        return text.replace('\n', ' ')

    # 4. Lowercase the text
    def case_folding(text):
        return text.lower()

    # 5. Split the text into a list of tokens
    def tokenizer(text):
        return nltk.word_tokenize(text)

    # 6. Normalize informal/slang tokens using the slang dictionary.
    #    A replacement value is kept as a single token even if it contains
    #    spaces, so the stopword filter below compares the whole value.
    def informal_to_formal(tokens):
        return [dictionaries.get(word, word).lower() for word in tokens]

    # 7. Remove stopwords (using the customized stopword list) and empty tokens
    def remove_stopwords(tokens):
        return [word for word in tokens if word not in stopwords and word != '']

    # 8. Join the token list back into a single string
    def detokenizer(tokens):
        return ' '.join(tokens)

    # 9. Stem the text with the module-level stemmer
    def stemming(text):
        return stemmer.stem(text)

    # Cleaning (text level); the order of the steps is significant.
    text_level_steps = (
        strip_special_tokens,
        strip_hashtags,
        strip_links,
        strip_mention,
        strip_retweet,
        strip_punctuation,
        strip_newline,
        case_folding,
    )
    for step in text_level_steps:
        text = step(text)

    # Tokenization
    tokens = tokenizer(text)

    # Transformation (token level)
    tokens = informal_to_formal(tokens)
    tokens = remove_stopwords(tokens)

    # Detokenization
    text = detokenizer(tokens)

    # Transformation (text level)
    text = stemming(text)

    # Drop extra whitespace at the start and end of the text.
    return text.strip()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/atmatech/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import joblib

# Load the trained model and the fitted vectorizer.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
svm_model = joblib.load("svm-sentiment-model.joblib")
tfidf = joblib.load("TF-IDF.joblib")

# Input text
text_baru = "Lebih baik <PROVIDER_NAME>"

# Preprocessing
text_baru_clean = cleansing(text_baru)
print("Teks setelah cleansing:", text_baru_clean)

# Cleansing can remove every token (e.g. an input made only of special tokens
# or stopwords). The model would still return a label for such an input, so
# make the situation visible instead of reporting it silently.
if not text_baru_clean:
    print("Peringatan: teks kosong setelah cleansing, prediksi tidak dapat diandalkan.")

# Transform to TF-IDF features
text_baru_vec = tfidf.transform([text_baru_clean])

# Convert to a dense array; per the original note, the model was trained on
# the output of .toarray().
text_baru_dense = text_baru_vec.toarray()

# Predict (single sample, so take the first element)
prediksi = svm_model.predict(text_baru_dense)[0]

# Interpret the result: label 1 is reported as positive, anything else as
# negative.
# NOTE(review): assumes binary labels with 1 = positive — confirm against the
# label encoding used during training.
hasil = "positive" if prediksi == 1 else "negative"
print("Prediksi Sentimen:", hasil)


Teks setelah cleansing: lebih baik
Prediksi Sentimen: positive
