In [11]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords as stopwords_scratch
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import emoji
import matplotlib.pyplot as plt

In [12]:
feature_bow = pickle.load(open("./model/feature-bow.p",'rb'))
model_nb = pickle.load(open('./model/model-nb.p', 'rb'))
model_nn = pickle.load(open('./model/model-nn.p', 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
list_stopwords = stopwords_scratch.words('indonesian')

list_stopwords_en = stopwords_scratch.words('english')

list_stopwords.extend(list_stopwords_en)

# Tambah daftar stopword
list_stopwords.extend(['apa', 'yang', 'ini', 'itu', 'haha', 'hehe', 'dong', 'mah','nih', 'kok', 'ya', 'yg', 'si', 'kan', 'gak', 'deh', 'tuh','ga', 'aja', 'yuk', 'dah', 'ngga', 'engga', 'yah', 'gak', 'nya', 'kali'])

# Buat DataFrame dari list stopwords
stopwords_df = pd.DataFrame(list_stopwords, columns=['stopword'])

# Simpan DataFrame sebagai CSV
stopwords_df.to_csv('stopword_filter.csv', index=False, header=False)

In [14]:
df = pd.read_csv('./data/commentsvid.csv')
stop_words = pd.read_csv('./data/stopword_filter.csv')
df.head()

Unnamed: 0,comment
0,Gw yang niat edukasi penyakit yang langka aja ...
1,"aku pernah dikatain ""ih gay gak suka cewek, pa..."
2,trending nya joget2 atau lucu2 sama kayak di n...
3,kalau bikin konten gimmick menurut lu gimana. ...
4,Mau heran tapi lupa ini di negeri indo😂


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1360 non-null   object
dtypes: object(1)
memory usage: 10.8+ KB


In [16]:
df = df.convert_dtypes()

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1360 non-null   string
dtypes: string(1)
memory usage: 10.8 KB


In [18]:
df = df[df['comment'].str.strip() != '']

In [19]:
df.isna().sum()

comment    0
dtype: int64

In [20]:
# Initialize Stemmer for Bahasa Indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [21]:
def cleaning(comment):
    # 1. Hapus emoji
    comment = emoji.replace_emoji(comment, replace="")
    
    # 2. Hapus mention (@username)
    comment = re.sub(r'@\w+', '', comment)
    
    # 3. Hapus nama orang (asumsi nama dengan kapitalisasi huruf besar pertama)
    comment = re.sub(r'\b[A-Z][a-z]*\b', '', comment)
    
    # 4. Hapus URL dan karakter non-alfabet (angka, tanda baca, simbol)
    comment = re.sub(r'http\S+|www\S+|https\S+|[^a-zA-Z\s]', '', comment)
    
    # 5. Konversi ke lowercase
    comment = comment.lower()
    
    # 6. Hapus karakter berulang (misalnya "soooo" menjadi "soo")
    comment = re.sub(r'(.)\1+', r'\1\1', comment)
    
    # 7. Hapus spasi berlebih
    comment = re.sub(r'\s+', ' ', comment).strip()
    
    tokens = word_tokenize(comment)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Remove very short words (less than 2 characters)
    tokens = [word for word in tokens if len(word) > 2]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back to a single string
    return ' '.join(tokens)

In [22]:
df['cleaned_comment'] = df['comment'].apply(cleaning)

print(df[['comment', 'cleaned_comment']].head())

                                             comment  \
0  Gw yang niat edukasi penyakit yang langka aja ...   
1  aku pernah dikatain "ih gay gak suka cewek, pa...   
2  trending nya joget2 atau lucu2 sama kayak di n...   
3  kalau bikin konten gimmick menurut lu gimana. ...   
4            Mau heran tapi lupa ini di negeri indo😂   

                                     cleaned_comment  
0  yang niat edukasi sakit yang langka aja malah ...  
1  aku pernah dikatain gay gak suka cewek padahal...  
2  trending nya joget atau lucu sama kayak negara...  
3  kalau bikin konten gimmick turut gimana kayak ...  
4                    heran tapi lupa ini negeri indo  


In [23]:
print("Jumlah data duplikat:", df['cleaned_comment'].duplicated().sum())

df = df.drop_duplicates(subset='cleaned_comment')

Jumlah data duplikat: 70


In [24]:
df.to_csv('./data/cleaned_comments.csv', index=False)

In [25]:
clean_df = pd.read_csv('./data/cleaned_comments.csv')

In [26]:
clean_df = clean_df.dropna(subset=['cleaned_comment'])

In [27]:
def predict_sentiment(sent):
    text=str(sent)
    # feature extraction
    text_feature = feature_bow.transform([text])
    # predict
    return model_nb.predict(text_feature)[0]

In [28]:
clean_df['predicted_sentiment'] = clean_df.cleaned_comment.apply(predict_sentiment)

clean_df.head()

Unnamed: 0,comment,cleaned_comment,predicted_sentiment
0,Gw yang niat edukasi penyakit yang langka aja ...,yang niat edukasi sakit yang langka aja malah ...,negative
1,"aku pernah dikatain ""ih gay gak suka cewek, pa...",aku pernah dikatain gay gak suka cewek padahal...,negative
2,trending nya joget2 atau lucu2 sama kayak di n...,trending nya joget atau lucu sama kayak negara...,negative
3,kalau bikin konten gimmick menurut lu gimana. ...,kalau bikin konten gimmick turut gimana kayak ...,negative
4,Mau heran tapi lupa ini di negeri indo😂,heran tapi lupa ini negeri indo,neutral


In [29]:
clean_df.to_csv('./data/comments_predicted_sentiment.csv', index=False)