In [1]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy
import json

In [38]:
df = pd.read_csv('dataset.csv')
df[['Comments', 'Sentimen']] = df[['Comments', 'Sentimen']].astype(str).fillna('')
df = df[['Comments', 'Sentimen']]
df.head(100)

Unnamed: 0,Comments,Sentimen
0,ngilu anjeng,netral
1,anjing ericko lim bisa ngomong baik adddduuu ...,netral
2,bang jangan kalah sama haters haters tu cuma ...,positif
3,gak peduli hatters kok bikin lagu,netral
4,gw ngerti lu sibuk ngurusin ytr tapi lu coba ...,netral
...,...,...
95,age restricted lah goblok,netral
96,ah aku ngehate ah biar masuk youtube,netral
97,ah bangsat lo anjing hahaha ngakak ada iklan q...,negatif
98,ah banyak iklan kontol,negatif


# **1. Preprocessing Data**

In [12]:
#Mengecek data duplikat
df = df.drop_duplicates(subset='Comments')
df.duplicated().sum()

np.int64(0)

In [13]:
#Mengecek data kosong
df = df.dropna()
df.isnull().sum()
df

Unnamed: 0,Comments,Sentimen
0,ngilu anjeng,netral
1,anjing ericko lim bisa ngomong baik adddduuu ...,netral
2,bang jangan kalah sama haters haters tu cuma ...,positif
3,gak peduli hatters kok bikin lagu,netral
4,gw ngerti lu sibuk ngurusin ytr tapi lu coba ...,netral
...,...,...
10959,zharborneo ikut rewind gk bang,netral
10960,zhiap bang saya juga,netral
10961,zigi wig tombol mode batas youtube,netral
10962,ziroaz didik nah emng main game main hp nonton...,negatif


Cleaning Data

In [14]:
def clean_data(text):
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Comments'] = df['Comments'].apply(clean_data)

Case Folding

In [15]:
#Case Folding
df['Comments'] = df['Comments'].str.lower()
df.head(100)

Unnamed: 0,Comments,Sentimen
0,ngilu anjeng,netral
1,anjing ericko lim bisa ngomong baik adddduuu g...,netral
2,bang jangan kalah sama haters haters tu cuma b...,positif
3,gak peduli hatters kok bikin lagu,netral
4,gw ngerti lu sibuk ngurusin ytr tapi lu coba b...,netral
...,...,...
96,ah aku ngehate ah biar masuk youtube,netral
97,ah bangsat lo anjing hahaha ngakak ada iklan q...,negatif
98,ah banyak iklan kontol,negatif
99,ah dasar kontol kontol kontol kontol sama kamu...,negatif


Normalization

In [16]:
def normalize(text):
    def load_normalization_dict():
        with open('normalization_dict.json', 'r') as file:
            normalization_dict = json.load(file)
        return normalization_dict
    normalization_dict = load_normalization_dict()
    for word, replacement in normalization_dict.items():
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, replacement, text)
    return text

df['Comments'] = df['Comments'].astype(str).apply(lambda x: normalize(x))
df.head(100)

Unnamed: 0,Comments,Sentimen
0,ngilu anjeng,netral
1,anjing ericko lim bisa ngomong baik adddduuu a...,netral
2,bang jangan kalah sama haters haters tu cuma b...,positif
3,tidak peduli hatters kok bikin lagu,netral
4,aku ngerti kamu sibuk ngurusin ytr tapi kamu c...,netral
...,...,...
96,ah aku ngehate ah biar masuk youtube,netral
97,ah bangsat kamu anjing hahaha tertawa ada ikla...,negatif
98,ah banyak iklan kontol,negatif
99,ah dasar kontol kontol kontol kontol sama kamu...,negatif


Stopword Removal

In [17]:
# Menghilangkan kata-kata stop (stopwords) menggunakan Sastrawi
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
more_stop_words = ['v', 'sih', 'kan', 'loh', 'duh', 'wah', 'yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'setelah', 'kami', 'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 'apakah', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya', 'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'ingin', 'juga', 'nggak', 'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja', 'toh', 'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun', 'sekali', 'jadi', 'nya']

stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)

new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword_removal(str_text):
    str_text = stop_words_remover_new.remove(str_text)
    return str_text

df['Comments'] = df['Comments'].apply(lambda x: stopword_removal(x))
df.head()

Unnamed: 0,Comments,Sentimen
0,ngilu anjeng,netral
1,anjing ericko lim ngomong baik adddduuu aku en...,netral
2,bang jangan kalah sama haters haters tu cuma b...,positif
3,peduli hatters kok bikin lagu,netral
4,aku ngerti kamu sibuk ngurusin ytr kamu coba b...,netral


Stemming

In [39]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def stemming(text_cleaning):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text_cleaning)

df['Stemmed_Comments'] = df['Comments'].astype(str).apply(stemming)
df['Stemmed_Comments'].to_csv('executed.csv', index=False, encoding='latin1')
df = pd.read_csv('executed.csv', encoding='latin1')
df

Unnamed: 0,Stemmed_Comments
0,ngilu anjeng
1,anjing ericko lim bisa ngomong baik adddduuu g...
2,bang jangan kalah sama haters haters tu cuma b...
3,gak peduli hatters kok bikin lagu
4,gw ngerti lu sibuk ngurusin ytr tapi lu coba b...
...,...
10959,zharborneo ikut rewind gk bang
10960,zhiap bang saya juga
10961,zigi wig tombol mode batas youtube
10962,ziroaz didik nah emng main game main hp nonton...


Tokenization

In [40]:
df['Tokenized_Comments'] = df['Stemmed_Comments'].astype(str).apply(lambda x:x.split())
df

Unnamed: 0,Stemmed_Comments,Tokenized_Comments
0,ngilu anjeng,"[ngilu, anjeng]"
1,anjing ericko lim bisa ngomong baik adddduuu g...,"[anjing, ericko, lim, bisa, ngomong, baik, add..."
2,bang jangan kalah sama haters haters tu cuma b...,"[bang, jangan, kalah, sama, haters, haters, tu..."
3,gak peduli hatters kok bikin lagu,"[gak, peduli, hatters, kok, bikin, lagu]"
4,gw ngerti lu sibuk ngurusin ytr tapi lu coba b...,"[gw, ngerti, lu, sibuk, ngurusin, ytr, tapi, l..."
...,...,...
10959,zharborneo ikut rewind gk bang,"[zharborneo, ikut, rewind, gk, bang]"
10960,zhiap bang saya juga,"[zhiap, bang, saya, juga]"
10961,zigi wig tombol mode batas youtube,"[zigi, wig, tombol, mode, batas, youtube]"
10962,ziroaz didik nah emng main game main hp nonton...,"[ziroaz, didik, nah, emng, main, game, main, h..."


**Translate**

In [12]:
%pip install translate

Note: you may need to restart the kernel to use updated packages.


In [41]:
# Ini yang indo ke indo
from translate import Translator

def translate_id(text):
    try:
        translator = Translator(to_lang="id", from_lang='id')
        translation = translator.translate(text)
        return translation
    except Exception as e:
        print(f"Error in translation: {e}")
        return text

df['Stemmed_Comments'] = df['Stemmed_Comments'].astype(str).dropna()
df['ID_Comments'] = df['Stemmed_Comments'].apply(translate_id)
# df[['ID_Comments']].to_csv('TranslatedSampleID.csv')

In [54]:
%pip install preprocessor
%pip install textblob
%pip install wordcloud
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [33]:
data = pd.DataFrame()
final_data = pd.DataFrame()

for index, row in df.iterrows():
    if row['Sentimen'] == 'positif':
        final = pd.concat([final, pd.DataFrame([row])])
    else:
        data = pd.concat([data, pd.DataFrame([row])])

# **2. Labeling**

In [20]:
dl = pd.read_csv('TranslatedSampleID.csv')
# dl = dl.drop('Unnamed: 0', axis=1)
dl.dropna()

Unnamed: 0,ID_Comments
0,rick kamu anak hasil kondom bocor
1,kakak sok pede
2,keren bang slow aku subscribe kontol eenak
3,apa tangan mu tato
4,muka kamu anjing aku rugi nonton youtube kamu ...
...,...
10632,awal mula yutuber suksess
10633,aihh sedih tukang numpang promosi tolol bego g...
10634,mampir chanelku yg suka bokeb
10635,makasih


In [19]:
with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

def labeling(text, kamus):
    words = set(text.split())
    kata_kasar = set(kamus.keys())  
    if words & kata_kasar:
        return 'kasar'
    else:
        return 'tidak kasar'
    
df['labeling'] = df['Stemmed_Comments'].astype(str).apply(lambda x: labeling(x, kamus))
df.to_csv('executed.csv', encoding='latin1')
df

Unnamed: 0,Stemmed_Comments,Tokenized_Comments,ID_Comments,labeling
0,rick kamu anak hasil kondom bocor,"[rick, kamu, anak, hasil, kondom, bocor]",rick kamu anak hasil kondom bocor,tidak kasar
1,kakak sok pede,"[kakak, sok, pede]",kakak sok pede,tidak kasar
2,keren bang slow aku subscribe kontol eenak,"[keren, bang, slow, aku, subscribe, kontol, ee...",keren bang slow aku subscribe kontol eenak,kasar
3,apa tangan mu tato,"[apa, tangan, mu, tato]",apa tangan mu tato,tidak kasar
4,muka kamu anjing aku rugi nonton youtube kamu ...,"[muka, kamu, anjing, aku, rugi, nonton, youtub...",muka kamu anjing aku rugi nonton youtube kamu ...,kasar
...,...,...,...,...
10632,awal mula yutuber suksess,"[awal, mula, yutuber, suksess]",awal mula yutuber suksess,tidak kasar
10633,aihh sedih tukang numpang promosi tolol bego g...,"[aihh, sedih, tukang, numpang, promosi, tolol,...",aihh sedih tukang numpang promosi tolol bego g...,kasar
10634,mampir chanelku yg suka bokeb,"[mampir, chanelku, yg, suka, bokeb]",mampir chanelku yg suka bokeb,tidak kasar
10635,makasih,[makasih],makasih,tidak kasar


Pembagian dataset

In [20]:
from sklearn.model_selection import train_test_split

X = df['Stemmed_Comments'].astype(str).dropna() 
y = df['labeling']           

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Jumlah data latih: {len(X_train)}")
print(f"Jumlah data uji: {len(X_test)}")


Jumlah data latih: 8509
Jumlah data uji: 2128


In [21]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"Dimensi X_train_tfidf: {X_train_tfidf.shape}")
print(f"Dimensi X_test_tfidf: {X_test_tfidf.shape}")


Dimensi X_train_tfidf: (8509, 8579)
Dimensi X_test_tfidf: (2128, 8579)


In [23]:
with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

kata_kasar = set(kamus.keys())

def filter_kata_kasar(text, kamus_keys):
    words = str(text).split()
    filtered_words = [word for word in words if word in kamus_keys]
    return ' '.join(filtered_words)

df['Filtered_Comments'] = df['Stemmed_Comments'].apply(lambda x: filter_kata_kasar(x, kata_kasar))
df.to_csv('executed.csv', index=False, encoding='latin1')
# df[['Stemmed_Comments', 'Filtered_Comments']]
df

Unnamed: 0,Stemmed_Comments,Tokenized_Comments,ID_Comments,labeling,Filtered_Comments
0,rick kamu anak hasil kondom bocor,"[rick, kamu, anak, hasil, kondom, bocor]",rick kamu anak hasil kondom bocor,tidak kasar,
1,kakak sok pede,"[kakak, sok, pede]",kakak sok pede,tidak kasar,
2,keren bang slow aku subscribe kontol eenak,"[keren, bang, slow, aku, subscribe, kontol, ee...",keren bang slow aku subscribe kontol eenak,kasar,kontol
3,apa tangan mu tato,"[apa, tangan, mu, tato]",apa tangan mu tato,tidak kasar,
4,muka kamu anjing aku rugi nonton youtube kamu ...,"[muka, kamu, anjing, aku, rugi, nonton, youtub...",muka kamu anjing aku rugi nonton youtube kamu ...,kasar,anjing anjing
...,...,...,...,...,...
10632,awal mula yutuber suksess,"[awal, mula, yutuber, suksess]",awal mula yutuber suksess,tidak kasar,
10633,aihh sedih tukang numpang promosi tolol bego g...,"[aihh, sedih, tukang, numpang, promosi, tolol,...",aihh sedih tukang numpang promosi tolol bego g...,kasar,tolol bego bangsat
10634,mampir chanelku yg suka bokeb,"[mampir, chanelku, yg, suka, bokeb]",mampir chanelku yg suka bokeb,tidak kasar,
10635,makasih,[makasih],makasih,tidak kasar,


Hitung Skor TF-IDF per kata kasar

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

def filter_kata_kasar(text, kata_kasar):
    words = str(text).split()
    filtered_words = [word for word in words if word in kata_kasar]
    return ' '.join(filtered_words)

vectorizer = TfidfVectorizer()

# Hitung tf-idf bagian kata kasar
tfidf_matrix = vectorizer.fit_transform(df['Filtered_Comments'])

# Perhitungan skor tf-idf untuk setiap kata kasar
features = vectorizer.get_feature_names_out()
scores = tfidf_matrix.toarray()

data = []
for doc_idx, doc_scores in enumerate(scores):
    tokens = [features[i] for i in range(len(features)) if doc_scores[i] > 0]
    tfidf_scores = [round(doc_scores[i], 2) for i in range(len(features)) if doc_scores[i] > 0]
    
    #  setiap kata dan skor, akan dibuat baris baru
    for token, score in zip(tokens, tfidf_scores):
        data.append({"Comments": token, "TF-IDF": score})

df_tfidf = pd.DataFrame(data)
df_tfidf = df_tfidf[df_tfidf['Comments'].str.len() > 0].dropna().drop_duplicates(subset='Comments')
df_tfidf.to_csv("skor_tfidf.csv", index=False)
df = pd.read_csv("skor_tfidf.csv", encoding='latin1')
df

Unnamed: 0,Comments,TF-IDF
0,kontol,1.00
1,anjing,1.00
6,babi,0.66
8,memek,0.52
11,goblok,0.36
...,...,...
2778,perek,0.86
2801,bangkai,1.00
3037,tuyul,0.80
3102,wong,1.00


# **3. Word Replacement**

In [11]:
with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

kata_kasar = set(kamus)

def replace_kata_kasar(text, kamus):
    words = text.split()  
    word_replacement = [kamus[key] if key in kamus else key for key in words]
    return ' '.join(word_replacement)

df = pd.read_csv('executed.csv')
df['Comments'] = df['Stemmed_Comments'].astype(str)
df['Fixed_Comments']=df['Stemmed_Comments'].astype(str).apply(lambda x: replace_kata_kasar(x, kamus))
df[['Comments', 'Fixed_Comments']].to_csv('hasil.csv', encoding='latin1')

dfc = pd.read_csv('hasil.csv')
dfc


Evaluasi Model

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words_id = list(set(stopwords.words('indonesian')))

data = pd.read_csv('hasil.csv')

data = data.dropna(subset=['Comments', 'Replaced_Comments'])
print(data.isnull().sum())

X = data['Comments'] 
y = data['Replaced_Comments']    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(
    max_features=5000,   # Batas jumlah fitur
    min_df=2,            # Kata muncul minimal 2 kali
    max_df=0.95,         # Kata muncul di <=95% dokumen
    stop_words=stop_words_id
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print(f"Akurasi: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# **Eksekusi dalam input**

Testing Function

In [12]:
def execute(word):
    word = clean_data(word)
    word = normalize(word)
    word = stemming(word)
    word = translate_id(word)
    word = replace_kata_kasar(word, kamus)
    return word

contoh = 'Dasar manusia tolol'
execute(contoh)

dasar manusia tidak pengertian


In [None]:
%pip install dill

Saving Function

In [42]:
import dill

def clean_data(text):
    import re
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
with open('clean_data.sav', 'wb') as file:
    dill.dump(clean_data, file)

def load_normalization_dict():
    with open('normalization_dict.json', 'r') as file:
        normalization_dict = json.load(file)
    return normalization_dict

def normalize(text):
    import re, json
    def load_normalization_dict():
        with open('normalization_dict.json', 'r') as file:
            normalization_dict = json.load(file)
        return normalization_dict
    normalization_dict = load_normalization_dict()
    for word, replacement in normalization_dict.items():
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, replacement, text)
    return text
with open('normalize.sav', 'wb') as file:
    dill.dump(normalize, file)

def stemming(text_cleaning):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text_cleaning)
with open('stemming.sav', 'wb') as file:
    dill.dump(stemming, file)

def translate_id(text):
    try:
        translator = Translator(to_lang="id", from_lang='id')
        translation = translator.translate(text)
        return translation
    except Exception as e:
        print(f"Error in translation: {e}")
        return text
with open('translate_id.sav', 'wb') as file:
    dill.dump(translate_id, file)
    
def replace_kata_kasar(text):
    import json
    with open('kamus_kasar.json', 'r') as file:
        kamus = json.load(file)
        kata_kasar = set(kamus.keys())
        words = text.split()
        word_replacement = [kamus[key] if key in kata_kasar else key for key in words]
    return ' '.join(word_replacement)
with open('replace_kata_kasar.sav', 'wb') as file:
    dill.dump(replace_kata_kasar, file) 

In [36]:
import dill

def execute(word):
    word = clean_data(word)
    word = normalize(word)
    word = stemming(word)
    word = translate_id(word)
    word = replace_kata_kasar(word, kamus)
    return word

with open('execute_function.sav', 'wb') as file:
    dill.dump(execute, file)