In [2]:
import pandas as pd
import re
import seaborn as sns
import numpy
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words_id = list(set(stopwords.words('indonesian')))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
df = pd.read_csv('dataset.csv')
df[['text', 'sentiment']] = df[['text', 'sentiment']].astype(str).fillna('')
df = df[['text', 'sentiment']]
df.head(100)

Unnamed: 0,text,sentiment
0,haters bacot tapi masih nonton,negatif
1,mantap kontol,negatif
2,setan,negatif
3,tai,negatif
4,titit,negatif
...,...,...
95,akhirnya opening yg sangat gw tunggu,negatif
96,akhirnya pake intro ini lagi v,positif
97,akhirnya pake intro lama lagi,positif
98,akhirnya pake intro legend lagi ea,positif


# **1. Preprocessing Data**

In [60]:
df = df.drop_duplicates(subset='text')
df.duplicated().sum()

np.int64(0)

In [61]:
df = df.dropna()
df.isnull().sum()
df

Unnamed: 0,text,sentiment
0,haters bacot tapi masih nonton,negatif
1,mantap kontol,negatif
2,setan,negatif
3,tai,negatif
4,titit,negatif
...,...,...
10914,yg komen beli subcriber bego kali y mikir kamu...,negatif
10915,yo bangsat cakep aku suka kamu semangat bang t...,negatif
10916,yo lah anak pantek ang mah,negatif
10917,youtube emang ada atur buat konten yg baik did...,negatif


Cleaning Data

In [62]:
def clean_data(text):
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['text'].apply(clean_data)

Case Folding

In [63]:
#Case Folding
df['text'] = df['text'].str.lower()
df

Unnamed: 0,text,sentiment
0,haters bacot tapi masih nonton,negatif
1,mantap kontol,negatif
2,setan,negatif
3,tai,negatif
4,titit,negatif
...,...,...
10914,yg komen beli subcriber bego kali y mikir kamu...,negatif
10915,yo bangsat cakep aku suka kamu semangat bang t...,negatif
10916,yo lah anak pantek ang mah,negatif
10917,youtube emang ada atur buat konten yg baik did...,negatif


Normalization

In [64]:
def normalize(text):
    def load_normalization_dict():
        with open('normalization_dict.json', 'r') as file:
            normalization_dict = json.load(file)
        return normalization_dict
    normalization_dict = load_normalization_dict()
    for word, replacement in normalization_dict.items():
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, replacement, text)
    return text

df['text'] = df['text'].astype(str).apply(lambda x: normalize(x))
df

Unnamed: 0,text,sentiment
0,pembenci bacot tapi masih nonton,negatif
1,mantap kontol,negatif
2,setan,negatif
3,tai,negatif
4,titit,negatif
...,...,...
10914,yg komentar beli subcriber bego kali y pikir k...,negatif
10915,yo bangsat cakep aku suka kamu semangat bang t...,negatif
10916,yo lah anak pantek ang mah,negatif
10917,youtube emang ada atur buat konten yg baik did...,negatif


Stopword Removal

In [65]:
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
more_stop_words = ['kok', 'cuk','v', 'sih', 'kan', 'loh', 'duh', 'wah', 'yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti', 'jika', 'sehingga', 'kembali', 'dan', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'setelah', 'kami', 'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'supaya', 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'apakah', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya', 'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'ingin', 'juga', 'nggak', 'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja', 'toh', 'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun', 'sekali', 'jadi', 'nya']
stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)

new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword_removal(str_text):
    str_text = stop_words_remover_new.remove(str_text)
    return str_text

df['text'] = df['text'].apply(lambda x: stopword_removal(x))
df

Unnamed: 0,text,sentiment
0,pembenci bacot masih nonton,negatif
1,mantap kontol,negatif
2,setan,negatif
3,tai,negatif
4,titit,negatif
...,...,...
10914,yg komentar beli subcriber bego kali y pikir k...,negatif
10915,yo bangsat cakep aku suka kamu semangat bang t...,negatif
10916,yo lah anak pantek ang mah,negatif
10917,youtube emang atur buat konten yg baik didik y...,negatif


Tokenizing

In [66]:
df['tokenized'] = df['text'].astype(str).apply(lambda x:x.split())
df

Unnamed: 0,text,sentiment,tokenized
0,pembenci bacot masih nonton,negatif,"[pembenci, bacot, masih, nonton]"
1,mantap kontol,negatif,"[mantap, kontol]"
2,setan,negatif,[setan]
3,tai,negatif,[tai]
4,titit,negatif,[titit]
...,...,...,...
10914,yg komentar beli subcriber bego kali y pikir k...,negatif,"[yg, komentar, beli, subcriber, bego, kali, y,..."
10915,yo bangsat cakep aku suka kamu semangat bang t...,negatif,"[yo, bangsat, cakep, aku, suka, kamu, semangat..."
10916,yo lah anak pantek ang mah,negatif,"[yo, lah, anak, pantek, ang, mah]"
10917,youtube emang atur buat konten yg baik didik y...,negatif,"[youtube, emang, atur, buat, konten, yg, baik,..."


Stemming

In [68]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def stemming(text_cleaning):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text_cleaning)

df['stemmed'] = df['text'].astype(str).apply(stemming)
df[['stemmed', 'sentiment']].to_csv('executed.csv', index=False, encoding='latin1')

In [None]:
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import pandas as pd

# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

# def stemming(text_cleaning):
#     try:
#         if not text_cleaning or pd.isna(text_cleaning):
#             return ""
#         return stemmer.stem(text_cleaning)
#     except Exception as e:
#         print(f"Error processing text: {text_cleaning}. Error: {e}")
#         return ""

# df['stemmed'] = df['tokenized'].astype(str).apply(stemming)
# df['stemmed'].to_csv('executed.csv', index=False, encoding='latin1')

In [None]:
# from Sastrawi.Stenner.StemmerFactory import StemmerFactory

# def stemming(text_cleaning):
#     factory = StemmerFactory()
#     stemmer = factory.create_stener()
#     do = []
#     for w in text_cleaning:
#         dt = stemmer.stem(w)
#         do.append(dt)
#     d_clean = []
#     d_clean = " ".join(do)
#     print(d_clean)
#     return d_clean

# tokenized = tokenized.apply(stemming)
# tokenized.to_csv("", index=False)

In [8]:
df = pd.read_csv('executed.csv', encoding='latin1')
# df = df.dropna()
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,stemmed,sentiment
0,benci bacot masih nonton,-1
1,mantap kontol,-1
2,setan,-1
3,tai,-1
4,titit,-1
...,...,...
10913,yg komentar beli subcriber bego kali y pikir k...,-1
10914,yo bangsat cakep aku suka kamu semangat bang t...,-1
10915,yo lah anak pantek ang mah,-1
10916,youtube emang atur buat konten yg baik didik y...,-1


**Translate**

In [None]:
%pip install translate

Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install translate
from translate import Translator

def translate_id(text):
    try:
        translator = Translator(from_lang='en', to_lang="id")
        translation = translator.translate(text)
        return translation
    except Exception as e:
        print(f"Error in translation: {e}")
        return text

df['stemmed'] = df['stemmed'].astype(str).apply(translate_id)
df['stemmed'].to_csv('TranslatedSampleID.csv')

In [None]:
%pip install preprocessor
%pip install textblob
%pip install wordcloud
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
df = pd.read_csv('executed.csv', encoding='latin1')
# df = df.drop('Unnamed: 0', axis=1)
df


Unnamed: 0,stemmed,sentiment,labeling,filtered
0,benci bacot nonton,-1,kasar,bacot
1,mantap kontol,-1,kasar,kontol
2,setan,-1,kasar,setan
3,tai,-1,kasar,tai
4,titit,-1,kasar,titit
...,...,...,...,...
10913,yg komentar beli subcriber bego kali y pikir c...,-1,kasar,bego bego
10914,yo bangsat cakep suka semangat bang tetep duku...,-1,kasar,bangsat
10915,yo anak pantek ang mah,-1,kasar,pantek
10916,youtube emang atur konten yg didik youtube atu...,-1,kasar,tolol


In [9]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    text = text.lower().strip()             
    words = text.split()                    
    words = [word for word in words if word not in stop_words_id]
    return ' '.join(words)

df['stemmed'] = df['stemmed'].apply(clean_text)

# **Model 1**
(Positif/Netral/Negatif)

 **2. Labeling**

In [130]:
label = {'positif': 1, 'netral':0, 'negatif': -1}
df['sentiment'] = df['sentiment'].map(label)

In [11]:
X = df['stemmed']
y = df['sentiment']

Pembagian test dan train

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**3. TF-IDF Vectorization**

In [13]:
vectorizer = TfidfVectorizer(stop_words=stop_words_id)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words=stop_words_id)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train, y_train)

**Akurasi**

In [None]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Prediksi Kalimat Baru

In [None]:
def prediksiKalimatBaru(text):
    teksBaru = vectorizer.transform([text])
    prediksi = model.predict(teksBaru)

    if prediksi[0] == 1: 
        sentiment = "positif"
    elif prediksi[0] == 0: 
        sentiment = "netral"
    else: 
        sentiment = "negatif"
    return sentiment

Contoh Penggunaan

In [None]:
contohKalimat = "saya membeli anjing"
print(f"Teks: '{contohKalimat}' => Sentimen: {prediksiKalimatBaru(contohKalimat)}")

# **Model 2**
(kasar/tidak kasar)

 **1. Labeling**

In [None]:
with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

def labeling(text, kamus):
    words = set(text.split())
    kata_kasar = set(kamus.keys())  
    if words & kata_kasar:
        return 'kasar'
    else:
        return 'tidak kasar'
    
df['labeling'] = df['stemmed'].astype(str).apply(lambda x: labeling(x, kamus))
df.to_csv('labeled.csv', encoding='latin1')
df

Unnamed: 0,Stemmed_Comments,Tokenized_Comments,ID_Comments,labeling
0,rick kamu anak hasil kondom bocor,"[rick, kamu, anak, hasil, kondom, bocor]",rick kamu anak hasil kondom bocor,tidak kasar
1,kakak sok pede,"[kakak, sok, pede]",kakak sok pede,tidak kasar
2,keren bang slow aku subscribe kontol eenak,"[keren, bang, slow, aku, subscribe, kontol, ee...",keren bang slow aku subscribe kontol eenak,kasar
3,apa tangan mu tato,"[apa, tangan, mu, tato]",apa tangan mu tato,tidak kasar
4,muka kamu anjing aku rugi nonton youtube kamu ...,"[muka, kamu, anjing, aku, rugi, nonton, youtub...",muka kamu anjing aku rugi nonton youtube kamu ...,kasar
...,...,...,...,...
10632,awal mula yutuber suksess,"[awal, mula, yutuber, suksess]",awal mula yutuber suksess,tidak kasar
10633,aihh sedih tukang numpang promosi tolol bego g...,"[aihh, sedih, tukang, numpang, promosi, tolol,...",aihh sedih tukang numpang promosi tolol bego g...,kasar
10634,mampir chanelku yg suka bokeb,"[mampir, chanelku, yg, suka, bokeb]",mampir chanelku yg suka bokeb,tidak kasar
10635,makasih,[makasih],makasih,tidak kasar


Pembagian dataset

In [202]:
from sklearn.model_selection import train_test_split

X = df['stemmed'].astype(str).dropna() 
y = df['labeling']           

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Jumlah data latih: {len(X_train)}")
print(f"Jumlah data uji: {len(X_test)}")


KeyError: 'stemmed'

In [None]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Dimensi X_train_tfidf: (8509, 8579)
Dimensi X_test_tfidf: (2128, 8579)


In [None]:
with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

kata_kasar = set(kamus.keys())

def filter_kata_kasar(text, kamus_keys):
    words = str(text).split()
    filtered_words = [word for word in words if word in kamus_keys]
    return ' '.join(filtered_words)

df['filtered'] = df['stemmed'].apply(lambda x: filter_kata_kasar(x, kata_kasar))
df.to_csv('executed.csv', index=False, encoding='latin1')
# df[['stemmed', 'Filtered_Comments']]
df

Unnamed: 0,Stemmed_Comments,Tokenized_Comments,ID_Comments,labeling,Filtered_Comments
0,rick kamu anak hasil kondom bocor,"[rick, kamu, anak, hasil, kondom, bocor]",rick kamu anak hasil kondom bocor,tidak kasar,
1,kakak sok pede,"[kakak, sok, pede]",kakak sok pede,tidak kasar,
2,keren bang slow aku subscribe kontol eenak,"[keren, bang, slow, aku, subscribe, kontol, ee...",keren bang slow aku subscribe kontol eenak,kasar,kontol
3,apa tangan mu tato,"[apa, tangan, mu, tato]",apa tangan mu tato,tidak kasar,
4,muka kamu anjing aku rugi nonton youtube kamu ...,"[muka, kamu, anjing, aku, rugi, nonton, youtub...",muka kamu anjing aku rugi nonton youtube kamu ...,kasar,anjing anjing
...,...,...,...,...,...
10632,awal mula yutuber suksess,"[awal, mula, yutuber, suksess]",awal mula yutuber suksess,tidak kasar,
10633,aihh sedih tukang numpang promosi tolol bego g...,"[aihh, sedih, tukang, numpang, promosi, tolol,...",aihh sedih tukang numpang promosi tolol bego g...,kasar,tolol bego bangsat
10634,mampir chanelku yg suka bokeb,"[mampir, chanelku, yg, suka, bokeb]",mampir chanelku yg suka bokeb,tidak kasar,
10635,makasih,[makasih],makasih,tidak kasar,


Hitung Skor TF-IDF per kata kasar

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def filter_kata_kasar(text, kata_kasar):
    words = str(text).split()
    filtered_words = [word for word in words if word in kata_kasar]
    return ' '.join(filtered_words)

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(df['Filtered_Comments'])

features = vectorizer.get_feature_names_out()
scores = tfidf_matrix.toarray()

data = []
for doc_idx, doc_scores in enumerate(scores):
    tokens = [features[i] for i in range(len(features)) if doc_scores[i] > 0]
    tfidf_scores = [round(doc_scores[i], 2) for i in range(len(features)) if doc_scores[i] > 0]
    
    for token, score in zip(tokens, tfidf_scores):
        data.append({"Comments": token, "TF-IDF": score})

df_tfidf = pd.DataFrame(data)
df_tfidf = df_tfidf[df_tfidf['Comments'].str.len() > 0].dropna().drop_duplicates(subset='Comments')
df_tfidf.to_csv("skor_tfidf.csv", index=False)
score = pd.read_csv("skor_tfidf.csv", encoding='latin1')
score

Unnamed: 0,Comments,TF-IDF
0,kontol,1.00
1,anjing,1.00
6,babi,0.66
8,memek,0.52
11,goblok,0.36
...,...,...
2778,perek,0.86
2801,bangkai,1.00
3037,tuyul,0.80
3102,wong,1.00


Ambil skor TF-IDF diatas 0.5

In [None]:
data = pd.DataFrame()
data = df[df['TF-IDF'] > 0.5]
data.to_csv('high_tfidf.csv')

Ambil kata yang TF-IDF diatas 0.5

In [None]:
data = pd.read_csv('high_tfidf.csv')
data = data.drop('Unnamed: 0', axis=1)
high_tfidf = data['Comments']

Cek apakah ada kata TF-IDF tinggi dalam kalimat

In [None]:
def cekTFIDF(text, data):
    words = set(text.split())
    high_tfidf = set(data)
    if words & high_tfidf:
        return 'ada'
    else:
        return 'tidak ada'
    
df['cekTF-IDF'] = df['text'].astype(str).apply(lambda x: cekTFIDF(x, data))
df

# **3. Word Replacement**

In [None]:
import json

with open('kamus_kasar.json', 'r') as file:
    kamus = json.load(file)

def replace_kata_kasar(text, kamus):
    words = text.split()  
    word_replacement = [kamus[key] if key in kamus else key for key in words]
    return ' '.join(word_replacement)

df = pd.read_csv('executed.csv')
df['text'] = df['stemmed'].astype(str)

df['fixed'] = df.apply(
    lambda row: replace_kata_kasar(row['text'], kamus) if row['sentiment'] != 'positif' else row['text'],
    axis=1
)

df[['text', 'fixed']].to_csv('hasil.csv', encoding='latin1', index=False)
dresult = pd.read_csv('hasil.csv')
print(dresult)


FileNotFoundError: [Errno 2] No such file or directory: 'kamus_kasar.json'

Evaluasi Model

In [None]:
nltk.download('stopwords')
stop_words_id = list(set(stopwords.words('indonesian')))

data = pd.read_csv('hasil.csv')

data = data.dropna(subset=['Comments', 'Replaced_Comments'])
print(data.isnull().sum())

X = data['Comments'] 
y = data['Replaced_Comments']    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=2,     
    max_df=0.95,
    stop_words=stop_words_id
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print(f"Akurasi: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# **Eksekusi dalam input**

Testing Function

In [None]:
def execute(word):
    word = clean_data(word)
    word = normalize(word)
    word = stemming(word)
    word = translate_id(word)
    word = replace_kata_kasar(word, kamus)
    return word

contoh = 'Dasar manusia tolol'
execute(contoh)

dasar manusia tidak pengertian


In [None]:
%pip install dill

## **Saving Function**

In [1]:
import dill

def clean_data(text):
    import re
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
with open('clean_data.sav', 'wb') as file:
    dill.dump(clean_data, file)

def load_normalization_dict():
    with open('normalization_dict.json', 'r') as file:
        normalization_dict = json.load(file)
    return normalization_dict

def normalize(text):
    import re, json
    def load_normalization_dict():
        with open('normalization_dict.json', 'r') as file:
            normalization_dict = json.load(file)
        return normalization_dict
    normalization_dict = load_normalization_dict()
    for word, replacement in normalization_dict.items():
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, replacement, text)
    return text
with open('normalize.sav', 'wb') as file:
    dill.dump(normalize, file)

def stemming(text_cleaning):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text_cleaning)
with open('stemming.sav', 'wb') as file:
    dill.dump(stemming, file)

def translate_id(text):
    try:
        translator = Translator(from_lang='en', to_lang="id" )
        translation = translator.translate(text)
        return translation
    except Exception as e:
        print(f"Error in translation: {e}")
        return text
with open('translate_id.sav', 'wb') as file:
    dill.dump(translate_id, file)
    
def replace_kata_kasar(text):
    import json
    with open('kamus_kasar.json', 'r') as file:
        kamus = json.load(file)
        kata_kasar = set(kamus.keys())
        words = text.split()
        word_replacement = [kamus[key] if key in kata_kasar else key for key in words]
    return ' '.join(word_replacement)
with open('replace_kata_kasar.sav', 'wb') as file:
    dill.dump(replace_kata_kasar, file) 

In [None]:
import dill

def execute(word):
    word = clean_data(word)
    word = normalize(word)
    word = stemming(word)
    word = translate_id(word)
    word = replace_kata_kasar(word, kamus)
    return word

with open('execute_function.sav', 'wb') as file:
    dill.dump(execute, file)