In [25]:
# ==============================
# 1. Import library
# ==============================
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

In [26]:
# Load the labelled TikTok comment dataset into `df`.
# The path is written as a raw string: it contains backslashes followed by
# characters such as F, S, P, T and k, which are invalid escape sequences in a
# normal string literal and raise a SyntaxWarning on current Python versions.
# The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider a location
# relative to the notebook so the cell can be re-run on another machine.
df = pd.read_excel(r"D:\File Kuliah D\Semester 7\Pemro Teks P\Tugas Besar\komentar_tiktok_labeled.xlsx")

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,clean_text,label
0,jefri penasaran gimana,non_kasar
1,dilalui,non_kasar
2,fefek jule sebenernya parian,kasar
3,fefeknya laris anjir,kasar
4,inara diselip jule terus,non_kasar


In [27]:
# ==============================
# 2. UNDER SAMPLING
# ==============================
# Double brackets keep X as a one-column DataFrame (2-D); y is the label Series.
X, y = df[['clean_text']], df['label']

print("Distribusi sebelum undersampling:")
print(y.value_counts())

# 'auto' shrinks every class down to the size of the smallest one;
# the fixed random_state makes the drawn subset repeatable.
# NOTE(review): resampling is applied to the full dataset here, i.e. before
# any train/test split is made.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Put the resampled text column and its labels back into a single frame.
df_undersample = pd.concat([X_resampled, y_resampled], axis=1)

label_counts_after = df_undersample['label'].value_counts()
print("\nDistribusi setelah undersampling:")
print(label_counts_after)


Distribusi sebelum undersampling:
label
non_kasar    3924
kasar         589
Name: count, dtype: int64

Distribusi setelah undersampling:
label
kasar        589
non_kasar    589
Name: count, dtype: int64


In [28]:
# ==============================
# 3. Split features & labels
# ==============================
X, y = df_undersample['clean_text'], df_undersample['label']


# ==============================
# 4. Train-Test Split
# ==============================
# 80/20 split, stratified on the label, with a fixed seed.
# NOTE(review): the split is taken from the undersampled frame, so the test
# portion is class-balanced as well and its metrics do not reflect the label
# distribution of the original dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ==============================
# 5. Pipeline TF-IDF + Multinomial Naive Bayes
# ==============================
# Vectorizer: lower-cases text, uses unigrams and bigrams, and filters terms
# by document frequency (min_df=2, max_df=0.95).
tfidf_step = TfidfVectorizer(
    lowercase=True, ngram_range=(1, 2), min_df=2, max_df=0.95
)
# Classifier: multinomial Naive Bayes with smoothing parameter alpha=1.0.
nb_step = MultinomialNB(alpha=1.0)

pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("nb", nb_step)])

# ==============================
# 6. Train the model
# ==============================
pipeline.fit(X_train, y_train)

In [29]:
# ==============================
# 7. Model evaluation
# ==============================
# Predict on the held-out split, compute all metrics first, then print them.
y_pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
# Per scikit-learn's convention: rows are true labels, columns are predictions.
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\n=== Classification Report ===")
print(report_text)

print("\n=== Confusion Matrix ===")
print(conf_mat)

Accuracy: 0.7796610169491526

=== Classification Report ===
              precision    recall  f1-score   support

       kasar       0.80      0.75      0.77       118
   non_kasar       0.76      0.81      0.79       118

    accuracy                           0.78       236
   macro avg       0.78      0.78      0.78       236
weighted avg       0.78      0.78      0.78       236


=== Confusion Matrix ===
[[88 30]
 [22 96]]


In [30]:
# ==============================
# 8. Save the model
# ==============================
# Serialize the whole fitted pipeline (vectorizer + classifier) with pickle
# into the current working directory.
model_path = "naivebayes_model.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

print("\nModel berhasil disimpan sebagai naivebayes_model.pkl")


Model berhasil disimpan sebagai naivebayes_model.pkl


In [None]:
# NOTE(review): this entire cell is commented out and has no execution count,
# yet a prediction table is still displayed beneath it. That output is stale
# and will not be regenerated by Restart Kernel -> Run All; either re-enable
# the code or clear the output.
# NOTE(review): in the stale table several neutral sentences (e.g.
# "video ini keren banget") are labelled `kasar`. The samples below are raw
# sentences, while the training column is named `clean_text` -- presumably
# preprocessed text; verify that the same preprocessing is applied before
# calling predict.
# import pickle
# import pandas as pd

# # load model
# with open("naivebayes_model.pkl", "rb") as f:
#     model = pickle.load(f)

# # test cases
# test_texts = [
#     "anjing banget kelakuannya",
#     "kontol lu sok jago",
#     "bacot doang kerja kagak",
#     "anjgggg kelakuanmu parah",
#     "dasar kampret",
#     "video ini keren banget",
#     "suaranya bagus dan enak didengar",
#     "tidak suka tapi masih sopan",
#     "redup karirnya sekarang",
#     "gemeter pas tampil live"
# ]

# df_test = pd.DataFrame({"clean_text": test_texts})

# # predict
# df_test["predicted_label"] = model.predict(df_test["clean_text"])

# print(df_test)


                         clean_text predicted_label
0         anjing banget kelakuannya           kasar
1                kontol lu sok jago           kasar
2           bacot doang kerja kagak       non_kasar
3          anjgggg kelakuanmu parah           kasar
4                     dasar kampret           kasar
5            video ini keren banget           kasar
6  suaranya bagus dan enak didengar           kasar
7       tidak suka tapi masih sopan       non_kasar
8           redup karirnya sekarang           kasar
9           gemeter pas tampil live           kasar
