In [23]:
# Import Library
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# ==============================
# 2. Load Dataset
# ==============================
df = pd.read_excel(
    "D:\File Kuliah D\Semester 7\Pemro Teks P\Tugas Besar\komentar_tiktok_jule_jefri.xlsx"
)

assert isinstance(df, pd.DataFrame), "df bukan DataFrame!"

In [25]:
# ==============================
# 3. Tentukan Kolom Teks
# ==============================
TEXT_COLUMN = "text"

if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Kolom '{TEXT_COLUMN}' tidak ditemukan!")

df[TEXT_COLUMN] = df[TEXT_COLUMN].apply(lambda x: str(x))
df = df[df[TEXT_COLUMN].str.strip() != ""]

In [26]:
# ==============================
# 4. Fungsi Hapus Emoji
# ==============================
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# ==============================
# 5. Fungsi Preprocessing
# ==============================
def preprocessing(text):
    text = str(text).lower()
    text = remove_emojis(text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)

    # Normalisasi huruf berulang
    normalized_tokens = [
        re.sub(r'(.)\1+', r'\1', word) for word in tokens
    ]

    stopword_factory = StopWordRemoverFactory()
    stopwords_id = set(stopword_factory.get_stop_words())

    filtered_tokens = [
        word for word in normalized_tokens
        if word not in stopwords_id and len(word) > 3
    ]

    return filtered_tokens

In [27]:
# ==============================
# 6. Lexicon Kata Kasar
# ==============================
lexicon_kasar = list(set([
    "anjing","bangsat","goblok","tolol","kontol",
    "memek","ngentot","asu","bego","tai",
    "kampret","bacot","sialan","brengsek",
    "anj","anjg","anjir","njir","jir",
    "cok","jancok","cuuk",
    "fefek","mmk","kontl","ngntt",
    "redup","gemeter","gemetar"
]))

# ==============================
# 7. Fungsi Deteksi Kata Kasar
# ==============================
def contains_profanity(tokens):
    for word in tokens:
        if word in lexicon_kasar:
            return True
    return False

In [28]:
# ==============================
# 8. Proses Preprocessing
# ==============================
df["tokens"] = df[TEXT_COLUMN].apply(preprocessing)
df["clean_text"] = df["tokens"].apply(lambda x: ' '.join(x))

# ==============================
# 9. Proses Pelabelan
# ==============================
df["label"] = df["tokens"].apply(
    lambda x: "kasar" if contains_profanity(x) else "non_kasar"
)


In [29]:
df.head()

Unnamed: 0,text,tokens,clean_text,label
0,Jefri penasaran apa gimana sihðŸ˜­ðŸ˜­,"[jefri, penasaran, gimana]",jefri penasaran gimana,non_kasar
1,12.12 dilalui,[dilalui],dilalui,non_kasar
2,fefek jule sebenernya parian apa sihðŸ˜­,"[fefek, jule, sebenernya, parian]",fefek jule sebenernya parian,kasar
3,fefeknya laris anjirðŸ˜­ðŸ˜­ðŸ˜­,"[fefeknya, laris, anjir]",fefeknya laris anjir,kasar
4,inara diselip jule terusðŸ˜­,"[inara, diselip, jule, terus]",inara diselip jule terus,non_kasar


In [36]:
df = df[['clean_text', 'label']]

In [37]:
df.isnull().sum()

clean_text    0
label         0
dtype: int64

In [38]:
df.duplicated().sum()

655

In [39]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4513 entries, 0 to 5167
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  4513 non-null   object
 1   label       4513 non-null   object
dtypes: object(2)
memory usage: 105.8+ KB


In [44]:
# ==============================
# 10. Simpan Dataset Hasil
# ==============================
output_file = "komentar_tiktok_labeled.xlsx"

df[["clean_text", "label"]].to_excel(
    output_file,
    index=False,
    engine="openpyxl"
)

In [45]:
# ==============================
# 11. Evaluasi Awal
# ==============================
print("Distribusi label:")
print(df["label"].value_counts())

print(f"\nDataset berhasil disimpan sebagai: {output_file}")

Distribusi label:
label
non_kasar    3924
kasar         589
Name: count, dtype: int64

Dataset berhasil disimpan sebagai: komentar_tiktok_labeled.xlsx
