In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
nltk.data.path.append(r"C:\Users\User\AppData\Roaming\nltk_data")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df_RevMazii = pd.read_csv(r"C:\Latihan_Python\casptone-project-dicoding_nihongonavigator\ulasan_Mazii.csv")

In [27]:
jumlah_ulasan, jumlah_kolom = df_RevMazii.shape

print(jumlah_ulasan)
print(jumlah_kolom)

2725
1


In [28]:
df_RevMazii.head()

Unnamed: 0,Review
0,"pada fitur terjemahan dengan gambar, hampir se..."
1,"Tolong hapus iklan di bawah nyaaa, itu sangat ..."
2,aplikasi nya bagus dan kalau bisa mohon jadika...
3,overall buat latihan kanji atau kotoba nya oke...
4,setelah beberapa hari premium seumur hidup lal...


In [29]:
df_RevMazii.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2725 entries, 0 to 2724
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  2725 non-null   object
dtypes: object(1)
memory usage: 21.4+ KB


In [30]:
print("Jumlah Missing Value")
print(df_RevMazii.isnull().sum())

Jumlah Missing Value
Review    0
dtype: int64


In [31]:
print("Jumlah Duplikat Value")
print(df_RevMazii.duplicated().sum())

Jumlah Duplikat Value
822


In [32]:
df_RevMazii_clean = df_RevMazii.drop_duplicates()
print(df_RevMazii_clean.duplicated().sum())

0


In [33]:
df_RevMazii_clean.head()

Unnamed: 0,Review
0,"pada fitur terjemahan dengan gambar, hampir se..."
1,"Tolong hapus iklan di bawah nyaaa, itu sangat ..."
2,aplikasi nya bagus dan kalau bisa mohon jadika...
3,overall buat latihan kanji atau kotoba nya oke...
4,setelah beberapa hari premium seumur hidup lal...


In [34]:
df_RevMazii_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1903 entries, 0 to 2719
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1903 non-null   object
dtypes: object(1)
memory usage: 29.7+ KB


## **Text PreProcessing**

In [52]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
    text = re.sub(r'[^A-Za-z\s]', '', text)


    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text

def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text

def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text

def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy", "wkwk", "uuu"])
    important_words = {'baik', 'buruk', 'mantap', 'bagus', 'jelek', 'mudah', 'cepat', 'nyaman'} #Bisa ditambah
    # Hapus kata penting dari daftar stopwords
    listStopwords = listStopwords - important_words

    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Memecah teks menjadi daftar kata
    words = text.split()

    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]

    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

In [36]:
def load_slang_dictionary():
    slangwords = {}

    def read_file(path, delimiter):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if delimiter not in line or not line:
                    continue  # lewati baris yang kosong atau tidak valid
                parts = line.split(delimiter, 1)
                if len(parts) != 2:
                    continue  # lewati jika tetap gagal dibagi jadi 2
                key, value = parts
                if key and value:
                    slangwords[key.lower()] = value.lower()

    # Gabungkan dari semua file
    read_file(r'C:\Latihan_Python\@-proyek-course-dicoding\proyek_analisis_sentimen\Indonesian Slang dictionary (from github)\formalizationDict.txt', '\t')
    read_file(r'C:\Latihan_Python\@-proyek-course-dicoding\proyek_analisis_sentimen\Indonesian Slang dictionary (from github)\kbba.txt', '\t')
    read_file(r'C:\Latihan_Python\@-proyek-course-dicoding\proyek_analisis_sentimen\Indonesian Slang dictionary (from github)\slangword.txt', ':')

    return slangwords

# Panggil fungsi
slangwords = load_slang_dictionary()

# Contoh tampilkan 10 kata
for i, (k, v) in enumerate(slangwords.items()):
    if i >= 10: break
    print(f"{k} -> {v}")

def fix_slangwords(text, slang_dict):
    words = text.split()  # Memecah teks jadi list kata
    fixed_words = [slang_dict.get(word.lower(), word) for word in words]  # Ganti jika ada di kamus
    return ' '.join(fixed_words)  # Gabung kembali jadi kalimat

7an -> tujuan
@ -> di
ababil -> anak ingusan
abis -> habis
acc -> accord
ad -> ada
adlah -> adalah
adlh -> adalah
adoh -> aduh
afaik -> as far as i know


In [37]:
def split_heuristic(text):
    # Ganti koma dan kata penghubung umum dengan titik
    text = re.sub(r'\s*(,|namun|tapi|tetapi|meskipun|walaupun|serta)\s*', '.', text)
    
    # Pisah berdasarkan titik atau baris baru
    sentences = re.split(r'\.|\n', text)

    # Bersihkan hasil
    sentences = [s.strip() for s in sentences if len(s.strip()) > 3]

    return sentences


In [38]:
custom_normalization = {
    "mantaaap": "mantap",
    "bangett": "banget",
    "kerenn": "keren",
    "prem": "premium",
    "bagusss": "bagus",
    "membantuuu": "membantu",
    "bangettt": "banget",
    "bingunggg": "bingung",
    "sangattt": "sangat",
    "waaaah": "wah",
    "polll": "pol",           
    "tulisannnya": "tulisannya",
    "maziiini": "mazii ini",
    "salahhadehhhh": "salah hadeh",
    "sangaatttt": "sangat",
    "lagiii": "lagi",
    "bagusssss": "bagus",
    "bagusssbisa": "bagus bisa",
    "mantaaapberjamjam": "mantap berjam-jam",
    "waaaaah": "wah",
    "yaaa": "ya",
    "mmmmungkin": "mungkin",
    "bagussss": "bagus",
    "mantappppuuu": "mantap",
    "majuuu": "maju",
    "bagussssss": "bagus",
    "akuuu": "aku",
    "hehheee": "hehe",
    "kerennnn": "keren",
    "makasiii": "makasih",
    "gimanasi": "bagaimana sih",
    "arigato": "terima kasih",
    "arigatou": "terima kasih",
    "radical": "radikal",
    "knji": "kanji",
    "knj": "kanji",
}


In [16]:
def normalize_tokens(tokens, norm_dict):
    return [norm_dict.get(word, word) for word in tokens]

In [None]:
# Membersihkan teks dan menyimpannya di kolom 'text_clean'
df_RevMazii_clean['text_clean'] = df_RevMazii_clean['Review'].apply(cleaningText)

# Mengubah huruf dalam teks menjadi huruf kecil dan menyimpannya di 'text_casefoldingText'
df_RevMazii_clean['text_casefoldingText'] = df_RevMazii_clean['text_clean'].apply(casefoldingText)

df_RevMazii_clean['text_final_splitted'] = df_RevMazii_clean['text_casefoldingText'].apply(split_heuristic).apply(toSentence)#Baru

# Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
df_RevMazii_clean['text_slangwords'] = df_RevMazii_clean['text_final_splitted'].apply(lambda x: fix_slangwords(x, slangwords))

# Memecah teks menjadi token (kata-kata) dan menyimpannya di 'text_tokenizingText'
df_RevMazii_clean['text_tokenizingText'] = df_RevMazii_clean['text_slangwords'].apply(tokenizingText)

df_RevMazii_clean['text_normalized'] = df_RevMazii_clean['text_tokenizingText'].apply(lambda x: normalize_tokens(x, custom_normalization))#Baru

# Menghapus kata-kata stop (kata-kata umum) dan menyimpannya di 'text_stopword'
df_RevMazii_clean['text_stopword'] = df_RevMazii_clean['text_normalized'].apply(filteringText)

# Menggabungkan token-token menjadi kalimat dan menyimpannya di 'text_akhir'
df_RevMazii_clean['text_sentenced'] = df_RevMazii_clean['text_stopword'].apply(toSentence)

df_RevMazii_clean['text_final_stemmed'] = df_RevMazii_clean['text_sentenced'].apply(stemmingText)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RevMazii_clean['text_clean'] = df_RevMazii_clean['Review'].apply(cleaningText)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RevMazii_clean['text_casefoldingText'] = df_RevMazii_clean['text_clean'].apply(casefoldingText)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RevMazii_clean['text_

In [54]:
df_RevMazii_clean.to_csv(r"C:\Latihan_Python\casptone-project-dicoding_nihongonavigator\Preprocessesedtext\review_mazii_processedtext.csv", index=False)