In [26]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from deep_translator import GoogleTranslator
import logging

In [27]:
# Configure the root logger: INFO level, "<timestamp> - <level> - <message>" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [28]:
# Load the dataset
# A missing file is logged and then re-raised so execution stops at this cell
# with the original traceback. The interactive-only `exit()` helper is avoided:
# in a script it ends the process with a success status, and in a notebook it
# can shut the kernel down instead of reporting the error.
try:
    df = pd.read_csv('dataset.csv')
    logging.info("Dataset loaded successfully.")
except FileNotFoundError as e:
    logging.error(f"Error loading dataset: {e}")
    raise

2025-01-13 18:42:12,821 - INFO - Dataset loaded successfully.


In [29]:
# Load synonym dictionary
# Builds a {slang: formal} lookup from the 'slang' and 'formal' columns of the
# lexicon CSV. A missing file is logged and re-raised so execution stops here
# with the original traceback; the interactive-only `exit()` helper is avoided
# because it reports success in scripts and can shut down a notebook kernel.
try:
    synonym_df = pd.read_csv('colloquial-indonesian-lexicon.csv')
    synonym_dict = dict(zip(synonym_df['slang'], synonym_df['formal']))
    logging.info("Synonym dictionary loaded successfully.")
except FileNotFoundError as e:
    logging.error(f"Error loading synonym dictionary: {e}")
    raise

2025-01-13 18:42:12,913 - INFO - Synonym dictionary loaded successfully.


In [30]:
# Load stopwords from external files
# One stopword per line; each entry is stripped of surrounding whitespace so a
# line with stray padding still matches a token during filtering. A missing
# file is logged and re-raised (instead of calling the interactive-only
# `exit()` helper) so execution stops here with the original traceback.
try:
    with open('id.stopwords.02.01.2016.txt', 'r', encoding='utf-8') as file:
        stopwords = {line.strip() for line in file.read().splitlines()}
    logging.info("Stopwords loaded successfully.")
except FileNotFoundError as e:
    logging.error(f"Error loading stopwords: {e}")
    raise

2025-01-13 18:42:12,934 - INFO - Stopwords loaded successfully.


In [31]:
# Create a stemmer
# The factory object is kept in `factory`; `stemmer` is the instance whose
# .stem() method the `stemming` helper calls for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [32]:
# Initialize the translator: source language auto-detected, target 'id'
translator = GoogleTranslator(
    source='auto',
    target='id',
)

In [33]:
# Define preprocessing functions
def clean_text(text):
    """Remove social-media noise from a raw text string.

    Strips, in this order: URLs, e-mail addresses, @mentions, #hashtags and
    characters in the U+10000..U+10FFFF range, then collapses every run of
    whitespace into a single space.

    E-mail addresses are removed *before* @mentions. If the mention pattern
    ran first it would eat the ``@domain`` part of an address and leave a
    fragment such as ``user.com`` that the e-mail pattern can no longer match.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with leading and trailing whitespace trimmed.
    """
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # TLD class is letters only; a '|' inside [...] is a literal pipe, not "or"
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[\U00010000-\U0010FFFF]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [34]:
def cf_text(text):
    """Case-fold ``text`` by returning its lowercase form."""
    lowered = text.lower()
    return lowered

In [35]:
def tokenize_text(text):
    """Split ``text`` into tokens made of consecutive word characters.

    Any run of non-word characters acts as a separator; empty tokens are
    never produced.
    """
    # Collecting maximal \w runs yields the same tokens as splitting on \W+
    # and discarding the empty strings.
    return re.findall(r'\w+', text)

In [36]:
def remove_stopwords(tokens):
    """Return the tokens that are not members of the module-level ``stopwords``.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [37]:
def stemming(tokens):
    """Return a new list with every token passed through ``stemmer.stem``.

    ``stemmer`` is the module-level stemmer instance; order is preserved.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [38]:
def translate_to_bahasa(tokens):
    """Translate a list of tokens with the module-level ``translator``.

    Best-effort: if the translation call fails for any reason, a warning is
    logged and the original tokens are returned unchanged.

    Args:
        tokens: List of string tokens to translate.

    Returns:
        A list with one entry per input token. Each entry is the translated
        string, or the original token when no usable translation (a non-empty
        string) was returned for it.
    """
    # An empty batch makes the translator raise; skip the call so token-less
    # rows do not produce a misleading "Translation failed" warning.
    if not tokens:
        return tokens
    try:
        translations = translator.translate_batch(tokens, batch_size=10)
        # Assumes one result per input token (paired positionally). Results
        # that are None or empty fall back to the source token so later
        # string-based steps never receive a non-string.
        return [
            translated if isinstance(translated, str) and translated else source
            for source, translated in zip(tokens, translations)
        ]
    except Exception as e:
        logging.warning(f"Translation failed: {e}")
        return tokens

In [39]:
def replace_with_synonyms(words, synonym_dict):
    """Map each word through ``synonym_dict``, keeping words that have no entry.

    Args:
        words: Iterable of words to look up.
        synonym_dict: Mapping from a word to its replacement.

    Returns:
        A list of the same length as ``words`` with replacements applied.
    """
    replaced = []
    for word in words:
        replaced.append(synonym_dict.get(word, word))
    return replaced

In [40]:
# Names of the object-dtype columns; every preprocessing step loops over these
text_columns = (
    df
    .select_dtypes(include=['object'])
    .columns
)

In [41]:
# Step 1: Clean text
# Work on a copy so `df` stays untouched; non-string cells pass through as-is.
logging.info("Starting text cleaning...")
df_cleaned = df.copy()
for col in text_columns:
    df_cleaned[col] = df_cleaned[col].apply(
        lambda cell: cell if not isinstance(cell, str) else clean_text(cell)
    )
df_cleaned.to_csv('cleaned_dataset.csv', index=False)
logging.info("Text cleaning completed.")

2025-01-13 18:42:13,189 - INFO - Starting text cleaning...
2025-01-13 18:42:13,224 - INFO - Text cleaning completed.


In [42]:
# Step 2: Case folding
# Lowercase every string cell of the cleaned copy; other values are left alone.
logging.info("Starting case folding...")
df_cf = df_cleaned.copy()
for col in text_columns:
    df_cf[col] = df_cf[col].apply(
        lambda cell: cell if not isinstance(cell, str) else cf_text(cell)
    )
df_cf.to_csv('cf_dataset.csv', index=False)
logging.info("Case folding completed.")

2025-01-13 18:42:13,242 - INFO - Starting case folding...
2025-01-13 18:42:13,253 - INFO - Case folding completed.


In [43]:
# Step 3: Tokenization
# Each string cell becomes a list of tokens; non-string cells are left alone.
logging.info("Starting tokenization...")
df_tokenized = df_cf.copy()
for col in text_columns:
    df_tokenized[col] = df_tokenized[col].apply(
        lambda cell: cell if not isinstance(cell, str) else tokenize_text(cell)
    )
df_tokenized.to_csv('tokenized_dataset.csv', index=False)
logging.info("Tokenization completed.")

2025-01-13 18:42:13,291 - INFO - Starting tokenization...
2025-01-13 18:42:13,317 - INFO - Tokenization completed.


In [44]:
# Step 4: Remove stopwords
# Every token list is first run through translate_to_bahasa and the result is
# then filtered by remove_stopwords; cells that are not lists are left alone.
logging.info("Removing stopwords...")
df_sr = df_tokenized.copy()
for col in text_columns:
    df_sr[col] = df_sr[col].apply(
        lambda cell: cell
        if not isinstance(cell, list)
        else remove_stopwords(translate_to_bahasa(cell))
    )
df_sr.to_csv('stopword_removed_dataset.csv', index=False)
logging.info("Stopwords removed.")


2025-01-13 18:42:13,332 - INFO - Removing stopwords...
2025-01-13 18:42:13,351 - INFO - Stopwords removed.


In [45]:
# Step 5: Stemming
# Stem every token of each list cell; cells that are not lists are left alone.
logging.info("Starting stemming...")
df_stemmed = df_sr.copy()
for col in text_columns:
    df_stemmed[col] = df_stemmed[col].apply(
        lambda cell: cell if not isinstance(cell, list) else stemming(cell)
    )
df_stemmed.to_csv('stemmed_dataset.csv', index=False)
logging.info("Stemming completed.")

2025-01-13 18:42:13,399 - INFO - Starting stemming...
2025-01-13 18:43:13,761 - INFO - Stemming completed.


In [46]:
# Step 6: Synonym replacement
# Look every token of each list cell up in synonym_dict; other cells pass through.
# NOTE(review): the lookup runs on already-stemmed tokens, so slang forms that
# the stemmer altered may no longer match dictionary keys — confirm the order.
logging.info("Starting synonym replacement...")
df_synonym_replaced = df_stemmed.copy()
for col in text_columns:
    df_synonym_replaced[col] = df_synonym_replaced[col].apply(
        lambda cell: cell
        if not isinstance(cell, list)
        else replace_with_synonyms(cell, synonym_dict)
    )
df_synonym_replaced.to_csv('synonym_replaced_dataset.csv', index=False)
logging.info("Synonym replacement completed.")

2025-01-13 18:43:13,797 - INFO - Starting synonym replacement...
2025-01-13 18:43:13,844 - INFO - Synonym replacement completed.


In [47]:
# Preview the first five rows of the fully preprocessed dataset
print(df_synonym_replaced.head(n=5))

                                               Tweet
0        [kampus, keren, banget, fasilitas, lengkap]
1       [duh, tugas, kuliah, numpuk, banget, pusing]
2                       [tugas, kerja, minggu, seru]
3  [kanan, dosen, asih, nilai, sulit, mahasiswa, ...
4  [tau, anjyr, muak, gue, ajar, bisnis, internas...
