<a href="https://colab.research.google.com/github/LatiefDataVisionary/text-mining-and-natural-language-processing-college-task/blob/main/preprocessing_text_ramadan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Fetch the NLTK data packages used further down (tokenizer data and stopword lists).
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Load the raw dataset
df = pd.read_csv('fix_combined_ramadan.csv')

# 1. Cleaning
def clean_text(text):
    """Normalize one raw text value into ASCII letters separated by single spaces.

    Steps, in order: drop URLs, drop @mentions and #hashtags, delete digits
    and apostrophes, turn every other non-letter character into a space,
    then collapse whitespace.

    Args:
        text: Raw value from the text column. Missing values (NaN/None) are
            treated as an empty string; anything else goes through ``str()``.

    Returns:
        str: The cleaned text (possibly empty). Letter case is preserved.
    """
    # Handle NaN
    text = '' if pd.isna(text) else str(text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Digits and apostrophes sit *inside* words ("anak2nya", "Qur'an"), so
    # they are deleted outright to keep the word in one piece.
    text = re.sub(r"[\d'’‘`]", '', text)
    # Every other special character is a separator: replace it with a space
    # so that "buka-puasa" or "halo,dunia" do not get glued into one token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Collapse runs of whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner element-wise over the raw text column.
df['cleaned_text'] = df['full_text'].map(clean_text)

# 2. Case Folding: lowercase the cleaned text
df['case_folded'] = df['cleaned_text'].str.lower()

# 3. Tokenizing
def tokenize_text(text):
    """Split a text string into word tokens with NLTK.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

df['tokens'] = df['case_folded'].map(tokenize_text)

# 4. Filtering (Stopword Removal)
# Union of NLTK's Indonesian and English stopword lists plus the literal
# token 'rt' (presumably the retweet prefix left over in tweet text).
stop_words = {
    *stopwords.words('indonesian'),
    *stopwords.words('english'),
    'rt',
}

def remove_stopwords(tokens):
    """Drop every token that appears in the module-level ``stop_words`` set.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: The remaining tokens in their original order (duplicates kept).
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

df['filtered_tokens'] = df['tokens'].map(remove_stopwords)

# 5. Stemming
# Build the Sastrawi stemmer once; stem_text reuses it for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(tokens):
    """Stem each token with the module-level ``stemmer``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: One stemmed string per input token, in the same order.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

df['stemmed_tokens'] = df['filtered_tokens'].map(stem_text)

# Save the preprocessing results (all intermediate columns included)
df.to_csv('preprocessed_ramadan.csv', index=False)

# Example of the preprocessing output: every pipeline stage for the row labelled 0
print("Contoh Hasil Preprocessing:")
for _label, _column in (
    ("Original Text:", 'full_text'),
    ("Cleaned Text:", 'cleaned_text'),
    ("Case Folded:", 'case_folded'),
    ("Tokens:", 'tokens'),
    ("Filtered Tokens:", 'filtered_tokens'),
    ("Stemmed Tokens:", 'stemmed_tokens'),
):
    print(_label, df[_column][0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Contoh Hasil Preprocessing:
Original Text: Ramadan Mubarak to all Muslim! https://t.co/D9QoB5eyjd
Cleaned Text: Ramadan Mubarak to all Muslim
Case Folded: ramadan mubarak to all muslim
Tokens: ['ramadan', 'mubarak', 'to', 'all', 'muslim']
Filtered Tokens: ['ramadan', 'mubarak', 'muslim']
Stemmed Tokens: ['ramadan', 'mubarak', 'muslim']
