# Preprocessing

In [2]:
! pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [3]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [4]:
# Download necessary NLTK resources
# (stop-word lists, WordNet, and the Open Multilingual WordNet 1.4 data).
download_status = [
    nltk.download(resource)
    for resource in ('stopwords', 'wordnet', 'omw-1.4')
]
download_status[-1]  # keep the last download's result as the cell output

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [5]:
# Load the competition dataset; the file is semicolon-separated.
DATASET_PATH = "/kaggle/input/portfolio-nlp-for-bdc-satriadata/dataset_penyisihan_bdc_2024.csv"
data = pd.read_csv(DATASET_PATH, sep=";")

In [6]:
# Summarize the loaded frame: column names, non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


In [7]:
# Preview the first rows of the raw dataset (text and its label).
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


In [8]:
# Inspect the target column: the distinct classes first, then how many
# rows fall into each class.
label_column = data['label']
print(label_column.unique())
print(label_column.value_counts())

['Sumber Daya Alam' 'Politik' 'Demografi' 'Pertahanan dan Keamanan'
 'Ideologi' 'Ekonomi' 'Sosial Budaya' 'Geografi']
label
Politik                    2972
Sosial Budaya               587
Ideologi                    400
Pertahanan dan Keamanan     400
Ekonomi                     367
Sumber Daya Alam            192
Demografi                    62
Geografi                     20
Name: count, dtype: int64


### Stopword dan Stemming

In [9]:
# Build the Sastrawi stemmer and stop-word set consumed by the cleaning step.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# NOTE(review): this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
# module imported at the top of the notebook; from here on the name refers to
# this plain set of Sastrawi stop words.
stopword_factory = StopWordRemoverFactory()
stopwords = set(stopword_factory.get_stop_words())

### Cleaning

In [None]:
# Text Cleaning Function
# Patterns are compiled once and applied to already lower-cased text.
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')    # links, with or without a scheme
_MENTION_PATTERN = re.compile(r'(?<!\w)@\S+')     # @handles (not the '@' inside an e-mail)
_NON_ALNUM_PATTERN = re.compile(r'[^a-z0-9\s]')   # anything but letters, digits, whitespace


def clean_text(text):
    """Normalize one raw text into a space-joined string of stemmed tokens.

    Steps, in order: lower-case, strip URLs, strip @mentions, replace every
    remaining non-alphanumeric character with a space, split on whitespace,
    drop tokens found in the module-level ``stopwords`` set, and stem the
    survivors with the module-level ``stemmer``.

    Args:
        text: The raw text. Any non-string value (e.g. NaN from a missing
            cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning or the input is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub(' ', text)
    # The handles in this dataset are anonymized hashes; left in place they
    # would each become a one-off junk token in the vocabulary.
    text = _MENTION_PATTERN.sub(' ', text)
    # Substitute a space rather than nothing so words joined only by
    # punctuation ("dan/atau", "a,b") are not fused into a single token.
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    # str.split() with no argument already collapses runs of whitespace.
    tokens = text.split()
    # Stop words are filtered on the surface form, before stemming.
    stemmed = [stemmer.stem(token) for token in tokens if token not in stopwords]
    return ' '.join(stemmed)


data['clean_text'] = data['text'].apply(clean_text)

### Padding dan Tokenisasi

In [None]:
# Tokenization and Padding
MAX_WORDS = 10000  # Vocabulary size
MAX_LEN = 100  # Max length of sequences

clean_texts = data['clean_text']

# NOTE(review): the tokenizer is fitted on every row, before any train/test
# split visible in this part of the notebook — confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tokenizer.fit_on_texts(clean_texts)

# Map each cleaned text to a list of integer ids, then pad/truncate at the
# end so every sequence has exactly MAX_LEN entries.
sequences = tokenizer.texts_to_sequences(clean_texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# Preview the frame again; it now also carries the 'clean_text' column
# produced by the cleaning cell.
data.head()