In [1]:
import copy
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

## Load Data

In [2]:
csv_path = '../datasets/uud1945/uud1945.csv'
csv_title = 'uud1945'
cache_path = '../cache'

df = pd.read_csv('../datasets/uud1945/uud1945.csv')

In [3]:
print(f"data length = {(df.shape[0])} | columns = {(df.shape[1])} \n\n")
print(df.columns)
print(df.head(4))

data length = 40 | columns = 4 


Index(['Bab', 'Judul Bab', 'Pasal', 'Isi'], dtype='object')
   Bab                       Judul Bab Pasal  \
0    I           BENTUK DAN KEDAULATAN     1   
1   II  MAJELIS PERMUSYAWARATAN RAKYAT     2   
2  III   KEKUASAAN PEMERINTAHAN NEGARA     3   
3   IV          KEKUASAAN KEPEMIMPINAN     4   

                                                 Isi  
0  (1) Negara Indonesia ialah Negara Kesatuan, ya...  
1  (1) Majelis Permusyawaratan Rakyat terdiri ata...  
2  Majelis Permusyawaratan Rakyat menetapkan Unda...  
3  (1) Presiden Republik Indonesia memegang kekua...  


## Preprocessing Data

In [4]:
df_proc = copy.deepcopy(df)

# Menghapus tanda kutip ganda di awal dan akhir teks
df_proc['Bab'] = df_proc['Bab'].str.strip('""')
df_proc['Judul Bab'] = df_proc['Judul Bab'].str.strip('""')
df_proc['Pasal'] = df_proc['Pasal'].str.strip('""')
df_proc['Isi'] = df_proc['Isi'].str.strip('""')

# Mengubah huruf kecil semua
df_proc['Bab'] = df_proc['Bab'].str.lower()
df_proc['Judul Bab'] = df_proc['Judul Bab'].str.lower()
df_proc['Pasal'] = df_proc['Pasal'].str.lower()
df_proc['Isi'] = df_proc['Isi'].str.lower()

# Memisahkan teks yang panjang menjadi kalimat-kalimat pendek
df_proc['Isi'] = df_proc['Isi'].str.split('. ')

# Menghilangkan tanda baca
df_proc['Isi'] = df_proc['Isi'].apply(lambda x: [sentence.replace(',', '').replace('.', '') for sentence in x])

# tampilkan Isi aja
print(df_proc.head(4))

# safe cache dataframe
df_proc.to_csv(f'{cache_path}/{csv_title}.csv')


   Bab                       Judul Bab Pasal  \
0    i           bentuk dan kedaulatan     1   
1   ii  majelis permusyawaratan rakyat     2   
2  iii   kekuasaan pemerintahan negara     3   
3   iv          kekuasaan kepemimpinan     4   

                                                 Isi  
0  [(1, negar, indonesi, iala, negar, kesatuan, y...  
1  [(1, majeli, permusyawarata, rakya, terdir, at...  
2  [majeli, permusyawarata, rakya, menetapka, und...  
3  [(1, preside, republi, indonesi, memegan, keku...  


In [5]:
df_indexed = df_proc.set_index('Judul Bab')
df_indexed.to_csv(f'{cache_path}/{csv_title}.csv')

## Training Model

In [6]:
# Contoh encoding label
label_encoder = LabelEncoder()
df['Pasal'] = label_encoder.fit_transform(df['Pasal'])

# Menggunakan Tokenizer untuk mengonversi teks ke dalam urutan angka
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Isi'])
sequences = tokenizer.texts_to_sequences(df['Isi'])
max_length = max([len(seq) for seq in sequences])

# Padding agar semua urutan memiliki panjang yang sama
input_sequences = pad_sequences(sequences, maxlen=max_length)

# Jumlah kata dan jumlah kelas
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(df['Pasal'].unique())

# Membuat model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),
    LSTM(64),
    Dense(num_classes, activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Melatih model
num_epochs = 5
batch_size = 32

model.fit(input_sequences, df['Pasal'], epochs=num_epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2a535e090>

In [7]:
input_text = "presiden"
input_sequence = tokenizer.texts_to_sequences([input_text])
padded_input = pad_sequences(input_sequence, maxlen=max_length)
predictions = model.predict(padded_input)
predicted_class = np.argmax(predictions)
predicted_label = label_encoder.inverse_transform([predicted_class])
predicted_label



array(['30'], dtype=object)