In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import yaml
import pickle
import json
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Load the experiment configuration. The encoding is pinned so the YAML file is
# decoded the same way on every platform instead of using the locale default.
with open("../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable after Restart & Run All.
# NOTE: tf.keras.utils.set_random_seed requires TensorFlow >= 2.7.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

In [None]:
def clean_text(text):
    """Normalise a raw tweet before tokenisation.

    The text is lower-cased, URLs (``http...`` / ``www....``) and
    ``@mentions`` are replaced by a space, the ``#`` sign is dropped while the
    hashtag word is kept, and finally whitespace runs are collapsed and both
    ends are trimmed.

    Args:
        text: The tweet. Non-string values are converted with ``str()``.
            ``None`` and float NaN (what pandas yields for an empty CSV cell)
            are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the value is missing.
    """
    # Without this guard str(nan) / str(None) would inject the literal words
    # "nan" / "none" into the corpus and the tokenizer would learn them.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # strip links
    text = re.sub(r'@\w+', ' ', text)              # strip user mentions
    text = re.sub(r'#', ' ', text)                 # keep hashtag word, drop '#'
    text = re.sub(r'\s+', ' ', text)               # collapse whitespace runs
    return text.strip()

# Read the training CSV from the path given in the config.
df = pd.read_csv(config['data']['train_path'])

# Turn the string labels in `class` into integer ids; `le` is kept so the
# ids can be mapped back to label names later.
le = LabelEncoder()
y = le.fit_transform(df['class'])

# Store a normalised copy of every tweet next to the raw text and expose it
# as the model input.
df['clean_text'] = df['tweets'].map(clean_text)
X = df['clean_text'].values

In [None]:
# Split Data (by row index) BEFORE fitting the tokenizer, so the vocabulary is
# learned from training rows only and nothing from the validation set leaks
# into preprocessing. The split arguments are unchanged, so each row lands in
# the same partition as when the padded matrix itself was split.
row_ids = np.arange(len(X))
train_idx, val_idx = train_test_split(
    row_ids, test_size=0.2, random_state=42, stratify=y
)

# Tokenizer: fitted on the training text only. Words that occur solely in the
# validation rows are encoded as <OOV>, mirroring what happens on unseen data.
tokenizer = Tokenizer(num_words=config['data']['vocab_size'], oov_token="<OOV>")
tokenizer.fit_on_texts(X[train_idx])

# Encode and pad the whole corpus once, then slice it with the split indices.
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=config['data']['max_len'], padding='post', truncating='post')

X_train, X_val = X_pad[train_idx], X_pad[val_idx]
y_train, y_val = y[train_idx], y[val_idx]
print("Shape Train:", X_train.shape)
print("Shape Val:", X_val.shape)

In [None]:
# Model definition driven by the config:
# Embedding -> SpatialDropout1D -> BiLSTM -> Conv1D -> GlobalMaxPooling1D
# -> Dense -> Dropout -> softmax over the encoded classes.

# Size the embedding table to the ids the tokenizer can actually emit.
# `word_index` holds every word seen during fitting, but with `num_words` set
# texts_to_sequences only outputs ids below that cap (rarer words become the
# OOV id), so any rows past the cap would never be looked up or trained.
vocab_cap = config['data']['vocab_size']
embedding_rows = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding value
if vocab_cap:
    embedding_rows = min(embedding_rows, vocab_cap)

inp = Input(shape=(config['data']['max_len'],))
x = Embedding(input_dim=embedding_rows,
              output_dim=config['model']['embedding_dim'])(inp)
x = SpatialDropout1D(0.4)(x)
x = Bidirectional(LSTM(config['model']['lstm_units'], return_sequences=True))(x)
x = Conv1D(config['model']['cnn_filters'], kernel_size=3, padding='same', activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(config['model']['dense_units'], activation='relu')(x)
x = Dropout(config['model']['dropout_rate'])(x)
# One output unit per label known to the LabelEncoder.
out = Dense(len(le.classes_), activation='softmax')(x)

model = tf.keras.Model(inputs=inp, outputs=out)
# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Initial experiment: 10 epochs is enough for a first look.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=config['training']['batch_size'],
    epochs=10,
    verbose=1,
    validation_data=(X_val, y_val),
)

In [None]:
# Plot Accuracy: per-epoch training vs. validation accuracy recorded by fit().
# An explicit Axes is used and both axes are labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Train Acc')
ax.plot(history.history['val_accuracy'], label='Val Acc')
ax.set_title('Experiment Results')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()