In [None]:
# Import libraries
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.models import load_model
import gradio as gr  # optional: only needed for the user interface
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the labelled dataset from the Excel workbook and preview its first rows.
df = pd.read_excel('testing_converted.xlsx')

print("Dataframe head:", df.head(), sep="\n")

def clean_text(text):
    """Normalise one raw text value before tokenisation.

    Processing steps, in order:
      1. missing values (``None``, ``NaN``, ``pd.NA``) short-circuit to ``''``
         so they do not become the literal tokens "none"/"nan"/"na";
      2. the value is converted with ``str()`` and lower-cased;
      3. URL-like runs starting with ``http`` or ``www`` are removed up to the
         next whitespace;
      4. every character that is not an ASCII letter or whitespace is deleted
         (digits and punctuation vanish without inserting a space);
      5. leading/trailing whitespace is trimmed.

    Args:
        text: Any scalar cell value; non-strings are stringified.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # A missing cell must not be stringified, otherwise "nan" enters the vocabulary.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    text = str(text).lower()
    # `http\S+` already covers `https...`; no anchors are used, so no regex flags are needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Text is already lower-case, so only a-z and whitespace need to be kept.
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

# The first column is treated as the raw text and the second as the sentiment
# label; both are selected by position rather than by column name.
raw_text_column = df.iloc[:, 0]
df['clean_text'] = raw_text_column.apply(clean_text)

# Read the label column only after 'clean_text' has been attached, exactly as before.
sentiment_column = df.iloc[:, 1]
df['Sentiment'] = sentiment_column

print("\nDataframe head after cleaning text:", df.head(), sep="\n")

print("\nInitial Sentiment Distribution:", df['Sentiment'].value_counts(), sep="\n")

In [None]:
# --- Configuration ------------------------------------------------------------
# Locations where the fitted tokenizer and the trained model are persisted.
# NOTE(review): these look like Colab Google Drive paths - confirm the drive is mounted.
TOKENIZER_PATH = '/content/drive/MyDrive/sentiment_model/tokenizer.pkl'
MODEL_PATH = '/content/drive/MyDrive/sentiment_model/sentiment_lstm_model.h5'
MAX_WORDS = 5000  # vocabulary cap given to the Tokenizer (and used as Embedding input_dim)
MAX_LEN = 100     # every sequence is padded/truncated to this many tokens

# --- Label encoding -----------------------------------------------------------
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
# Normalise case and surrounding whitespace first so values such as 'Positive'
# or ' neutral ' are mapped instead of being silently discarded.
df['label'] = df['Sentiment'].astype(str).str.strip().str.lower().map(label_mapping)

if df['label'].isnull().any():
    print("Warning: Some sentiment labels could not be mapped. These rows will be dropped.")
    df.dropna(subset=['label'], inplace=True)

if df.empty:
    raise ValueError(
        "No rows left after label mapping; expected 'Sentiment' values in "
        f"{list(label_mapping)}."
    )

df['label'] = df['label'].astype(int)

X_original_texts = df['clean_text']
y = to_categorical(df['label'], num_classes=len(label_mapping))
y_labels_for_weights = np.argmax(y, axis=1)

# --- Train/test split ---------------------------------------------------------
# Split the raw texts BEFORE fitting the tokenizer so the vocabulary is learned
# from training rows only. The partition is identical to splitting the padded
# matrix: it depends only on the row count, the stratify labels and random_state.
# NOTE: because the vocabulary changes, a model/tokenizer pair saved by an
# earlier version of this notebook should be deleted and retrained.
X_train_original, X_test_original, y_train, y_test = train_test_split(
    X_original_texts, y, test_size=0.1, random_state=42, stratify=y_labels_for_weights
)

# --- Tokenisation -------------------------------------------------------------
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_original)

X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_original), maxlen=MAX_LEN, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test_original), maxlen=MAX_LEN, padding='post')
# Full encoded matrix, kept for any later cell that still refers to `X`.
X = pad_sequences(tokenizer.texts_to_sequences(X_original_texts), maxlen=MAX_LEN, padding='post')

# --- Class weights ------------------------------------------------------------
# Computed from the training labels only. Each weight is keyed by its actual
# class id: enumerate() would shift the keys whenever a class is absent
# (e.g. classes [0, 2] would wrongly become keys 0 and 1).
train_labels = np.argmax(y_train, axis=1)
present_classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=train_labels)
class_weight_dict = {int(cls): float(weight) for cls, weight in zip(present_classes, class_weights)}
print(f"Calculated class weights: {class_weight_dict}")

# --- Sanity output ------------------------------------------------------------
print(f"\nShape of X_train (tokenized): {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test (tokenized): {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")
print(f"Distribution of classes in y_train: {Counter(np.argmax(y_train, axis=1))}")
print(f"Distribution of classes in y_test: {Counter(np.argmax(y_test, axis=1))}")
print("\n--- 5 Data Latih Teks Asli (Sebelum Tokenisasi) ---")
print('\n'.join(X_train_original.head().to_list()))
print("\n--- 5 Data Uji Teks Asli (Sebelum Tokenisasi) ---")
print('\n'.join(X_test_original.head().to_list()))

In [None]:
# --- Load a persisted model, or build/train/persist a new one -------------------
if os.path.exists(MODEL_PATH):
    print("Loading existing model...")
    model = load_model(MODEL_PATH)
    # SECURITY: pickle.load can execute arbitrary code - only load a tokenizer
    # file that this notebook wrote itself.
    with open(TOKENIZER_PATH, 'rb') as handle:
        tokenizer = pickle.load(handle)
    # The arrays prepared earlier were encoded with the tokenizer fitted in this
    # session. Re-encode them with the loaded tokenizer so the word ids fed to
    # the loaded model match the ids it was trained on.
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_original), maxlen=MAX_LEN, padding='post')
    X_test = pad_sequences(tokenizer.texts_to_sequences(X_test_original), maxlen=MAX_LEN, padding='post')
else:
    print("Building and training new model...")
    # Create the output folders up front so a missing directory fails (or is
    # fixed) before training time is spent, not after it.
    for artifact_path in (MODEL_PATH, TOKENIZER_PATH):
        artifact_dir = os.path.dirname(artifact_path)
        if artifact_dir:
            os.makedirs(artifact_dir, exist_ok=True)

    model = Sequential()
    model.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(len(label_mapping), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("\nModel Summary:")
    model.summary()

    # NOTE(review): the test split doubles as validation data here. That is
    # harmless while nothing is selected on val metrics, but a separate
    # validation split is needed before adding early stopping or checkpoints.
    history = model.fit(X_train, y_train, epochs=10, batch_size=64,
                        validation_data=(X_test, y_test),
                        class_weight=class_weight_dict)

    model.save(MODEL_PATH)
    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("\nModel and tokenizer saved.")

# --- Evaluation on the held-out split -------------------------------------------
print("\nEvaluating model on test set...")
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")

# classification_report / confusion_matrix are already imported in the first cell.
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Pass the full class-id list explicitly so the matrix and the report always
# have one row per class, even if a class is absent from the test split or is
# never predicted; otherwise tick labels and target_names would misalign.
labels = list(label_mapping.keys())
class_ids = list(label_mapping.values())

cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
print("\nClassification Report on Test Set:")
print(classification_report(y_true_classes, y_pred_classes, labels=class_ids,
                            target_names=labels, zero_division=0))