### Inference Model Terbaik

Program ini menganalisis sentimen teks menggunakan model CNN + LSTM. Teks yang dimasukkan oleh pengguna akan melalui tahap preprocessing, seperti penghapusan URL, mention, hashtag, tanda baca, dan stopwords, serta konversi ke huruf kecil. Setelah itu, teks dikonversi menjadi urutan indeks kata menggunakan Tokenizer, disamakan panjangnya dengan padding (pad_sequences), lalu diproses oleh model CNN + LSTM untuk menentukan sentimennya. Jika kelas dengan probabilitas tertinggi adalah 0 atau 1, maka sentimen dilaporkan sebagai negatif/netral (❌ NEGATIVE_NEUTRAL), sedangkan kelas lainnya dilaporkan sebagai positif (✅ POSITIVE). Dengan kombinasi preprocessing yang baik, representasi fitur yang optimal, serta model yang telah disempurnakan, program ini mampu memberikan hasil analisis sentimen dengan akurasi tinggi. 🚀

In [1]:
import numpy as np
import re
import nltk
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Download the NLTK stopword lists (if not already downloaded); filteringText
# reads the 'indonesian' list from this corpus.
nltk.download('stopwords')

# Load the trained CNN+LSTM model
model_cnn_lstm = load_model('/content/text_classification_CNN+LSTM_model.h5')  # Update path as needed

# Define the tokenizer and max sequence length used during training
# NOTE(review): this creates a brand-new Tokenizer; nothing in this cell fits
# it or loads the fitted one from training (the fit_on_texts call below is
# commented out). Presumably its vocabulary is therefore empty at prediction
# time and every word maps to no index -- confirm, and load/fit the training
# tokenizer if so.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
max_sequence_length = 100  # Replace with the max_sequence_length used during training

# Assuming tokenizer is trained on the training dataset (if you're reusing the same tokenizer)
# tokenizer.fit_on_texts(X_train)  # Uncomment and train on the original dataset if needed

# Preprocessing functions
def cleaningText(text):
    """Normalize raw text before tokenization.

    Steps, in order: strip URLs, lowercase, strip @mentions and #hashtags,
    replace every remaining non-word / non-whitespace character with a
    space, then collapse runs of whitespace and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string with single spaces between words.
    """
    # Remove URLs. Match both http:// and https:// schemes as well as bare
    # "www." links; matching only "https" left plain-http links behind as
    # stray words ("http example com") once punctuation was stripped.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text, flags=re.IGNORECASE)
    # Convert to lowercase
    text = text.lower()
    # Remove mentions (@) and hashtags (#) together with the word attached
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    # Replace any remaining punctuation / symbols with a space
    text = re.sub(r"[^\w\s]", " ", text)
    # Collapse extra whitespace and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

def casefoldingText(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered

def fix_slangwords(text):
    """Placeholder for slang normalization; currently returns *text* unchanged."""
    # No slang dictionary is applied yet; add replacement logic here if necessary
    normalized = text
    return normalized

def tokenizingText(text):
    """Split *text* into a list of word tokens (runs of word characters)."""
    return RegexpTokenizer(r'\w+').tokenize(text)

def filteringText(text):
    """Drop Indonesian stopwords from a sequence of tokens.

    Args:
        text: Iterable of word tokens (despite the name, not a raw string).

    Returns:
        A list of the tokens that are not in NLTK's 'indonesian' stopword
        list, in their original order.
    """
    # Build a set once so each membership test is O(1) instead of a scan
    # over the whole stopword list for every token.
    stop = set(stopwords.words('indonesian'))
    return [word for word in text if word not in stop]

def toSentence(list_words):
    """Join the given words into one string separated by single spaces."""
    separator = ' '
    return separator.join(list_words)

# Function to prepare text for CNN+LSTM model
def prepare_input_for_cnn_lstm(text, tokenizer, max_sequence_length):
    """Turn one preprocessed sentence into a padded integer array.

    Args:
        text: The sentence to encode.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_sequence_length: Length passed to the padding step as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` on the encoded sentence, or an
        all-zero int32 array of shape ``(1, max_sequence_length)`` when the
        tokenizer produced no ids at all.
    """
    token_ids = tokenizer.texts_to_sequences([text])

    # Nothing was tokenized: hand back an all-zero row of the expected shape
    if token_ids == [[]]:
        return np.zeros((1, max_sequence_length), dtype=np.int32)

    # The tokenizer may yield None for a word; substitute 0 before padding
    cleaned_ids = []
    for row in token_ids:
        cleaned_ids.append([0 if idx is None else idx for idx in row])

    # dtype is given explicitly so the result is always int32
    return pad_sequences(cleaned_ids, maxlen=max_sequence_length, dtype=np.int32)

# Get user input
kalimat_baru = input("Masukkan kalimat baru: ")

# Preprocess the input text: clean -> lowercase -> slang fix -> tokenize ->
# stopword removal -> re-join into a single string. Each intermediate result
# is kept in its own variable so it can be inspected.
kalimat_baru_cleaned = cleaningText(kalimat_baru)
kalimat_baru_casefolded = casefoldingText(kalimat_baru_cleaned)
kalimat_baru_slangfixed = fix_slangwords(kalimat_baru_casefolded)
kalimat_baru_tokenized = tokenizingText(kalimat_baru_slangfixed)
kalimat_baru_filtered = filteringText(kalimat_baru_tokenized)
kalimat_baru_final = toSentence(kalimat_baru_filtered)

# Encode and pad the sentence for the CNN+LSTM model
X_kalimat_baru = prepare_input_for_cnn_lstm(kalimat_baru_final, tokenizer, max_sequence_length)

# Make predictions using the CNN+LSTM model
prediksi_sentimen = model_cnn_lstm.predict(X_kalimat_baru)

# Index of the highest-scoring class for the single input row
predicted_class_index = np.argmax(prediksi_sentimen[0])

# Classes 0 and 1 are reported together as "NEGATIVE_NEUTRAL"; any other
# index is reported as "POSITIVE".
# NOTE(review): presumably the model has three output classes
# (negative / neutral / positive) -- confirm against the training notebook.
sentiment_label = "NEGATIVE_NEUTRAL" if predicted_class_index in (0, 1) else "POSITIVE"

# Display the result (the leading emoji was mis-encoded in the message)
print(f"✅ Sentimen kalimat baru adalah {sentiment_label}.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Masukkan kalimat baru: aplikasinya biasa saja
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 518ms/step
✅ Sentimen kalimat baru adalah NEGATIVE_NEUTRAL.
