# Import

In [434]:
import re
import string
import csv
from io import StringIO
import pandas as pd
import swifter
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from keras.layers import MaxPooling1D
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.utils import resample
from keras.layers import GRU
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Bidirectional, LSTM,
    Conv1D, GlobalMaxPooling1D,
    Dense, Dropout
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2


# Data Loading & Wrangling

In [17]:
app_reviews_df = pd.read_csv("hasil_scrapping.csv", delimiter=",")

In [18]:
app_reviews_df = app_reviews_df[['content']]
app_reviews_df.head()

Unnamed: 0,content
0,terlalu terlalu terlalu... apk yg tidak bisa d...
1,"Gak usah pasang tarif tarif hemat, soalnya par..."
2,tinggal 2menit lg driver sampe di lokasi tiba-...
3,sebagai pengguna lama baru kali ini saya kecew...
4,susah untuk dpt driver walaupun di map ada ban...


In [19]:
def analisis_data(df):
    print(df.info())
    print("\n-------------------------------------------------------------------------------\n")
    print(f"Data Kosong : \n\n{df.isna().sum()}")
    print("\n-------------------------------------------------------------------------------\n")
    print(f"Data ganda  : {df.duplicated().sum()}")
    print("\n-------------------------------------------------------------------------------\n")
    print(df.describe)

In [20]:
analisis_data(app_reviews_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  50000 non-null  object
dtypes: object(1)
memory usage: 390.8+ KB
None

-------------------------------------------------------------------------------

Data Kosong : 

content    0
dtype: int64

-------------------------------------------------------------------------------

Data ganda  : 165

-------------------------------------------------------------------------------

<bound method NDFrame.describe of                                                  content
0      terlalu terlalu terlalu... apk yg tidak bisa d...
1      Gak usah pasang tarif tarif hemat, soalnya par...
2      tinggal 2menit lg driver sampe di lokasi tiba-...
3      sebagai pengguna lama baru kali ini saya kecew...
4      susah untuk dpt driver walaupun di map ada ban...
...                                                 

In [21]:
cleaned_df = app_reviews_df.drop_duplicates()
analisis_data(cleaned_df)

<class 'pandas.core.frame.DataFrame'>
Index: 49835 entries, 0 to 49999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  49835 non-null  object
dtypes: object(1)
memory usage: 778.7+ KB
None

-------------------------------------------------------------------------------

Data Kosong : 

content    0
dtype: int64

-------------------------------------------------------------------------------

Data ganda  : 0

-------------------------------------------------------------------------------

<bound method NDFrame.describe of                                                  content
0      terlalu terlalu terlalu... apk yg tidak bisa d...
1      Gak usah pasang tarif tarif hemat, soalnya par...
2      tinggal 2menit lg driver sampe di lokasi tiba-...
3      sebagai pengguna lama baru kali ini saya kecew...
4      susah untuk dpt driver walaupun di map ada ban...
...                                                  ...
49

# Preprocessing Text

In [22]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) 
    text = re.sub(r'#[A-Za-z0-9]+', '', text) 
    text = re.sub(r'RT[\s]', '', text) 
    text = re.sub(r"http\S+", '', text) 
    text = re.sub(r'[0-9]+', '', text) 
    text = re.sub(r'[^\w\s]', '', text) 
 
    text = text.replace('\n', ' ') 
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = text.strip(' ') 
    return text
 
def casefoldingText(text): 
    text = text.lower()
    return text
 
def tokenizingText(text): 
    text = word_tokenize(text)
    return text
 
def filteringText(text): 
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)

    keep_words = {"baik", "buruk", "jelek", "bagus", "senang", "marah", "puas", "kecewa"}
    listStopwords = listStopwords - keep_words

    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    
    filtered = [txt for txt in text if txt not in listStopwords]
    return filtered


factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemmingText(text): 
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
 
def toSentence(list_words): 
    sentence = ' '.join(word for word in list_words)
    return sentence

In [23]:
slangwords = {}
with open('kamusalay.csv', mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row) >= 2:
            slang = row[0].strip().lower()
            normal = row[1].strip().lower()
            slangwords[slang] = normal

def fix_slangwords(text):
    words = text.split()
    fixed_words = []
 
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)
 
    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [None]:
cleaned_df['text_clean'] = cleaned_df['content'].swifter.progress_bar(True).apply(cleaningText)
cleaned_df['text_casefoldingText'] = cleaned_df['text_clean'].swifter.progress_bar(True).apply(casefoldingText)
cleaned_df['text_slangwords'] = cleaned_df['text_casefoldingText'].swifter.progress_bar(True).apply(fix_slangwords)
cleaned_df['text_tokenizingText'] = cleaned_df['text_slangwords'].swifter.progress_bar(True).apply(tokenizingText)
cleaned_df['text_stopword'] = cleaned_df['text_tokenizingText'].swifter.progress_bar(True).apply(filteringText)
cleaned_df['text_stemming'] = cleaned_df['text_stopword'].swifter.progress_bar(True).apply(stemmingText)
cleaned_df['text_akhir'] = cleaned_df['text_stemming'].swifter.progress_bar(True).apply(toSentence)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_clean'] = cleaned_df['content'].swifter.progress_bar(True).apply(cleaningText)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_casefoldingText'] = cleaned_df['text_clean'].swifter.progress_bar(True).apply(casefoldingText)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_slangwords'] = cleaned_df['text_casefoldingText'].swifter.progress_bar(True).apply(fix_slangwords)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_tokenizingText'] = cleaned_df['text_slangwords'].swifter.progress_bar(True).apply(tokenizingText)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_stopword'] = cleaned_df['text_tokenizingText'].swifter.progress_bar(True).apply(filteringText)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_stemming'] = cleaned_df['text_stopword'].swifter.progress_bar(True).apply(stemmingText)


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['text_akhir'] = cleaned_df['text_stemming'].swifter.progress_bar(True).apply(toSentence)


In [None]:
import csv
import requests
from io import StringIO

# UNTUK SKIP YANG TERDAPAT DI LEXICON
words_to_exclude = {"aplikasi", "aplikasinya", "aja"}

def load_lexicon_from_tsv(url, skip_words=None):
    lexicon = {}
    response = requests.get(url)
    if response.status_code == 200:
        tsv_content = response.text.strip().split('\n')
        reader = csv.reader(tsv_content, delimiter='\t')
        next(reader)  # skip header
        for row in reader:
            if len(row) >= 2:
                word = row[0].strip().lower()
                weight = int(row[1])
                if not skip_words or word not in skip_words:
                    lexicon[word] = weight
        return lexicon
    else:
        raise Exception(f"Gagal memuat lexicon dari {url}")

url_positive = "https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv"
url_negative = "https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv"

lexicon_positive = load_lexicon_from_tsv(url_positive, skip_words=words_to_exclude)
lexicon_negative = load_lexicon_from_tsv(url_negative, skip_words=set(lexicon_positive.keys()) | words_to_exclude)

print(f"Lexicon Positif: {len(lexicon_positive)} kata")
print(f"Lexicon Negatif: {len(lexicon_negative)} kata")

overlap = set(lexicon_positive.keys()) & set(lexicon_negative.keys())
if overlap:
    print("Duplikat ditemukan:", overlap)
else:
    print("Tidak ada duplikat antar lexicon")

Lexicon Positif: 3606 kata
Lexicon Negatif: 5463 kata
Tidak ada duplikat antar lexicon


In [None]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word in text.split():
        if word in lexicon_positive:
            score += lexicon_positive[word]
        if word in lexicon_negative:
            score += lexicon_negative[word]
    
    #Memperluas Neutral
    if score > 2:
        polarity = 'positive'
    elif score < -1:
        polarity = 'negative'
    else:
        polarity = 'neutral'
    
    return polarity


In [394]:
swifter.config.display_progressbar = True
cleaned_df['polarity'] = cleaned_df['text_akhir'].swifter.apply(sentiment_analysis_lexicon_indonesia)
print(cleaned_df['polarity'].value_counts())


Pandas Apply:   0%|          | 0/49835 [00:00<?, ?it/s]

polarity
positive    32075
negative     9869
neutral      7891
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['polarity'] = cleaned_df['text_akhir'].swifter.apply(sentiment_analysis_lexicon_indonesia)


In [395]:
text_input = "biasa aja"
result = sentiment_analysis_lexicon_indonesia(text_input)
print("Prediksi Sentimen:", result)


Prediksi Sentimen: neutral


# Training

### Traditional Machine Learning

In [402]:

X = cleaned_df['text_akhir']
y = cleaned_df['polarity']

tfidf = TfidfVectorizer(max_features=7000, min_df=2, max_df=0.9)
X_tfidf = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [403]:
logistic_regression = LogisticRegression()
 
logistic_regression.fit(X_train.toarray(), y_train)
 
y_pred_train_lr = logistic_regression.predict(X_train.toarray())
y_pred_test_lr = logistic_regression.predict(X_test.toarray())
 
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
 
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
 
print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 0.9027290057188723
Logistic Regression - accuracy_test: 0.8557238888331494


### DeepLearning


In [404]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [405]:
MAX_FEATURES = 7000
MAX_LEN = 150
tokenizer = Tokenizer(num_words=MAX_FEATURES, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN)

In [406]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

In [422]:
model_lstm = Sequential([
    Embedding(MAX_FEATURES, 128, input_length=MAX_LEN),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

history = model_lstm.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop, lr_scheduler]
)


Epoch 1/10




[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 290ms/step - accuracy: 0.7390 - loss: 0.6545 - val_accuracy: 0.8648 - val_loss: 0.3282 - learning_rate: 0.0010
Epoch 2/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 276ms/step - accuracy: 0.9000 - loss: 0.2508 - val_accuracy: 0.8896 - val_loss: 0.2678 - learning_rate: 0.0010
Epoch 3/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 305ms/step - accuracy: 0.9233 - loss: 0.1931 - val_accuracy: 0.9043 - val_loss: 0.2430 - learning_rate: 0.0010
Epoch 4/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 277ms/step - accuracy: 0.9458 - loss: 0.1435 - val_accuracy: 0.9052 - val_loss: 0.2627 - learning_rate: 0.0010
Epoch 5/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 282ms/step - accuracy: 0.9561 - loss: 0.1176 - val_accuracy: 0.9044 - val_loss: 0.2966 - learning_rate: 0.0010
Epoch 6/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [423]:
train_loss, train_accuracy = model_lstm.evaluate(X_train, y_train, verbose=0)
print("LSTM Train Accuracy:", train_accuracy)

test_loss, test_accuracy = model_lstm.evaluate(X_test, y_test, verbose=0)
print("LSTM Test Accuracy:", test_accuracy)

LSTM Train Accuracy: 0.953875720500946
LSTM Test Accuracy: 0.9042873382568359


In [424]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.3, random_state=42)

In [425]:
def build_cnn_lstm():
    model = Sequential([
        Embedding(MAX_FEATURES, 128),
        Conv1D(64, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [426]:
callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
             ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)]

model_cnn_lstm = build_cnn_lstm()
model_cnn_lstm.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=callbacks)

Epoch 1/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 101ms/step - accuracy: 0.7190 - loss: 0.6869 - val_accuracy: 0.8816 - val_loss: 0.2894 - learning_rate: 0.0010
Epoch 2/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 102ms/step - accuracy: 0.9013 - loss: 0.2508 - val_accuracy: 0.8834 - val_loss: 0.2863 - learning_rate: 0.0010
Epoch 3/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 103ms/step - accuracy: 0.9343 - loss: 0.1706 - val_accuracy: 0.9019 - val_loss: 0.2530 - learning_rate: 0.0010
Epoch 4/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 106ms/step - accuracy: 0.9539 - loss: 0.1262 - val_accuracy: 0.9044 - val_loss: 0.2661 - learning_rate: 0.0010
Epoch 5/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 105ms/step - accuracy: 0.9641 - loss: 0.1028 - val_accuracy: 0.9036 - val_loss: 0.2879 - learning_rate: 0.0010
Epoch 6/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x22079bc9e50>

In [437]:
train_loss, train_accuracy = model_cnn_lstm.evaluate(X_train, y_train, verbose=0)
print("LTSM-CNN Train Accuracy:", train_accuracy)

test_loss, test_accuracy = model_cnn_lstm.evaluate(X_test, y_test, verbose=0)
print("LTSM-CNN Test Accuracy:", test_accuracy)


LTSM-CNN Train Accuracy: 0.9627049565315247
LTSM-CNN Test Accuracy: 0.9019463658332825


In [431]:
def build_gru():
    model = Sequential([
        Embedding(MAX_FEATURES, 128),
        Bidirectional(GRU(128)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [432]:
model_gru = build_gru()
model_gru.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=callbacks)

Epoch 1/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 314ms/step - accuracy: 0.7396 - loss: 0.6458 - val_accuracy: 0.8673 - val_loss: 0.3162 - learning_rate: 0.0010
Epoch 2/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 307ms/step - accuracy: 0.8878 - loss: 0.2749 - val_accuracy: 0.8731 - val_loss: 0.3058 - learning_rate: 0.0010
Epoch 3/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 302ms/step - accuracy: 0.9225 - loss: 0.1968 - val_accuracy: 0.9040 - val_loss: 0.2438 - learning_rate: 0.0010
Epoch 4/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 303ms/step - accuracy: 0.9438 - loss: 0.1455 - val_accuracy: 0.9032 - val_loss: 0.2433 - learning_rate: 0.0010
Epoch 5/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 304ms/step - accuracy: 0.9540 - loss: 0.1227 - val_accuracy: 0.8881 - val_loss: 0.3066 - learning_rate: 0.0010
Epoch 6/10
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x220703e6010>

In [433]:
train_loss, train_accuracy = model_gru.evaluate(X_train, y_train, verbose=0)
print("GRU Train Accuracy:", train_accuracy)

test_loss, test_accuracy = model_gru.evaluate(X_test, y_test, verbose=0)
print("GRU Test Accuracy:", test_accuracy)


GRU Train Accuracy: 0.9643676280975342
GRU Test Accuracy: 0.9031503200531006


In [435]:
probs_lstm = model_lstm.predict(X_test)
probs_cnn_lstm = model_cnn_lstm.predict(X_test)
probs_gru = model_gru.predict(X_test)

avg_probs = (probs_lstm + probs_cnn_lstm + probs_gru) / 3.0

ensemble_preds = np.argmax(avg_probs, axis=1)

[1m468/468[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 32ms/step
[1m468/468[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step
[1m468/468[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 26ms/step


In [436]:
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.9227476422981741


In [444]:
model_lstm.save("model_lstm.h5")
model_cnn_lstm.save("model_cnn_lstm.h5")
model_gru.save("model_gru.h5")



# inference 

In [None]:
def predict_sentiment_probs(text, model, tokenizer, max_len):
    # Tokenize and pad
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    
    # Predict probabilities
    probs = model.predict(padded, verbose=0)[0] 
    return probs


In [None]:
import numpy as np

def predict_ensemble_sentiment(text, models, tokenizer, max_len):
    
    all_probs = [predict_sentiment_probs(text, model, tokenizer, max_len) for model in models]
    
    avg_probs = np.mean(all_probs, axis=0)
    
    final_label = np.argmax(avg_probs)

    return final_label, avg_probs


In [None]:
text_input = "aplikasinya biasa aja"

models = [model_lstm, model_cnn_lstm, model_gru]  
label_index = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}

pred_label, probs = predict_ensemble_sentiment(text_input, models, tokenizer, MAX_LEN)
print("Ensemble Prediksi Sentimen:", label_index[pred_label])
print("Probabilities:", probs)


Ensemble Prediksi Sentimen: Netral
Probabilities: [0.01215679 0.95123476 0.03660843]


In [None]:
text_input = "aplikasinya sangat bagus sekali"

models = [model_lstm, model_cnn_lstm, model_gru]  
label_index = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}

pred_label, probs = predict_ensemble_sentiment(text_input, models, tokenizer, MAX_LEN)
print("Ensemble Prediksi Sentimen:", label_index[pred_label])
print("Probabilities:", probs)


Ensemble Prediksi Sentimen: Positif
Probabilities: [2.9339817e-06 1.1116184e-02 9.8888087e-01]


In [None]:
text_input = "aplikasinya jelek banget"

models = [model_lstm, model_cnn_lstm, model_gru]  
label_index = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}

pred_label, probs = predict_ensemble_sentiment(text_input, models, tokenizer, MAX_LEN)
print("Ensemble Prediksi Sentimen:", label_index[pred_label])
print("Probabilities:", probs)


Ensemble Prediksi Sentimen: Negatif
Probabilities: [9.9078912e-01 9.2096915e-03 1.1557902e-06]


In [445]:
!pip freeze > requirements.txt