In [150]:
# All necessary dependencies (import)
import pickle
import re
import string
import emoji

from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### Load Models

In [151]:
# Model Architecture: rebuild the Keras model from its serialized JSON description.
with open('./sentiment_models/model_architecture.json', 'r') as json_file:
    model_architecture = json_file.read()

model = model_from_json(model_architecture)

# Model Weights: model_from_json only restores the layer graph, so the weights
# are loaded separately.
model.load_weights('./sentiment_models/model_weights.h5')

# Tokenizer Words
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load tokenizer.pickle from a trusted source.
with open('./sentiment_models/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Load Stop-Words (one entry per line)
with open("./sentiment_models/stopwords.txt", "r") as my_file:
    data = my_file.read()

# Strip each entry and drop blank lines: an entry with stray whitespace would
# otherwise never match a whitespace-split token, and a trailing newline in the
# file would add an empty entry.
stopwords = [entry.strip() for entry in data.splitlines() if entry.strip()]

# Initialize stemmer (Sastrawi)
stemmer = StemmerFactory().create_stemmer()

### Preprocessing

In [152]:
# Hyperparameters
# Every input sequence is padded/truncated to this many tokens before prediction.
max_length = 120

# Vocabulary cap and out-of-vocabulary placeholder for a Keras Tokenizer.
# NOTE(review): presumably these mirror the values used at training time - confirm.
oov_tok = '<OOV>'
vocab_size = 5000

In [153]:
# Pre-processing steps (removing symbols, numbers, converting emojis, stemming, etc.)
def preprocess_text(text):
    """Normalize a raw review string into the cleaned form fed to the tokenizer.

    The steps run in this order:
      1. lowercase, drop digits and delete ASCII punctuation;
      2. collapse whitespace and drop superscript/subscript/modifier/combining
         characters;
      3. turn emojis into their textual names (``emoji.demojize``) and replace
         the resulting ``:`` / ``_`` delimiters (and any other punctuation)
         with spaces;
      4. collapse whitespace again and drop remaining non-ASCII characters;
      5. lowercase again (emoji names may contain capitals), remove stop-words
         and stem the result with the module-level ``stemmer``.

    This function only cleans text. Converting the result to integer sequences
    is left to the caller, which uses the pre-trained module-level tokenizer.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned, stop-word-filtered and stemmed text.
    """
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    # Collapse whitespace now: `\s` also matches non-ASCII whitespace, which
    # must become a plain space before the non-ASCII removal further down.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\u2070-\u209F\u00B2-\u00B3\u00B9-\u00BF\u02B0-\u036F\u1AB0-\u1AFF\u2090-\u2094]+', '', text)
    text = emoji.demojize(text)
    # demojize yields tokens such as ":thumbs_up:"; replace the delimiters with
    # spaces so the emoji name becomes ordinary words.
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\u0000-\u007F\uD800-\uDBFF\uDC00-\uDFFF]+', '', text)

    # Emoji names can reintroduce uppercase letters, so lowercase once more.
    text = text.lower()
    kept_words = [word for word in text.split() if word not in stopwords]
    text = " ".join(kept_words)
    text = stemmer.stem(text)

    return text

### Predicting

In [154]:
# Prediction function
def predict_sentiment(input_text):
    """Score the sentiment of a single piece of text.

    The text is cleaned with ``preprocess_text``, converted to an integer
    sequence by the module-level ``tokenizer``, padded to ``max_length`` and
    passed to the module-level ``model``.

    Args:
        input_text (str): Raw text to score.

    Returns:
        The result of ``model.predict`` multiplied by 5.
        NOTE(review): presumably the model emits a value in [0, 1], giving a
        0-5 score - confirm against the training setup.
    """
    cleaned_text = preprocess_text(input_text)

    # texts_to_sequences expects a list of texts, hence the one-element list.
    token_ids = tokenizer.texts_to_sequences([cleaned_text])
    padded_ids = pad_sequences(token_ids, maxlen=max_length)

    raw_prediction = model.predict(padded_ids)
    return raw_prediction * 5

In [155]:
# Smoke test of the prediction pipeline.
# Score interpretation: 0-2.5 (Negative), 2.5-5 (Positive)
sample_review = "bagus banget pelayanannya, mana 24 jam lagi"
input_text = predict_sentiment(sample_review)
print(input_text)

[[4.4764047]]
