In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import Sequential, regularizers
from keras.layers import LSTM, Embedding, Dropout, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from apiclient import discovery
tokenizer = None
model = None
FILEPATH = 'indo.csv'
SLANGPATH = 'colloquial-indonesian-lexicon.csv'
slang_dictionary = None
sentiment_amount = 2
sentiments = ["positive", "negative"]

In [2]:
def get_dataset():
    dataset = pd.read_csv(FILEPATH)

    for index, data in dataset.iterrows():
        text = data['Text']
        text = str(text).lower()
        word_list = word_tokenize(text)
        temp = []
        word_list = temp
        id_stopwords = stopwords.words('indonesian')
        word_list = [word for word in word_list if word not in id_stopwords]
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        word_list = [stemmer.stem(word) for word in word_list]
        text = ' '.join(word_list)
        dataset.at[index, 'Tweet'] = text

    dataset = dataset.sample(frac = 1)

    return dataset

def initialize_slang_dictionary():
    global slang_dictionary
    slang_dictionary = {}
    dataset = pd.read_csv(SLANGPATH)
    for _, data in dataset.iterrows():
        key = data['slang']
        value = data['formal']
        slang_dictionary[key] = value

def train_model():
    dataset = get_dataset()
    x, y = preprocess_data(dataset['Text'], dataset['Sentiment'])

    global model
    initialize_model()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')

    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

    model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), callbacks=[early_stopping])
    
    score = model.evaluate(x_test, y_test, verbose = 1)
    y_pred = model.predict(x_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_test_argmax = np.argmax(y_test, axis=1)

    print('Score: ', score)
    print('Accuracy: ', accuracy_score(y_test_argmax, y_pred))
    print('Precision:', precision_score(y_test_argmax, y_pred, average='weighted'))
    print('Recall:', recall_score(y_test_argmax, y_pred, average='weighted'))
    print('F1 score:', f1_score(y_test_argmax, y_pred, average='weighted'))
    print('Confusion Matrix: \n', confusion_matrix(y_test_argmax, y_pred))
    
    model.save('LSTM.keras')

def preprocess_data(text_list, sentiments):
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()

    sequences = tokenizer.texts_to_sequences(text_list)

    x = pad_sequences(sequences)

    le = LabelEncoder()
    y = le.fit_transform(sentiments)
    y = to_categorical(y)

    return x, y

def initialize_tokenizer():
    dataset = get_dataset()
    global tokenizer
    tokenizer = Tokenizer(num_words=4000)
    tokenizer.fit_on_texts(dataset['Text'])

def initialize_model():
    global model
    word_count = get_word_count()
    model = Sequential([
        Embedding(input_dim=word_count + 1, output_dim=200),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        LSTM(128),
        Dense(64, activation='relu', 
            kernel_regularizer=regularizers.l2(0.01), 
            activity_regularizer=regularizers.l1(0.01)),
        Dense(sentiment_amount, activation='softmax')
    ])

def get_word_count():
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    return len(tokenizer.word_index)

def load_model():
    global model
    if model == None:
        initialize_model()
    model.load_weights('LSTM.keras')

def predict_comment(text):
    global model
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    if model == None:
        initialize_model()
    sequences = tokenizer.texts_to_sequences(text)
    x = pad_sequences(sequences)
    y_pred = model.predict(x)
    y_pred = np.argmax(y_pred, axis=1)

    for index, comment in enumerate(text):
        print(f"{comment}: {sentiments[y_pred[index]]}")

In [3]:
train_model()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
Score:  [0.6879544854164124, 0.7783224582672119]
Accuracy:  0.778322440087146
Precision: 0.7772386009337106
Recall: 0.778322440087146
F1 score: 0.7763638371145903
Confusion Matrix: 
 [[900 165]
 [242 529]]
