In [17]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
tokenizer = None
model = None
FILEPATH = 'indo.csv'
SLANGPATH = 'colloquial-indonesian-lexicon.csv'
slang_dictionary = None
sentiment_amount = 2
sentiments = ["positive", "negative"]
dataset = pd.DataFrame
vectorizer = None

In [18]:
def get_dataset():
    dataset = pd.read_csv(FILEPATH)

    for index, data in dataset.iterrows():
        text = data['Text']
        text = str(text).lower()
        word_list = word_tokenize(text)
        id_stopwords = stopwords.words('indonesian')
        word_list = [word for word in word_list if word not in id_stopwords]
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        word_list = [stemmer.stem(word) for word in word_list]
        text = ' '.join(word_list)
        dataset.at[index, 'Text'] = text

    dataset = dataset.sample(frac = 1)

    return dataset

def initialize_slang_dictionary():
    global slang_dictionary
    slang_dictionary = {}
    dataset = pd.read_csv(SLANGPATH)
    for _, data in dataset.iterrows():
        key = data['slang']
        value = data['formal']
        slang_dictionary[key] = value

def preprocess_data(text_list, sentiments):
    global vectorizer
    vectorizer = CountVectorizer(max_features=1000)
    vec = vectorizer.fit_transform(text_list)

    return vec, sentiments

def initialize_tokenizer():
    dataset = get_dataset()
    global tokenizer
    tokenizer = Tokenizer(num_words=4000)
    tokenizer.fit_on_texts(dataset['Text'])

def initialize_model():
    global model
    model = MultinomialNB()

def get_word_count():
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    return len(tokenizer.word_index)

def load_model():
    global model
    if model == None:
        initialize_model()
    model.load_weights('multinomial.keras')

def predict_comment(text):
    global model
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    if model == None:
        initialize_model()
    sequences = tokenizer.texts_to_sequences(text)
    x = pad_sequences(sequences)
    y_pred = model.predict(x)
    y_pred = np.argmax(y_pred, axis=1)

    for index, comment in enumerate(text):
        print(f"{comment}: {sentiments[y_pred[index]]}")

In [19]:
def train_model():
    global dataset
    if dataset.empty:
        dataset = get_dataset()
    x, y = preprocess_data(dataset['Text'], dataset['Sentiment'])

    global model
    initialize_model()

    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)

    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, average='weighted'))
    print('Recall:', recall_score(y_test, y_pred, average='weighted'))
    print('F1 score:', f1_score(y_test, y_pred, average='weighted'))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

In [20]:
global dataset
dataset = get_dataset()

In [21]:
train_model()

Accuracy:  0.7521786492374728
Precision: 0.7520157724209799
Recall: 0.7521786492374728
F1 score: 0.7520915988653188
Confusion Matrix: 
 [[807 225]
 [230 574]]


In [22]:
global dataset

dataset = pd.read_csv('indo.csv')
vectorizer = CountVectorizer()

dataset = dataset.dropna()
x = vectorizer.fit_transform(dataset['Text'])
le = LabelEncoder()
y = le.fit_transform(dataset['Sentiment'])
global model
initialize_model()

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 score:', f1_score(y_test, y_pred, average='weighted'))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy:  0.7766884531590414
Precision: 0.7766126446004125
Recall: 0.7766884531590414
F1 score: 0.776649617549059
Confusion Matrix: 
 [[856 204]
 [206 570]]
