In [2]:
import string
import pickle
import pandas as pd
import random
import numpy as np
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# Trained model shared across cells; stays None until train_data() assigns it.
classifier = None
# CSV read by get_dataset(); the code accesses its 'Text' and 'Sentiment' columns.
FILEPATH = 'indo.csv'
import re

# Matches every character that is not an ASCII letter or a space;
# preprocess_data() deletes these matches from the lower-cased text.
pattern = r"[^A-Za-z ]"

In [3]:
def remove_punctuation(word_list):
    """Strip ASCII punctuation characters from every word.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list, in the same order and of the same length as the input,
        in which each word has lost every character that appears in
        ``string.punctuation``. A word made only of punctuation becomes ''.
    """
    symbols = string.punctuation
    return [
        ''.join(ch for ch in word if ch not in symbols)
        for word in word_list
    ]

# Combined stopword set, built on first use. Without the cache,
# stopwords.words('indonesian') is called again for every document and the
# resulting list is scanned linearly for every token.
_STOPWORD_CACHE = None


def _get_stopwords():
    """Return the Indonesian stopwords plus the extra slang pronouns as a set."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        more_stopwords = ['lu', 'gua', 'i', 'gue', 'lo', 'gw']
        _STOPWORD_CACHE = frozenset(stopwords.words('indonesian')) | frozenset(more_stopwords)
    return _STOPWORD_CACHE


def preprocess_data(text):
    """Turn one raw text into a list of normalised tokens.

    Steps: lower-case, normalise whitespace, delete every character matched by
    the module-level ``pattern``, tokenize, drop stopwords and non-alphabetic
    tokens, then lemmatize.

    Args:
        text: The raw document. Non-string values are converted with ``str()``.
            ``None`` and float NaN (what pandas yields for an empty CSV cell)
            are treated as an empty document.

    Returns:
        List of token strings, in document order (duplicates kept).
    """
    # A missing value would otherwise be stringified into the bogus token
    # 'none' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    # `pattern` deletes newlines and tabs too; turn all whitespace into plain
    # spaces first so words separated by a line break are not glued together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(pattern, '', text)

    excluded = _get_stopwords()
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so on
    # Indonesian tokens it is presumably close to a no-op -- confirm whether an
    # Indonesian stemmer was intended here.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in excluded and word.isalpha()
    ]

def extract_features(dataset, max_features=1000):
    """Build the vocabulary of the most frequent tokens in a corpus.

    Args:
        dataset: Iterable of raw texts; each one is passed through
            preprocess_data().
        max_features: Upper bound on the vocabulary size. Defaults to 1000,
            the value that used to be hard-coded.

    Returns:
        List of distinct words, most frequent first. It is shorter than
        ``max_features`` when the corpus has fewer distinct tokens.
    """
    fd = FreqDist()
    for text in dataset:
        fd.update(preprocess_data(text))
    # most_common() already yields each word exactly once, so no de-duplication
    # is needed. The previous list(set(...)) round trip only made the order of
    # the vocabulary differ between interpreter runs.
    return [word for word, _ in fd.most_common(max_features)]

def extract_dataset():
    """Load the corpus and convert it into labelled feature sets.

    The vocabulary comes from extract_features() over the 'Text' column. Each
    row then becomes a dict mapping every vocabulary word to True/False
    (whether the word occurs in that row's preprocessed text), paired with the
    row's 'Sentiment' value.

    Returns:
        List of ``(features, sentiment)`` tuples, one per row, in row order.
    """
    dataset = get_dataset()
    word_dictionary = extract_features(dataset['Text'])
    document = []
    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for text, sentiment in zip(dataset['Text'], dataset['Sentiment']):
        # Set membership keeps the per-row cost linear in the vocabulary size
        # rather than vocabulary size x token count.
        tokens = set(preprocess_data(text))
        features = {word: word in tokens for word in word_dictionary}
        document.append((features, sentiment))
    return document

def train_data(dataset, seed=None):
    """Train a Naive Bayes classifier, report its accuracy and pickle it.

    The data is shuffled and split 70/30 into training and testing parts. The
    trained model is stored in the module-level ``classifier`` and written to
    'model.pickle' in the current working directory.

    Args:
        dataset: List of ``(features, label)`` tuples. It is not modified.
        seed: Optional seed for the shuffle. When given, the split (and thus
            the reported accuracy) is reproducible. When None, the global
            ``random`` state is used, as before.

    Returns:
        None. The accuracy on the testing part is printed.
    """
    global classifier

    # Shuffle a copy so the caller's list keeps its order and re-running the
    # cell starts from the same input.
    shuffled = list(dataset)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    training_len = int(len(shuffled) * 0.7)
    training_data = shuffled[:training_len]
    testing_data = shuffled[training_len:]

    classifier = NaiveBayesClassifier.train(training_data)
    print(f"Accuracy: {accuracy(classifier, testing_data) * 100}%")

    # The context manager closes the file even if pickling raises.
    with open('model.pickle', 'wb') as file:
        pickle.dump(classifier, file)

def predict_comment(comment, word_dictionary=None):
    """Classify one comment with the trained model and print the label.

    Args:
        comment: Raw text to classify.
        word_dictionary: Optional vocabulary (list of words) to build the
            feature dict from. When None it is recomputed from the CSV via
            get_dataset() and extract_features(), which re-processes the whole
            corpus; pass a precomputed vocabulary to avoid that cost when
            predicting repeatedly.

    Raises:
        RuntimeError: If no classifier has been trained yet.
    """
    # Fail before the expensive vocabulary rebuild rather than after it.
    if classifier is None:
        raise RuntimeError("No trained classifier available; call train_data() first.")

    if word_dictionary is None:
        word_dictionary = extract_features(get_dataset()['Text'])

    tokens = set(preprocess_data(comment))
    features = {word: word in tokens for word in word_dictionary}
    print(f"Prediction: {classifier.classify(features)}")

def get_dataset():
    """Read the CSV at FILEPATH and return it as a pandas DataFrame."""
    return pd.read_csv(FILEPATH)

In [4]:
# Build the (features, sentiment) pairs from the CSV, then train the
# classifier, print its accuracy on the held-out part and pickle the model.
# The accuracy printed below varies between runs because the shuffle before
# the train/test split is not seeded here.
dataset = extract_dataset()
train_data(dataset)

Accuracy: 76.1437908496732%
