In [1]:
#API
from flask import Flask, jsonify
from flask import request
from flasgger import Swagger, LazyString, LazyJSONEncoder
from flasgger import swag_from

#Cleansing
import re 
import pandas as pd
import numpy as np
import demoji
import pickle

#Stemmer
from pathlib import Path
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords

#Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

#NLTK
nltk.download('stopwords')
nltk.download('punkt')

#CallFile
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

id_stopword_dict = pd.read_csv('stopwordbahasa.csv', header=None)
id_stopword_dict = id_stopword_dict.rename(columns={0: 'stopword'})

df_kbbi = pd.read_csv('new_kamusalay.csv', header=None, encoding='ISO-8859-1', names=['TIDAKBAKU', 'BAKU'])

#defination of data test, train, and validation
df_train = pd.read_csv("train_preprocess.txt", delimiter = "\t", header=None)
df_valid = pd.read_csv("train_preprocess.txt", delimiter = "\t", header=None)
df_test = pd.read_csv("train_preprocess.txt", delimiter = "\t", header=None)
df = df_train._append(df_valid, ignore_index=True)

df.columns =['text', 'label']
sentiment = ['negative', 'neutral', 'positive']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hivan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def remove_emoji(text):
    dem = demoji.findall(text)
    for demoj in dem.keys():
        text = text.replace(demoj, '')
    return text

def lowercase(text):
    return text.lower()

def removechars(text):
    text = re.sub(r'[^\w]', ' ', text)
    text = re.sub('rt',' ',text) 
    text = re.sub('user',' ',text) 
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text)
    text = re.sub('xf0',' ',text) 
    text = re.sub('x9f',' ',text) 
    text = re.sub('x98',' ',text) 
    text = re.sub('x82',' ',text) 
    text = re.sub('x84',' ',text) 
    text = re.sub('x86',' ',text) 
    text = re.sub('x8f',' ',text) 
    text = re.sub('xa4',' ',text)
    text = re.sub('xa2',' ',text)
    text = re.sub('x8b',' ',text)
    return text

def changealay(text):
    alay = dict(zip(df_kbbi['TIDAKBAKU'], df_kbbi['BAKU']))
    text = ' '.join([alay[word] if word in alay else word for word in text.split(' ')])
    return text


##################################################################
def remove_stopword(text):
    text = ' '.join(['' if word in id_stopword_dict.stopword.values else word for word in text.split(' ')])
    text = re.sub('  +', ' ', text) # Remove extra spaces
    text = text.strip()
    return text

def stemming(text):
    return stemmer.stem(text)

def cleaning(text):
    text = remove_emoji(text)
    text = removechars(text)
    text = lowercase(text)
    text = changealay(text)
    text = stemming(text) 
    text = remove_stopword(text)
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

In [3]:
df = pd.read_csv("train_preprocess.txt", delimiter = "\t", header=None)
df.columns =['text', 'label']
sentiment = ['negative', 'neutral', 'positive']

In [4]:
df

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [5]:
# tfidf_vect = TfidfVectorizer()
# tfidf_vect.fit(df['text'])

# x = tfidf_vect.transform(df['text'])
# print('Feature Extraction TFIDF Selesai')

In [6]:
# pickle.dump(tfidf_vect, open('tfidf_vect.pkl', 'wb'))
# classes = df['label']
# x_train, x_test, y_train, y_test = train_test_split(x, classes, test_size = 0.2, stratify=classes)
# model = MLPClassifier()
# model.fit(x_train, y_train)

# print ('Training selesai')

In [7]:
# pickle.dump(model, open('model.p', 'wb'))
# test = model.predict(x_test)

# print ("Testing selesai")

# print(classification_report(y_test, test))

In [8]:
# original_text =  '''
# ulama sehat
# '''

# # Feature Extraction
# text = tfidf_vect.transform([cleaning(original_text)])

# # Kita prediksi sentimennya
# result = model.predict(text)[0]
# print("Sentiment:")
# print()
# print(result)

In [9]:
# model.predict_proba(text)

In [10]:
# # Untuk lebih menyakinkan lagi, kita juga bisa melakukan "Cross Validation"

# kf = KFold(n_splits=5,random_state=42,shuffle=True)

# accuracies = []

# y = classes

# for iteration, data in enumerate(kf.split(x), start=1):

#     data_train   = x[data[0]]
#     target_train = y[data[0]]

#     data_test    = x[data[1]]
#     target_test  = y[data[1]]

#     clf = MultinomialNB()
#     # clf = svm.SVC(gamma=0.01, C=100., probability=True)
#     clf.fit(data_train,target_train)

#     preds = clf.predict(data_test)

#     # for the current fold only    
#     accuracy = accuracy_score(target_test,preds)

#     print("Training ke-", iteration)
#     print(classification_report(target_test,preds))
#     print("======================================================")

#     accuracies.append(accuracy)

# # this is the average accuracy over all folds
# average_accuracy = np.mean(accuracies)

# print()
# print()
# print()
# print("Rata-rata Accuracy: ", average_accuracy)

In [11]:
df['text'] = df['text'].apply(cleaning)

In [12]:
df

Unnamed: 0,text,label
0,warung milik usaha pabrik puluh kenal putih ba...,positive
1,mohon ulama lurus k212 mmbri hujjah ai diwlh s...,neutral
2,lokasi strategis jalan sumatra bandung nya nya...,positive
3,betapa bahagia nya unboxing paket barang nya b...,positive
4,aduh mahasiswa sombong kasih kakak kuning ajar...,negative
...,...,...
10995,kecewa,positive
10996,enak masakan nya kepiting senang pilih kepitin...,positive
10997,hormat ai ai koalisi,neutral
10998,pagi pagi tol pasteur macet parah bikin jengkel,negative


In [13]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(df['text'])

x = tfidf_vect.transform(df['text'])
print('Feature Extraction TFIDF Selesai')

Feature Extraction TFIDF Selesai


In [14]:
pickle.dump(tfidf_vect, open('tfidf_vect.pkl', 'wb'))
classes = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, classes, test_size = 0.2, stratify=classes)
model = MLPClassifier()
model.fit(x_train, y_train)

print ('Training selesai')

Training selesai


In [15]:
classes

0        positive
1         neutral
2        positive
3        positive
4        negative
           ...   
10995    positive
10996    positive
10997     neutral
10998    negative
10999    positive
Name: label, Length: 11000, dtype: object

In [16]:
pickle.dump(model, open('model.p', 'wb'))
test = model.predict(x_test)

print ("Testing selesai")

print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

    negative       0.68      0.73      0.70       687
     neutral       0.75      0.49      0.59       230
    positive       0.84      0.86      0.85      1283

    accuracy                           0.78      2200
   macro avg       0.76      0.69      0.71      2200
weighted avg       0.78      0.78      0.78      2200



In [17]:
original_text =  '''
ulama sakit
'''

# Feature Extraction
count_vect = TfidfVectorizer()
text = tfidf_vect.transform([cleaning(original_text)])

# Kita prediksi sentimennya
result = model.predict(text)[0]
print("Sentiment:")
print()
print(result)

Sentiment:

positive


In [18]:
model.predict_proba(text)

array([[7.50499959e-03, 6.05021573e-06, 9.92488950e-01]])

In [19]:
# Untuk lebih menyakinkan lagi, kita juga bisa melakukan "Cross Validation"

kf = KFold(n_splits=5,random_state=42,shuffle=True)

accuracies = []

y = classes

for iteration, data in enumerate(kf.split(x), start=1):

    data_train   = x[data[0]]
    target_train = y[data[0]]

    data_test    = x[data[1]]
    target_test  = y[data[1]]

    clf = MLPClassifier()
    # clf = svm.SVC(gamma=0.01, C=100., probability=True)
    clf.fit(data_train,target_train)

    preds = clf.predict(data_test)

    # for the current fold only    
    accuracy = accuracy_score(target_test,preds)

    print("Training ke-", iteration)
    print(classification_report(target_test,preds))
    print("======================================================")

    accuracies.append(accuracy)

# this is the average accuracy over all folds
average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", average_accuracy)

Training ke- 1
              precision    recall  f1-score   support

    negative       0.72      0.74      0.73       680
     neutral       0.74      0.54      0.63       239
    positive       0.85      0.88      0.86      1281

    accuracy                           0.80      2200
   macro avg       0.77      0.72      0.74      2200
weighted avg       0.80      0.80      0.80      2200

Training ke- 2
              precision    recall  f1-score   support

    negative       0.71      0.71      0.71       706
     neutral       0.70      0.55      0.62       220
    positive       0.83      0.86      0.85      1274

    accuracy                           0.78      2200
   macro avg       0.75      0.71      0.72      2200
weighted avg       0.78      0.78      0.78      2200

Training ke- 3
              precision    recall  f1-score   support

    negative       0.70      0.74      0.72       682
     neutral       0.78      0.55      0.64       215
    positive       0.85      0