In [17]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Flatten
from tensorflow.keras import backend as K
from sklearn import metrics
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict

In [2]:
df = pd.read_csv('train_preprocess.tsv.txt', sep='\t', names=['tweet','HS'])

In [11]:
def cleansing(sent):
    string = sent.lower()
    string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
    string = re.sub("[,]", " ,", string)
    string = re.sub("[.]", " .", string)
    string = re.sub("[?]", " ? ", string)
    string = re.sub("[!]", " !", string)
    return string

In [16]:
open_tokenizer = open('tokenizer.p', 'rb')
tokenizer = pickle.load(open_tokenizer)

x = open("x_pad_sequences.p",'rb')
X = pickle.load(x)

y = open("y_labels.p",'rb')
Y = pickle.load(y)

In [17]:
df['text_clean'] = df.tweet.apply(cleansing)

In [19]:
neg = df.loc[df['HS'] == 'negative'].text_clean.tolist()
neu = df.loc[df['HS'] == 'neutral'].text_clean.tolist()
pos = df.loc[df['HS'] == 'positive'].text_clean.tolist()

neg_label = df.loc[df['HS'] == 'negative'].HS.tolist()
neu_label = df.loc[df['HS'] == 'neutral'].HS.tolist()
pos_label = df.loc[df['HS'] == 'positive'].HS.tolist()

In [24]:
total_data = pos + neu + neg
labels = pos_label + neu_label + neg_label

print("Pos: %s, Neu: %s, Neg: %s" % (len(pos), len(neu), len(neg)))
print("Total data: %s" % len(total_data))

Pos: 6416, Neu: 1148, Neg: 3436
Total data: 11000


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [34]:

embed_dim = 100
max_features = 100000
vocab_size = len(tokenizer.word_index)
maxlen = max(len(x) for x in X)

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), verbose=1, callbacks=[es])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 96, 100)           10000000  
                                                                 
 conv1d (Conv1D)             (None, 92, 128)           64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 3)                 33        
                                                                 
Total params: 10,065,451
Trainable params: 10,065,451
Non-trainable params: 0
__________________________________________

In [36]:
# predictions = model.predict(X_test)
# y_pred = predictions
# matrix_test = metrics.classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1))
# print("Testing selesai")
# print(matrix_test)

Testing selesai
              precision    recall  f1-score   support

           0       0.83      0.82      0.83       704
           1       0.79      0.82      0.80       222
           2       0.91      0.92      0.92      1274

    accuracy                           0.88      2200
   macro avg       0.85      0.85      0.85      2200
weighted avg       0.88      0.88      0.88      2200



In [39]:
# Cross Validation Percobaan 5
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5,random_state=42,shuffle=True)

accuracies = []

y = Y

embed_dim = 100

for iteration, data in enumerate(kf.split(X), start=1):

    data_train   = X[data[0]]
    target_train = y[data[0]]

    data_test    = X[data[1]]
    target_test  = y[data[1]]

    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=maxlen))
    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))

    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0)
    history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), verbose=0, callbacks=[es])

    predictions = model.predict(X_test)
    y_pred = predictions
    model.save("model_CNN{}.h5".format(iteration))

    # for the current fold only    
    accuracy = accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1))

    print("Training ke-", iteration)
    print(classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1)))
    print("======================================================")

    accuracies.append(accuracy)

# this is the average accuracy over all folds
average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", average_accuracy)

Training ke- 1
              precision    recall  f1-score   support

           0       0.86      0.80      0.83       704
           1       0.82      0.76      0.79       222
           2       0.90      0.94      0.92      1274

    accuracy                           0.88      2200
   macro avg       0.86      0.83      0.85      2200
weighted avg       0.88      0.88      0.88      2200

Training ke- 2
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       704
           1       0.81      0.79      0.80       222
           2       0.91      0.93      0.92      1274

    accuracy                           0.88      2200
   macro avg       0.85      0.84      0.85      2200
weighted avg       0.88      0.88      0.88      2200

Training ke- 3
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       704
           1       0.79      0.80      0.80       222
           2       0.92      0

In [None]:
# history.history
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

%matplotlib inline
plot_history(history)

In [30]:
import re 
from keras.models import load_model

input_text = """
Rasa syukur, cukup.
"""

def cleansing(sent):
    # Mengubah kata menjadi huruf kecil semua dengan menggunakan fungsi lower()
    string = sent.lower()
    # Menghapus emoticon dan tanda baca menggunakan "RegEx" dengan script di bawah
    string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
    return string

sentiment = ['negative', 'neutral', 'positive']

text = [cleansing(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

model = load_model('model_CNN5.h5')
prediction = model.predict(guess)
polarity = np.argmax(prediction[0])

print("Text: ",text[0])
print("Sentiment: ",sentiment[polarity])

Text:   rasa syukur  cukup  
Sentiment:  positive
