In [None]:
import pandas as pd
import numpy as np
import re
import csv
import random
import tensorflow as tf

from matplotlib import pyplot as plt
from google.colab import files
from numpy import array
from tensorflow import keras
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Mount Google Drive at /content/drive so the CSV files read in the next cell
# (under /content/drive/MyDrive/...) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three input CSVs from Drive: the tweet dataset, the word-replacement
# ("kamusalay") dictionary and the stopword list. All are decoded as latin-1.
data = pd.read_csv('/content/drive/MyDrive/capstone/dataset_safe/data.csv', encoding='latin-1')
# header=None: no header row is consumed, so the columns are labelled 0 and 1
# until they are renamed below.
alay_dict = pd.read_csv('/content/drive/MyDrive/capstone/dataset_safe/new_kamusalay.csv', encoding='latin-1', header=None)
# NOTE(review): unlike alay_dict this is read WITHOUT header=None, so the first
# CSV row becomes the column name and no column is labelled 0. The rename of
# column 0 further down can then only be a no-op; `stopwords.stopword` (used in
# remove_stopword) works only if the file's header row is literally 'stopword'.
# Confirm against the file, or pass header=None here.
stopwords = pd.read_csv('/content/drive/MyDrive/capstone/dataset_safe/stopword.csv', encoding='latin-1')
# Give the positional columns readable names: 0 -> original word, 1 -> replacement.
alay_dict = alay_dict.rename(columns={0: 'original', 
                                      1: 'replacement'})
stopwords = stopwords.rename(columns={0: 'stopword'})

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# training results will vary between runs.
EMBEDDING_DIM = 64  # output width of the Embedding layer
MAXLEN = 300  # every token sequence is padded/truncated to this length
TRUNCATING = 'post'  # passed to pad_sequences(truncating=...)
PADDING = 'post'  # passed to pad_sequences(padding=...)
OOV_TOKEN = "<OOV>"  # passed to Tokenizer(oov_token=...)

In [None]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip tweet boilerplate from ``text`` and normalize its whitespace.

    Applied in order:
      1. newlines become spaces;
      2. the standalone tokens ``rt`` (retweet marker) and ``user`` are dropped;
      3. URLs (``www.…``, ``http://…``, ``https://…``) are dropped;
      4. digits are deleted;
      5. non-ASCII characters are replaced by ``?``;
      6. ``@mention`` / ``#hashtag`` tokens and any remaining ``scheme://…``
         strings are dropped;
      7. whitespace runs are collapsed and the result is trimmed.

    Matching is case-sensitive, so the caller is expected to lowercase first.

    Args:
        text: Input string.

    Returns:
        The cleaned string, single-space separated with no leading or
        trailing whitespace.
    """
    text = text.replace('\n', ' ')
    # Word boundaries keep 'rt' / 'user' inside other words intact
    # (e.g. "party", "superuser"); a bare substring match would mangle them.
    text = re.sub(r'\brt\b', ' ', text)
    text = re.sub(r'\buser\b', ' ', text)
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    text = re.sub(r'\d+', '', text)
    # 'replace' substitutes '?' for each non-ASCII character rather than deleting it.
    text = text.encode('ascii', 'replace').decode('ascii')
    text = re.sub(r'([@#][A-Za-z0-9]+)|(\w+://\S+)', ' ', text)
    # split()/join collapses every whitespace run and trims both ends.
    return ' '.join(text.split())

def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z, A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

def normalize_alay(text):
    """Swap each space-separated word for its ``alay_dict_map`` entry, if any.

    Words without an entry are kept as they are. The text is split on single
    spaces and re-joined with single spaces, so spacing is preserved.
    """
    replaced = []
    for token in text.split(' '):
        replaced.append(alay_dict_map.get(token, token))
    return ' '.join(replaced)

def remove_stopword(text):
    """Drop every word listed in ``stopwords.stopword`` from ``text``.

    The text is split on single spaces; surviving words are re-joined, runs
    of spaces are collapsed and the result is trimmed.

    Args:
        text: Space-separated input string.

    Returns:
        The string without stopwords, single-space separated and stripped.
    """
    # Build the lookup set once per call: testing `word in <ndarray>` scans the
    # whole stopword array for every single word.
    stopword_set = set(stopwords.stopword.values)
    text = ' '.join(word for word in text.split(' ') if word not in stopword_set)
    text = re.sub('  +', ' ', text)  # collapse runs of spaces
    return text.strip()

def preprocess(text):
    """Run the full cleaning pipeline on one raw text string.

    Steps, in order: lowercase; strip tweet boilerplate (URLs, mentions,
    hashtags, ``rt`` / ``user``, digits); replace remaining non-alphanumeric
    characters with spaces; apply the ``alay_dict_map`` replacements; drop
    stopwords.

    ``remove_unnecessary_char`` has to run BEFORE ``remove_nonaplhanumeric``:
    its URL / mention / hashtag patterns key on ``://``, ``.``, ``@`` and ``#``,
    and none of those characters survive the non-alphanumeric strip, so in the
    opposite order those patterns can never match.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    text = lowercase(text)                # 1
    text = remove_unnecessary_char(text)  # 2
    text = remove_nonaplhanumeric(text)   # 3
    text = normalize_alay(text)           # 4
    text = remove_stopword(text)          # 5
    return text

In [None]:
# Lookup table used by normalize_alay: 'original' word -> 'replacement' word.
# If an 'original' value occurs more than once, the last row wins.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))

In [None]:
# Keep only the text column and the HS label column. The explicit .copy()
# makes `data` an independent frame, so the column assignment below does not
# write into a slice of the originally loaded frame (SettingWithCopyWarning).
data = data[['Tweet', 'HS']].copy()
# Clean every tweet in place of the raw text.
# NOTE: re-running this cell applies preprocess to already-cleaned text;
# reload the CSV first if the raw tweets are needed again.
data['Tweet'] = data['Tweet'].apply(preprocess)

In [None]:
# Positional train/validation split: the first `train_size` rows are used for
# training and everything after them for validation.
# NOTE(review): the rows are not shuffled before slicing, so the split follows
# the CSV's row order — confirm the file is not sorted by label or time.
train_size = 10000
sentences = data['Tweet']
labels = data['HS']
    
train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

val_sentences = sentences[train_size:]
val_labels = labels[train_size:]

In [None]:
def fit_tokenizer(sentences, oov_token):
    """Create a Keras ``Tokenizer`` and fit its vocabulary on ``sentences``.

    Args:
        sentences: Iterable of text strings to build the vocabulary from.
        oov_token: Token passed to ``Tokenizer(oov_token=...)``.

    Returns:
        The fitted ``Tokenizer``.
    """
    fitted = Tokenizer(oov_token=oov_token)
    fitted.fit_on_texts(sentences)
    return fitted

In [None]:
# Fit the vocabulary on the training sentences only.
tokenizer = fit_tokenizer(train_sentences, OOV_TOKEN)
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
# number of entries in the word index
VOCAB_SIZE = len(word_index)

In [None]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """Turn texts into integer sequences of uniform length.

    Args:
        sentences: Iterable of text strings.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        padding: Forwarded to ``pad_sequences(padding=...)``.
        truncating: Forwarded to ``pad_sequences(truncating=...)``.
        maxlen: Forwarded to ``pad_sequences(maxlen=...)``.

    Returns:
        The padded/truncated result of ``pad_sequences``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )

In [None]:
# Encode and pad both splits with the same tokenizer and the same settings.
sentences_train_pad_trunc_seq = seq_pad_and_trunc(train_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)
sentences_val_pad_trunc_seq = seq_pad_and_trunc(val_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)

In [None]:
# Convert the padded sequences and the label Series to NumPy arrays for Keras.
training_padded, testing_padded = (
    np.array(padded)
    for padded in (sentences_train_pad_trunc_seq, sentences_val_pad_trunc_seq)
)
training_labels, testing_labels = (
    np.array(label_series) for label_series in (train_labels, val_labels)
)

In [None]:
# Report the size of the tokenizer's word index.
print("Total words", VOCAB_SIZE)

In [None]:
# All-zero matrix with one row per vocabulary index (plus one) and
# EMBEDDING_DIM columns.
# NOTE(review): nothing in the visible notebook reads this variable — the
# Embedding layer below is built without it. Looks like a leftover; confirm.
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE+1, EMBEDDING_DIM))

In [None]:
# Binary text classifier: embedding -> average over the sequence -> small
# dense head with dropout -> single sigmoid output.
model = tf.keras.Sequential([ 
        # VOCAB_SIZE+1 rows: one per word index plus index 0, the value
        # pad_sequences fills padding positions with.
        tf.keras.layers.Embedding(VOCAB_SIZE+1, EMBEDDING_DIM, input_length=MAXLEN),
        # Averages the embeddings over all MAXLEN positions. No masking is
        # configured, so padded positions are included in the average.
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        # One sigmoid unit: a single value in [0, 1] per input.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
model.summary()

In [None]:

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model and save the training history
# NOTE(review): the arrays passed as validation_data here are the same ones
# used for the final evaluation later, so there is no separate held-out test set.
history = model.fit(training_padded, training_labels, epochs=15, validation_data=(testing_padded, testing_labels))

In [None]:
# Evaluate on the validation arrays. `cek_akurasi` keeps the raw result of
# evaluate(); `loss` and `accuracy` are its two unpacked values.
cek_akurasi = model.evaluate(testing_padded, testing_labels)
loss, accuracy = cek_akurasi

In [None]:
# Evaluation: plot training vs. validation accuracy per epoch.

import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')

# Legend order follows the plot order above: the 'Test' curve is the
# val_accuracy series recorded during fit().
plt.legend(['Train', 'Test'], loc='lower right')
plt.show()

In [None]:
# Prediction helpers
def sentiment(score):
    """Map a model score to a label: below 0.5 is "POSITIF", otherwise "NEGATIVE"."""
    if score < 0.5:
        return "POSITIF"
    return "NEGATIVE"
def predict(sentences):
    """Classify one raw text string with the in-memory ``model``.

    The text goes through the same ``preprocess`` step and the same
    ``PADDING`` / ``TRUNCATING`` / ``MAXLEN`` settings that were applied to the
    training sentences, so inference inputs are built the same way as the
    data the model was fitted on. (``pad_sequences`` defaults to 'pre' for
    both padding and truncating when they are not passed.)

    Args:
        sentences: A single text string (despite the plural name).

    Returns:
        dict with "label" (result of ``sentiment``) and "score" (the model
        output as a Python float).
    """
    cleaned = preprocess(sentences)
    cek = pad_sequences(tokenizer.texts_to_sequences([cleaned]),
                        maxlen=MAXLEN, padding=PADDING, truncating=TRUNCATING)
    # One input row and a Dense(1) output give a (1, 1) array; index the scalar
    # explicitly instead of calling float() on a 1-element array.
    score = float(model.predict(cek)[0][0])
    return {"label": sentiment(score), "score": score}



In [None]:
# Call the prediction helper with an empty string.
predict("")

In [None]:
# Save the trained model in TensorFlow SavedModel format under export_dir
# (a path relative to the current working directory).
export_dir = 'saved_model/1'
tf.saved_model.save(model,export_dir)

In [None]:
# Convert the SavedModel written to export_dir into a TFLite model.
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
tflite_model = converter.convert()

In [None]:
import pathlib

# Write the converted model to ./model.tflite; `tflite_model` is passed to
# write_bytes, which returns the number of bytes written.
tflite_model_file = pathlib.Path('./model.tflite')
tflite_model_file.write_bytes(tflite_model)

In [None]:
# Load TFLite model and allocate tensors.
# The interpreter is built from the in-memory converted model, not from the
# file written to disk.
interpreter = tf.lite.Interpreter(model_content=tflite_model)
interpreter.allocate_tensors()

# Input/output tensor metadata of the converted model.
# NOTE(review): neither variable is used again in the visible notebook.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
# Download the .tflite file from the Colab runtime to the local machine.
# (`files` is already imported in the first cell; this import repeats it.)
from google.colab import files

files.download(tflite_model_file)

In [None]:
# Also save the Keras model as an .h5 file on Google Drive.
model.save('/content/drive/MyDrive/capstone/sentimen_model.h5')

In [None]:
## TESTING: reload the .h5 file saved above to check that it round-trips.
new_model = tf.keras.models.load_model("/content/drive/MyDrive/capstone/sentimen_model.h5")

In [None]:
# Print the reloaded model's architecture.
new_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 64)           1060800   
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dropout (Dropout)           (None, 24)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1,062,385
Trainable params: 1,062,385
Non-trainable params: 0
______________________________________________

In [None]:
# Evaluate the reloaded model on the validation arrays.
# Note: this rebinds `loss`, which an earlier cell set from model.evaluate().
loss, acc = new_model.evaluate(testing_padded, testing_labels)



In [None]:
# Loss of the reloaded model on the validation arrays.
print(loss)

0.4226473271846771


In [None]:
# Accuracy of the reloaded model on the validation arrays.
print(acc)

0.8400126099586487


In [None]:
# Prediction helpers (redefined here for the reloaded model)
def sentiment(nilai):
    """Map a model score to a label: below 0.5 is "POSITIF", otherwise "NEGATIVE"."""
    if nilai < 0.5:
        return "POSITIF"
    return "NEGATIVE"
def predict(sentences):
    """Classify one raw text string with the reloaded ``new_model``.

    This redefinition replaces the earlier ``predict`` that used ``model``.
    The text goes through the same ``preprocess`` step and the same
    ``PADDING`` / ``TRUNCATING`` / ``MAXLEN`` settings that were applied to the
    training sentences, so inference inputs are built the same way as the
    data the model was fitted on. (``pad_sequences`` defaults to 'pre' for
    both padding and truncating when they are not passed.)

    Args:
        sentences: A single text string (despite the plural name).

    Returns:
        dict with "label" (result of ``sentiment``) and "nilai" (the model
        output as a Python float).
    """
    cleaned = preprocess(sentences)
    cek = pad_sequences(tokenizer.texts_to_sequences([cleaned]),
                        maxlen=MAXLEN, padding=PADDING, truncating=TRUNCATING)
    # One input row and a Dense(1) output give a (1, 1) array; index the scalar
    # explicitly instead of calling float() on a 1-element array.
    nilai = float(new_model.predict(cek)[0][0])
    return {"label": sentiment(nilai), "nilai": nilai}

In [None]:
# Example prediction on a neutral self-introduction text.
predict("Halo, nama ku Lucky Anggari Kusumaningtias. Aku berasal dari Ngawi Jawa Timur. Yang sekarang sedang mengikuti MBKM Bangkit 2022. Salam Kenal")

{'label': 'POSITIF', 'nilai': 0.001206815242767334}

In [None]:
# Example prediction on an abusive text.
predict("""Cowonya di slebew aja pake matras.Hornkee with fitness mat? Go Ask your brain to control your kntl, goblok get some help""")

{'label': 'NEGATIVE', 'nilai': 0.5282399654388428}