## **LSTM**

In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df = pd.read_csv('/content/dataset_UG-01-01.csv')
df.shape

(2026, 2)

In [2]:
df.head()

Unnamed: 0,clean_english,sentimen_textblob
0,New DKI DKI SATPOL LOGE MO MUST,positive
1,Manggarai Water Gate Status Standby 2,neutral
2,Transjakarta Corridor Setop Operations,neutral
3,Anies must be the hands of optimal flood hooks,neutral
4,Jakut Social Sudin Ready to 500 Food Packages,positive


In [3]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
df['sentimen_textblob'] = enc.fit_transform(df['sentimen_textblob'])

In [4]:
df['sentimen_textblob'].unique()

array([2, 1, 0])

In [5]:
# from sklearn.preprocessing import MinMaxScaler

# sc = MinMaxScaler()
# df['sentimen_textblob'] = sc.fit_transform(df['sentimen_textblob'].values)

Text Preprocessing

In [6]:
# convert to lowercase
df['clean_english'] = df['clean_english'].str.lower()

In [7]:
# remove stopwords

#from nltk.corpus import stopwords #comment jika Error dan gunakan 2 sintaks dibawah
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:

stop = set(stopwords.words('english'))
df['clean_english'] = df['clean_english'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop)]))
df.head()

Unnamed: 0,clean_english,sentimen_textblob
0,new dki dki satpol loge mo must,2
1,manggarai water gate status standby 2,1
2,transjakarta corridor setop operations,1
3,anies must hands optimal flood hooks,1
4,jakut social sudin ready 500 food packages,2


Tokenize

In [9]:
vocab_size = 2026
oov_tok = "<OOV>"
filt = '!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ' #remove symbols

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok, filters = filt)
tokenizer.fit_on_texts(df['clean_english'].values)

word2index = tokenizer.word_index
print(len(word2index))

2624


In [10]:
import json

with open('word2index.json', 'w') as fp:
    json.dump(word2index, fp)

In [11]:
max_length =  max(len(values.split()) for i, values in enumerate(df['clean_english']))
max_length

16

In [12]:
trunc_type='post'

all_seq = tokenizer.texts_to_sequences(df['clean_english'].values)
all_padded = pad_sequences(all_seq, maxlen = max_length, padding = trunc_type)
all_padded.shape

(2026, 16)

In [13]:
# split train and test sets
from sklearn.model_selection import train_test_split

X = all_padded
#y = pd.get_dummies(df['label'].values)
y = df['sentimen_textblob']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#kalimat = df['sentence'].values
#y = df['label'].values

#kalimat_latih, kalimat_test, y_latih, y_test = train_test_split(kalimat, y, 
#                                                                test_size=0.2, random_state=1000)

(1620, 16) (1620,)
(406, 16) (406,)


In [14]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = vocab_size, output_dim=16),
    tf.keras.layers.LSTM(64),
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(128, activation='relu'),
    # tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          32416     
                                                                 
 lstm (LSTM)                 (None, 64)                20736     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 61,859
Trainable params: 61,859
Non-trainable params: 0
_________________________________________________________________


In [16]:
num_epochs = 30
history = model.fit(X_train, y_train, epochs=num_epochs, verbose=2, validation_data=(X_test, y_test))

Epoch 1/30
51/51 - 7s - loss: 0.8756 - accuracy: 0.6932 - val_loss: 0.7297 - val_accuracy: 0.7217 - 7s/epoch - 130ms/step
Epoch 2/30
51/51 - 0s - loss: 0.6528 - accuracy: 0.7340 - val_loss: 0.5225 - val_accuracy: 0.7956 - 422ms/epoch - 8ms/step
Epoch 3/30
51/51 - 0s - loss: 0.3472 - accuracy: 0.8704 - val_loss: 0.4542 - val_accuracy: 0.8424 - 456ms/epoch - 9ms/step
Epoch 4/30
51/51 - 0s - loss: 0.2054 - accuracy: 0.9142 - val_loss: 0.4506 - val_accuracy: 0.8719 - 446ms/epoch - 9ms/step
Epoch 5/30
51/51 - 0s - loss: 0.1578 - accuracy: 0.9198 - val_loss: 0.4756 - val_accuracy: 0.8793 - 442ms/epoch - 9ms/step
Epoch 6/30
51/51 - 0s - loss: 0.1251 - accuracy: 0.9247 - val_loss: 0.4736 - val_accuracy: 0.8867 - 437ms/epoch - 9ms/step
Epoch 7/30
51/51 - 0s - loss: 0.1052 - accuracy: 0.9438 - val_loss: 0.5639 - val_accuracy: 0.8621 - 420ms/epoch - 8ms/step
Epoch 8/30
51/51 - 0s - loss: 0.0877 - accuracy: 0.9698 - val_loss: 0.5732 - val_accuracy: 0.8719 - 423ms/epoch - 8ms/step
Epoch 9/30
51/51 

In [17]:
#def toSequence(sentence):
#  pad = []
#  for stc in sentence.split():
#    if stc.lower() in word2index.keys(): 
#      pad.append(word2index[stc.lower()])
#    else: 
#      continue
#  return pad

#pad = toSequence('affordable price and nice dessert')
#pad = [269, 353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0 ,0,0,0,0]
#len(pad)
#model.predict([pad])

Save Model


In [18]:
!pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-3.12.0-py3-none-any.whl (77 kB)
[?25l[K     |████▎                           | 10 kB 20.4 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 23.1 MB/s eta 0:00:01[K     |████████████▊                   | 30 kB 24.9 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 19.5 MB/s eta 0:00:01[K     |█████████████████████▏          | 51 kB 15.8 MB/s eta 0:00:01[K     |█████████████████████████▍      | 61 kB 10.9 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 71 kB 12.1 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 4.4 MB/s 
Installing collected packages: tensorflowjs
Successfully installed tensorflowjs-3.12.0


In [19]:
saved_model_path = '/content/mymodel/'
tf.saved_model.save(model, saved_model_path)



INFO:tensorflow:Assets written to: /content/mymodel/assets


INFO:tensorflow:Assets written to: /content/mymodel/assets


In [20]:
!tensorflowjs_converter \
  --input_format=tf_saved_model \
  /content/mymodel/ \
  /content/modeltfjs

2021-12-13 13:15:51.118107: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Writing weight file /content/modeltfjs/model.json...
