<a href="https://colab.research.google.com/github/parsa-abbasi/Basic-Sentiment-Analysis/blob/master/SentiPers/Classifier/NN/GoogleColab/LSTM_KerasEmb/LSTM_KerasEmb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install hazm
!pip install stopwords_guilannlp



# Import Libraries

In [0]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Convolution1D
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.models import Model
from keras import optimizers
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from keras.metrics import categorical_accuracy
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import codecs
from stopwords_guilannlp import stopwords_output
from hazm import *

Using TensorFlow backend.


# File uploader

In [0]:
# Colab-specific step: prompt for the data files and save them under their own
# names in the working directory (see the "Saving ... to ..." lines in this
# cell's output). Later cells read vocab.txt and the x/y train/test CSVs by
# relative file name.
from google.colab import files
uploaded = files.upload()

Saving vocab.txt to vocab.txt
Saving x_test.csv to x_test.csv
Saving x_train.csv to x_train.csv
Saving y_test.csv to y_test.csv
Saving y_train.csv to y_train.csv



# Import Dataset

In [0]:
def read_series(path, header=None):
    """Load a tab-separated file as a Series, using the first column as index.

    Stand-in for ``pd.Series.from_csv``, which was deprecated in pandas 0.21
    and removed in pandas 1.0. Unlike ``from_csv``, the index is not run
    through date parsing; none of the cells visible here read the index.

    Parameters
    ----------
    path : str
        File to read.
    header : int or None
        Row number that holds the column names. With ``None`` the file is read
        as if it had no header line, so a header line (if present) becomes the
        first data row.

    Returns
    -------
    pandas.Series
        The first non-index column of the file.
    """
    frame = pd.read_csv(path, sep='\t', header=header, index_col=0)
    series = frame.iloc[:, 0]
    if header is None:
        # from_csv dropped the auto-generated labels when no header was given.
        series.index.name = series.name = None
    return series


x_train = read_series('x_train.csv')
x_test = read_series('x_test.csv')
y_train = read_series('y_train.csv', header=0)
y_test = read_series('y_test.csv', header=0)

  infer_datetime_format=infer_datetime_format)


In [0]:
# Discard the first entry of each text series.
# NOTE(review): presumably this is the header line, which ends up as a data row
# because the x files are loaded without header=0 -- confirm against the CSVs.
# Re-running this cell drops one more row each time.
x_train = x_train.iloc[1:]
x_test = x_test.iloc[1:]

In [0]:
x_train.shape

(5561,)

In [0]:
x_test.shape

(1854,)

In [0]:
y_train.shape

(5561,)

In [0]:
y_test.shape

(1854,)

# Import Vocabulary

In [0]:
def load_doc(filename):
    """Read a UTF-8 encoded text file and return its whole content as one string.

    The file is opened inside a ``with`` block so the handle is closed even
    when reading or decoding raises.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    str
        The decoded file content.
    """
    with codecs.open(filename, 'r', "utf8") as handle:
        return handle.read()

# The vocabulary is the set of distinct whitespace-separated entries in vocab.txt.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
print('The size of vocab : ', len(vocab))

The size of vocab :  2671


# Vectorize

In [0]:
stop_set = stopwords_output("Persian", "set")
# Extra words to treat as stop words.
stop_useful = ['سلام', 'دیجی', 'کالا']
# Words that must never be filtered out: removed from the stop words and
# forced into the vocabulary.
useful_set = ['خوب', 'بد', 'کاملا', 'کاملاً', 'بسیار', 'واقعا', 'واقعاً', 'فوق', 'بخش', 'طرفدارترین', 'نیست', 'هست']
# Punctuation marks stripped from every token by clean_doc.
puncs = ['،', '.', ',', ':', ';']

stop_set.update(stop_useful)
stop_set.difference_update(useful_set)
vocab.update(useful_set)

# turn a doc into clean tokens
def clean_doc(doc, vocabulary):
    """Tokenize ``doc`` and return the surviving tokens joined by single spaces.

    Every string listed in the module-level ``puncs`` is removed from each
    token. A token is then kept only if it is not in ``stop_set``, is longer
    than one character, is not made up solely of digits, and is a member of
    ``vocabulary``.
    """
    def strip_puncs(token):
        # Remove each punctuation mark wherever it occurs in the token.
        for mark in puncs:
            token = token.replace(mark, '')
        return token

    kept = []
    for raw in word_tokenize(doc):
        word = strip_puncs(raw)
        if word in stop_set or len(word) <= 1:
            continue
        if word.isdigit() or word not in vocabulary:
            continue
        kept.append(word)
    return ' '.join(kept)


# Clean every training document with the shared vocabulary.
train_docs = [clean_doc(document, vocab) for document in x_train]

In [0]:
num_words = 2500

# Build the tokenizer and fit it on the cleaned training documents only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_docs)

# Vocabulary size: one more than the number of distinct words the tokenizer indexed.
vocab_size = len(tokenizer.word_index) + 1

# The longest cleaned training document (counted in whitespace-separated
# tokens) fixes the padded sequence length.
max_length = max(len(s.split()) for s in train_docs)

# Integer-encode the training documents and pad them at the end.
encoded_docs = tokenizer.texts_to_sequences(train_docs)
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Clean the test documents the same way as the training ones.
test_docs = [clean_doc(document, vocab) for document in x_test]

In [0]:
# Encode the cleaned test documents with the tokenizer fitted on the training
# documents, then pad them to the same length as the training sequences.
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)


# LSTM Model

In [0]:
# One-hot encode the labels over 5 classes.
categorical_y_train = to_categorical(y_train, 5)
categorical_y_test = to_categorical(y_test, 5)

# Embedding -> bidirectional LSTM -> max over time steps -> dense head with softmax.
model = Sequential([
    Embedding(vocab_size, 2000, input_length=max_length),
    Bidirectional(LSTM(300, return_sequences=True, name='lstm_layer', dropout=0.1, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(300, activation="relu"),
    Dropout(0.1),
    Dense(5, activation='softmax'),
])

# Other optimizer configurations, kept for reference:
# optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
# optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
# optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[categorical_accuracy],
)
model.summary()

# Train on the padded training sequences.
batch_size = 64
epochs = 6
hist = model.fit(x_train, categorical_y_train, batch_size=batch_size, epochs=epochs)

# Report accuracy on the test split as a percentage.
loss, acc = model.evaluate(x_test, categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 181, 2000)         5176000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 181, 600)          5522400   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 600)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_67 (Dense)             (None, 300)               180300    
_________________________________________________________________
dropout_7 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_68 (Dense)             (None, 5)                 1505      
Total para