<a href="https://colab.research.google.com/github/parsa-abbasi/Basic-Sentiment-Analysis/blob/master/SentiPers/Classifier/NN/GoogleColab/LSTM_FastTextEmb/LSTM_FastText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get FastText Persian

In [0]:
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fa.vec

--2019-01-16 14:46:03--  https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fa.vec
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.28.41
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.28.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1105157170 (1.0G) [binary/octet-stream]
Saving to: ‘wiki.fa.vec’


2019-01-16 14:46:22 (54.8 MB/s) - ‘wiki.fa.vec’ saved [1105157170/1105157170]



In [0]:
!pip install hazm
!pip install stopwords_guilannlp

Installing collected packages: libwapiti, nltk, hazm
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed hazm-0.7.0 libwapiti-0.2.1 nltk-3.3
Collecting stopwords_guilannlp
  Downloading https://files.pythonhosted.org/packages/44/bc/a01c003b59a91187e89d11e73e8bb2834bb9ae6b36fe576a4b617c90bd23/stopwords_guilannlp-13.2019.3.5-py3-none-any.whl
Installing collected packages: stopwords-guilannlp
Successfully installed stopwords-guilannlp-13.2019.3.5


# Import Libraries

In [0]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from keras.metrics import categorical_accuracy
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import codecs
from stopwords_guilannlp import stopwords_output
from hazm import *

Using TensorFlow backend.


# File uploader

In [0]:
from google.colab import files
uploaded = files.upload()

Saving vocab.txt to vocab.txt
Saving x_test.csv to x_test.csv
Saving x_train.csv to x_train.csv
Saving y_test.csv to y_test.csv
Saving y_train.csv to y_train.csv



# Import Dataset

In [0]:
x_train = pd.Series.from_csv('x_train.csv', sep='\t')
x_test = pd.Series.from_csv('x_test.csv', sep='\t')
y_train = pd.Series.from_csv('y_train.csv', sep='\t', header=0)
y_test = pd.Series.from_csv('y_test.csv', sep='\t', header=0)

  infer_datetime_format=infer_datetime_format)


In [0]:
x_train = x_train.iloc[1:, ]
x_test = x_test.iloc[1:, ]

In [0]:
x_train.shape

(5561,)

In [0]:
x_test.shape

(1854,)

In [0]:
y_train.shape

(5561,)

In [0]:
y_test.shape

(1854,)

# Import Vocabulary

In [0]:
def load_doc(filename):
    file = codecs.open(filename, 'r', "utf8")
    text = file.read()
    file.close()
    return text

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print('The size of vocab : ', len(vocab))

The size of vocab :  2671


# Make FastText Model

In [0]:
EMBEDDING_FILE = 'wiki.fa.vec'
embed_size = 300

def import_with_gensim(file_address):
    # Creating the model
    ft_model = KeyedVectors.load_word2vec_format(file_address)
    # Getting the tokens
    ft_words = []
    for ft_word in ft_model.vocab:
        ft_words.append(ft_word)

    return ft_model, ft_words
  
model, words = import_with_gensim(EMBEDDING_FILE)

In [0]:
# We get the mean and standard deviation of the embedding weights so that we could maintain the
# same statistics for the rest of our own random generated weights.\
embedding_list = list()
for w in words:
    embedding_list.append(model[w])

all_embedding = np.stack(embedding_list)
emb_mean, emb_std = all_embedding.mean(), all_embedding.std()

In [0]:
stop_set = stopwords_output("Persian", "set")

# turn a doc into clean tokens
def clean_doc(doc, vocabulary):
    tokenized = word_tokenize(doc)  # Tokenize text
    tokens = [w for w in tokenized if not w in stop_set]    # Remove stop words
    tokens = [w for w in tokens if not len(w) <= 1]
    tokens = [w for w in tokens if not w.isdigit()]
    tokens = [w for w in tokens if w in vocabulary]
    tokens = ' '.join(tokens)
    return tokens


train_docs = list()
for document in x_train:
    train_docs.append(clean_doc(document, vocab))

In [0]:
num_words = 2500

# create the tokenizer
tokenizer = Tokenizer(num_words=num_words)

# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad sequences
max_length = max([len(s.split()) for s in train_docs])
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

test_docs = list()
for document in x_test:
    test_docs.append(clean_doc(document, vocab))

encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [0]:
# We are going to set the embedding size to the pre-trained dimension as we are replicating it
nb_words = len(tokenizer.word_index)

# the size will be Number of Words in Vocab X Embedding Size
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# With the newly created embedding matrix, we'll fill it up with the words that we have in both
# our own dictionary and loaded pre-trained embedding.
embeddedCount = 0
for word, i in tokenizer.word_index.items():
    i -= 1
    # then we see if this word is in glove's dictionary, if yes, get the corresponding weights
    if word in model.vocab:
        embedding_vector = model[word]
        # and store inside the embedding matrix that we will train later on.
        embedding_matrix[i] = embedding_vector
        embeddedCount += 1
    else:   # Unknown words
        embedding_vector = model['subdivision_name']
        # and store inside the embedding matrix that we will train later on.
        embedding_matrix[i] = embedding_vector
        embeddedCount += 1

print('total embedded:', embeddedCount, 'common words')
print('Embedding matrix shape:', embedding_matrix.shape)

total embedded: 2587 common words
Embedding matrix shape: (2587, 300)


# LSTM Model

In [0]:

categorical_y_train = to_categorical(y_train, 5)
categorical_y_test = to_categorical(y_test, 5)

inp = Input(shape=(max_length, ))
x = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(LSTM(300, return_sequences=True, name='lstm_layer', dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(900, activation="relu")(x)
x = Dense(600, activation="relu")(x)
x = Dense(300, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[categorical_accuracy])

model.summary()
batch_size = 32
epochs = 5
hist = model.fit(x_train, categorical_y_train, batch_size=batch_size, epochs=epochs)

loss, acc = model.evaluate(x_test, categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 183)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 183, 300)          776100    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 183, 600)          1442400   
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 600)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 900)               540900    
_________________________________________________________________
dense_2 (Dense)              (None, 600)               540600    
__________