<a href="https://colab.research.google.com/github/parsa-abbasi/Basic-Sentiment-Analysis/blob/master/SentiPers/Classifier/NN/GoogleColab/CNN_FastText/CNN_FastText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get FastText Persian

In [1]:
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fa.vec

--2019-01-21 17:21:12--  https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fa.vec
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.112.24
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.112.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1105157170 (1.0G) [binary/octet-stream]
Saving to: ‘wiki.fa.vec’


2019-01-21 17:21:27 (73.6 MB/s) - ‘wiki.fa.vec’ saved [1105157170/1105157170]



In [2]:
!pip install hazm
!pip install stopwords_guilannlp

Collecting hazm
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K    100% |████████████████████████████████| 317kB 26.3MB/s 
[?25hCollecting nltk==3.3 (from hazm)
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 19.5MB/s 
[?25hCollecting libwapiti>=0.2.1; platform_system != "Windows" (from hazm)
[?25l  Downloading https://files.pythonhosted.org/packages/bc/0f/1c9b49bb49821b5856a64ea6fac8d96a619b9f291d1f06999ea98a32c89c/libwapiti-0.2.1.tar.gz (233kB)
[K    100% |████████████████████████████████| 235kB 28.4MB/s 
Building wheels for collected packages: nltk, libwapiti
  Running setup.py bdist_wheel for nltk ... [?25l- \ | / - done
[?25h  Stored in directory: /root/.cache/pip/wheels/d1/ab/40/3bceea46922767e4

# Import Libraries

In [3]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Convolution1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.models import Model
from keras import optimizers
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from keras.metrics import categorical_accuracy
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import codecs
from stopwords_guilannlp import stopwords_output
from hazm import *

Using TensorFlow backend.


# File uploader

In [4]:
from google.colab import files
uploaded = files.upload()

Saving vocab.txt to vocab.txt
Saving x_test.csv to x_test.csv
Saving x_train.csv to x_train.csv
Saving y_test.csv to y_test.csv
Saving y_train.csv to y_train.csv



# Import Dataset

In [5]:
x_train = pd.Series.from_csv('x_train.csv', sep='\t')
x_test = pd.Series.from_csv('x_test.csv', sep='\t')
y_train = pd.Series.from_csv('y_train.csv', sep='\t', header=0)
y_test = pd.Series.from_csv('y_test.csv', sep='\t', header=0)

  infer_datetime_format=infer_datetime_format)


In [0]:
x_train = x_train.iloc[1:, ]
x_test = x_test.iloc[1:, ]

In [14]:
print('x_train shape: ', x_train.shape)
print('x_test shape: ', x_test.shape)
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)

x_train shape:  (5561,)
x_test shape:  (1854,)
y_train shape:  (5561,)
y_test shape:  (1854,)


# Import Vocabulary

In [15]:
def load_doc(filename):
    file = codecs.open(filename, 'r', "utf8")
    text = file.read()
    file.close()
    return text

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print('The size of vocab : ', len(vocab))

The size of vocab :  2671


# Make FastText Model

In [0]:
EMBEDDING_FILE = 'wiki.fa.vec'

def import_with_gensim(file_address):
    # Creating the model
    ft_model = KeyedVectors.load_word2vec_format(file_address)
    # Getting the tokens
    ft_words = []
    for ft_word in ft_model.vocab:
        ft_words.append(ft_word)

    return ft_model, ft_words
  
model, words = import_with_gensim(EMBEDDING_FILE)

In [0]:
embed_size = 300

In [0]:
# We get the mean and standard deviation of the embedding weights so that we could maintain the
# same statistics for the rest of our own random generated weights.\
embedding_list = list()
for w in words:
    embedding_list.append(model[w])

all_embedding = np.stack(embedding_list)
emb_mean, emb_std = all_embedding.mean(), all_embedding.std()

In [0]:
stop_set = stopwords_output("Persian", "set")
stop_useful = ['سلام', 'دیجی', 'کالا']
useful_set = ['خوب', 'بد', 'کاملا', 'کاملاً', 'بسیار', 'واقعا', 'واقعاً', 'فوق', 'بخش', 'طرفدارترین', 'نیست', 'هست']
puncs = ['،', '.', ',', ':', ';']

for word in stop_useful:
  if word not in stop_set:
    stop_set.add(word)

for word in useful_set:
  if word in stop_set:
    stop_set.remove(word)
  if word not in vocab:
    vocab.add(word)

# turn a doc into clean tokens
def clean_doc(doc, vocabulary):
    tokenized = word_tokenize(doc)  # Tokenize text
    tokens = []
    for t in tokenized:
      temp = t
      for p in puncs:
        temp = temp.replace(p, '')
      tokens.append(temp)
    tokens = [w for w in tokens if not w in stop_set]    # Remove stop words
    tokens = [w for w in tokens if not len(w) <= 1]
    tokens = [w for w in tokens if not w.isdigit()]
    tokens = [w for w in tokens if w in vocabulary]
    tokens = ' '.join(tokens)
    return tokens


train_docs = list()
for document in x_train:
    train_docs.append(clean_doc(document, vocab))

In [0]:
num_words = 2500

# create the tokenizer
tokenizer = Tokenizer(num_words=num_words)

# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad sequences
max_length = max([len(s.split()) for s in train_docs])
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

test_docs = list()
for document in x_test:
    test_docs.append(clean_doc(document, vocab))

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [21]:
# We are going to set the embedding size to the pre-trained dimension as we are replicating it
nb_words = len(tokenizer.word_index)

# the size will be Number of Words in Vocab X Embedding Size
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# With the newly created embedding matrix, we'll fill it up with the words that we have in both
# our own dictionary and loaded pre-trained embedding.
embeddedCount = 0
for word, i in tokenizer.word_index.items():
    i -= 1
    # then we see if this word is in glove's dictionary, if yes, get the corresponding weights
    if word in model.vocab:
        embedding_vector = model[word]
        # and store inside the embedding matrix that we will train later on.
        embedding_matrix[i] = embedding_vector
        embeddedCount += 1
    else:   # Unknown words
        embedding_vector = model['subdivision_name']
        # and store inside the embedding matrix that we will train later on.
        embedding_matrix[i] = embedding_vector
        embeddedCount += 1

print('total embedded:', embeddedCount, 'common words')
print('Embedding matrix shape:', embedding_matrix.shape)

total embedded: 2587 common words
Embedding matrix shape: (2587, 300)


In [0]:
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


# CNN Model

In [0]:
categorical_y_train = to_categorical(y_train, 5)
categorical_y_test = to_categorical(y_test, 5)

In [31]:
inp = Input(shape=(max_length, ))
x = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(inp)
x = Conv1D(filters=64, kernel_size=4, activation='relu', padding='same')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Conv1D(filters=64, kernel_size=8, activation='relu', padding='same')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Conv1D(filters=64, kernel_size=16, activation='relu', padding='same')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
x = Dense(500, activation="sigmoid")(x)
x = Dense(5, activation="softmax")(x)

# optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
# optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
# optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=[categorical_accuracy])

model.summary()
batch_size = 64
epochs = 10
hist = model.fit(x_train, categorical_y_train, batch_size=batch_size, epochs=epochs)

loss, acc = model.evaluate(x_test, categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 181)               0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 181, 300)          776100    
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 181, 64)           76864     
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 90, 64)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 90, 64)            32832     
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 45, 64)            0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 45, 64)            65600     
__________