<a href="https://colab.research.google.com/github/parsa-abbasi/Basic-Sentiment-Analysis/blob/master/SentiPers/Classifier/NN/GoogleColab/CNN_KerasEmb/CNN_KerasEmb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install hazm
!pip install stopwords_guilannlp



# Import Libraries

In [0]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Convolution1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.models import Model
from keras import optimizers
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from keras.metrics import categorical_accuracy
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import codecs
from stopwords_guilannlp import stopwords_output
from hazm import *

# File uploader

In [3]:
# Colab-only step: upload the data files used by the following cells
# (the recorded output lists vocab.txt, x_train.csv, x_test.csv,
# y_train.csv and y_test.csv).
# NOTE(review): the recorded output shows a repeated upload being saved under
# a " (1)" suffix (e.g. "x_train (1).csv"), while later cells read the
# unsuffixed names -- so a re-upload does NOT replace the files that are read.
from google.colab import files
uploaded = files.upload()

Saving vocab.txt to vocab (1).txt
Saving x_test.csv to x_test (1).csv
Saving x_train.csv to x_train (1).csv
Saving y_test.csv to y_test (1).csv
Saving y_train.csv to y_train (1).csv



# Import Dataset

In [4]:
def read_series(path, header=None):
    """Load a tab-separated two-column file as a pandas Series.

    Replacement for the deprecated ``pd.Series.from_csv`` (removed in
    pandas 1.0): the first column becomes the index and the second column
    becomes the values.

    Args:
        path: Path of the tab-separated file.
        header: Row number to use as the header, or ``None`` when the file
            should be read without a header row (the ``from_csv`` default).

    Returns:
        A Series indexed by the file's first column.
    """
    frame = pd.read_csv(path, sep='\t', header=header, index_col=0)
    series = frame.iloc[:, 0]
    if header is None:
        # from_csv cleared both names when no header row was requested.
        series.index.name = series.name = None
    # NOTE(review): from_csv also attempted to parse the index as dates; that
    # is intentionally not reproduced here -- confirm the index is a plain row
    # label in these files.
    return series


# The x files are read without a header row (the first row is dropped in the
# next cell); the y files use their first row as the header.
x_train = read_series('x_train.csv')
x_test = read_series('x_test.csv')
y_train = read_series('y_train.csv', header=0)
y_test = read_series('y_test.csv', header=0)

  infer_datetime_format=infer_datetime_format)


In [0]:
# Discard the first entry of each text Series (presumably the CSV header line,
# since the x files were read without a header -- TODO confirm).
# NOTE: this cell is not idempotent; every re-run drops one more row.
x_train = x_train.iloc[1:]
x_test = x_test.iloc[1:]

In [6]:
# Sanity check: number of training documents.
x_train.shape

(5561,)

In [7]:
# Sanity check: number of test documents.
x_test.shape

(1854,)

In [8]:
# Sanity check: number of training labels (should match x_train.shape).
y_train.shape

(5561,)

In [9]:
# Sanity check: number of test labels (should match x_test.shape).
y_test.shape

(1854,)

# Import Vocabulary

In [10]:
def load_doc(filename):
    """Read a UTF-8 encoded text file and return its whole content.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded content of the file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error), which the manual close() did not.
    with codecs.open(filename, 'r', "utf8") as handle:
        return handle.read()

# Build the vocabulary as the set of distinct whitespace-separated tokens
# found in the vocabulary file.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
print('The size of vocab : ', len(vocab))

The size of vocab :  2671


# Vectorize

In [0]:
# Base Persian stop-word collection supplied by stopwords_guilannlp.
stop_set = stopwords_output("Persian", "set")

# Extra words that are treated as stop words.
stop_useful = ['سلام', 'دیجی', 'کالا']
# Words that must never be filtered out: they are removed from the stop words
# and added to the vocabulary.
useful_set = ['خوب', 'بد', 'کاملا', 'کاملاً', 'بسیار', 'واقعا', 'واقعاً', 'فوق', 'بخش', 'طرفدارترین', 'نیست', 'هست']
# Punctuation characters stripped from every token by clean_doc.
puncs = ['،', '.', ',', ':', ';']

# Bulk set operations: add the extra stop words, then whitelist the useful
# words in both collections.
stop_set.update(stop_useful)
stop_set.difference_update(useful_set)
vocab.update(useful_set)

def clean_doc(doc, vocabulary):
    """Tokenize a document and return its retained tokens joined by spaces.

    Every token first has the characters listed in the notebook-level
    ``puncs`` removed. A token is retained only when it is not in the
    notebook-level ``stop_set``, is longer than one character, does not
    consist solely of digits, and is a member of ``vocabulary``.

    Args:
        doc: Raw text of one document.
        vocabulary: Collection of tokens allowed in the result.

    Returns:
        The retained tokens, in their original order, separated by single
        spaces.
    """
    kept = []
    for raw_token in word_tokenize(doc):
        token = raw_token
        for mark in puncs:
            token = token.replace(mark, '')
        # Reject stop words, empty/one-character tokens and pure numbers.
        if token in stop_set or len(token) <= 1 or token.isdigit():
            continue
        if token in vocabulary:
            kept.append(token)
    return ' '.join(kept)


# Clean every training document against the vocabulary.
train_docs = [clean_doc(document, vocab) for document in x_train]

In [0]:
# Word cap handed to the Keras Tokenizer.
num_words = 2500

# Fit the tokenizer on the cleaned training documents only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_docs)

# Integer-encode the training documents.
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# Pad at the end of each sequence up to the longest cleaned training document.
# The length is measured in whitespace-separated tokens, before encoding.
max_length = max(len(doc.split()) for doc in train_docs)
# NOTE: x_train is rebound here from the raw text Series to the padded integer
# matrix, so re-running the cleaning cell above afterwards would feed integer
# rows to clean_doc.
x_train = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Clean the test documents the same way; they are encoded in the next cell.
test_docs = [clean_doc(document, vocab) for document in x_test]

# Vocabulary size for the embedding layers: number of words in the
# tokenizer's word index, plus one.
vocab_size = len(tokenizer.word_index) + 1

In [0]:
# Encode the cleaned test documents with the tokenizer fitted on the training
# data, then pad them (at the end) to the same max_length as the training set.
# Note: encoded_docs is rebound here and no longer holds the training sequences.
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


# CNN Model

In [0]:
# Convert the label vectors to categorical matrices with 5 classes, matching
# the 5-unit softmax output layer of the models below.
categorical_y_train = to_categorical(y_train, num_classes=5)
categorical_y_test = to_categorical(y_test, num_classes=5)

## Model 1

In [15]:
# Model 1: embedding -> one convolution -> max pooling -> two dense layers
# -> 5-way softmax, declared as a single layer list.
model = Sequential([
    Embedding(vocab_size, 2000, input_length=max_length),
    Conv1D(filters=64, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(500, activation='sigmoid'),
    Dense(500, activation='sigmoid'),
    Dense(5, activation='softmax'),
])

# Alternative optimizer configurations that can be swapped in for 'adam':
# optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
# optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
# optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[categorical_accuracy],
)
model.summary()

# Training hyper-parameters.
batch_size = 16
epochs = 12
hist = model.fit(
    x_train,
    categorical_y_train,
    batch_size=batch_size,
    epochs=epochs,
)

# Report accuracy on the held-out test set as a percentage.
loss, acc = model.evaluate(x_test, categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc * 100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 181, 2000)         5176000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 174, 64)           1024064   
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 87, 64)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 5568)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               2784500   
_________________________________________________________________
dense_2 (Dense)              (None, 500)               250500    
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 2505      
Total para

## Model 2

In [19]:
# Model 2: three stacked 'same'-padded convolutions with growing kernel sizes,
# global max pooling, dropout and a dense head, built with the functional API.
inp = Input(shape=(max_length, ))

# Apply the layers in order; x always holds the output of the latest layer.
x = inp
for layer in [
    Embedding(vocab_size, 2000, input_length=max_length),
    Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=8, activation='relu', padding='same'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=16, activation='relu', padding='same'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(500, activation="sigmoid"),
    Dense(5, activation="softmax"),
]:
    x = layer(x)

# Alternative optimizer configurations:
# optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
# optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
# optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=[categorical_accuracy],
)
model.summary()

# Training hyper-parameters.
batch_size = 64
epochs = 10
hist = model.fit(
    x_train,
    categorical_y_train,
    batch_size=batch_size,
    epochs=epochs,
)

# Report accuracy on the held-out test set as a percentage.
loss, acc = model.evaluate(x_test, categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc * 100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 181)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 181, 2000)         5176000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 181, 64)           512064    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 90, 64)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 90, 64)            32832     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 45, 64)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 45, 64)            65600     
__________

## Model 3 (A Multi-Channel Convolutional Neural Network)
 [See More](https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/)

In [27]:
def build_channel(kernel_size):
    """Build one input branch of the multi-channel network.

    The branch is Input -> Embedding -> Conv1D (32 filters, the given kernel
    size) -> Dropout(0.5) -> MaxPooling1D(2) -> Flatten.

    Args:
        kernel_size: Width of the branch's convolution kernel.

    Returns:
        A tuple ``(input_tensor, flattened_output_tensor)``.
    """
    channel_input = Input(shape=(max_length, ))
    embedded = Embedding(vocab_size, 2000, input_length=max_length)(channel_input)
    convolved = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
    dropped = Dropout(0.5)(convolved)
    pooled = MaxPooling1D(pool_size=2)(dropped)
    return channel_input, Flatten()(pooled)


# Three parallel channels that differ only in convolution kernel size.
inputs1, flat1 = build_channel(2)
inputs2, flat2 = build_channel(4)
inputs3, flat3 = build_channel(6)

# Merge the channels and classify.
merged = concatenate([flat1, flat2, flat3])

dense1 = Dense(500, activation="relu")(merged)
outputs1 = Dense(5, activation="softmax")(dense1)

# Alternative optimizer configurations that can be swapped in for 'adam':
# optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
# optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
# optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs1)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[categorical_accuracy],
)
model.summary()

# Training hyper-parameters.
batch_size = 16
epochs = 10
# The same padded sequences are fed to each of the three inputs.
hist = model.fit(
    [x_train, x_train, x_train],
    categorical_y_train,
    batch_size=batch_size,
    epochs=epochs,
)

# Report accuracy on the held-out test set as a percentage.
loss, acc = model.evaluate([x_test, x_test, x_test], categorical_y_test, verbose=0)
print('Test Accuracy: %f' % (acc * 100))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 181)          0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 181)          0                                            
__________________________________________________________________________________________________
input_23 (InputLayer)           (None, 181)          0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 181, 2000)    5176000     input_21[0][0]                   
__________________________________________________________________________________________________
embedding_