In [None]:
!pip install hazm
!pip install tensorflow-gpu==2.0

Collecting hazm
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |████████████████████████████████| 317kB 2.7MB/s 
[?25hCollecting libwapiti>=0.2.1; platform_system != "Windows"
[?25l  Downloading https://files.pythonhosted.org/packages/bc/0f/1c9b49bb49821b5856a64ea6fac8d96a619b9f291d1f06999ea98a32c89c/libwapiti-0.2.1.tar.gz (233kB)
[K     |████████████████████████████████| 235kB 8.7MB/s 
[?25hCollecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.7MB/s 
Building wheels for collected packages: libwapiti, nltk
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp36-cp36m-linux_x86_64.whl size=153859 sha256=b48dc5292b3c02103173fbe69

In [None]:
import numpy as np
import pandas as pd

from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, GlobalMaxPool1D
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
from hazm import *

In [None]:
# Load the training split.
# `read_excel` has no `encoding` parameter: .xlsx workbooks carry their own
# text encoding, and pandas >= 0.25 raises TypeError when the keyword is passed.
dataset = pd.read_excel('digikala.xlsx')
dataset

Unnamed: 0,text,label
0,دوربین اصلی این گوشی دارای یک سنسور مگاپیکسلی ...,0
1,کارت‌های حافظه ای که معمولا بر روی گوشی‌های هو...,0
2,اندازه رزولوشن نمایشگر iPod دقیقا همانند iPhon...,0
3,اما صفحه نمایش دوربین فیلمبرداری JVC GZ-MG335،...,0
4,نهایتا، بر روی قاب پشتی گوشی نیز، دوربین به هم...,0
...,...,...
7018,ولي يه مشکلي داشت که در بلند مدت خيلي ضد حال م...,-2
7019,همچنین اینکه برای اسکن یک عکس 6X4 به بیش از یک...,-2
7020,"چون علاوه بر طول عرض, ضخامت هم کم تر شده که اي...",-2
7021,بعد از استيو iphone هم مرد دو ساله منتظريم اون...,-2


In [None]:
# Pull the review text and its sentiment label out of the training frame
sentence_train, label_train = dataset['text'], dataset['label']

# Both shapes should report the same number of rows
print('Number of training sentence: ', sentence_train.shape)
print('Number of training label: ', label_train.shape)

Number of training sentence:  (7023,)
Number of training label:  (7023,)


In [None]:
from collections import Counter

# Class balance of the training set: label value -> number of sentences
cnt = dict(Counter(label_train))
print(cnt)

{0: 2000, 1: 2000, 2: 1928, -1: 937, -2: 158}


In [None]:
# Swap the pandas columns for plain numpy arrays
sentence_train, label_train = (
    np.asarray(column) for column in (sentence_train, label_train)
)

In [None]:
# One-hot encode the labels for the 5-way softmax.
#
# The raw labels are {-2, -1, 0, 1, 2}. `to_categorical` uses the label value
# as a column index, so negative labels only work through numpy's
# negative-index wraparound. Taking the label modulo 5 produces exactly the
# same columns (0->0, 1->1, 2->2, -2->3, -1->4) without relying on that side
# effect, and makes the class-index <-> label mapping explicit.
assert set(np.unique(label_train)) <= {-2, -1, 0, 1, 2}, "unexpected label value"
categorical_label_train = to_categorical(label_train % 5, 5)
categorical_label_train

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [None]:
import re

# Hazm helpers are built once and shared by every clean_data call
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stemmer = Stemmer()


def clean_data(doc):
    """Clean one raw document and return it as a single space-joined string.

    Steps, in order: Hazm normalization, Hazm word tokenization, per-token
    regex clean-up, removal of tokens of length <= 1 and of all-digit tokens,
    Hazm lemmatization, then Hazm stemming.
    """
    # (pattern, replacement) pairs applied to every token, in this order
    substitutions = (
        ("[،:.,;()/+]", " "),  # each listed punctuation character -> space
        (r"\!+", "!"),         # a run of '!' -> a single '!'
        (r"\؟+", "؟"),         # a run of Persian question marks -> a single one
        (r"\u200c", " "),      # zero-width non-joiner -> space
    )

    scrubbed = []
    for token in word_tokenize(normalizer.normalize(doc)):
        for pattern, replacement in substitutions:
            token = re.sub(pattern, replacement, token)
        scrubbed.append(token)

    # Keep only tokens longer than one character that are not purely digits
    kept = [w for w in scrubbed if len(w) > 1 and not w.isdigit()]
    lemmas = [lemmatizer.lemmatize(w) for w in kept]
    return ' '.join(stemmer.stem(w) for w in lemmas)

In [None]:
# Run every training sentence through clean_data, keeping the array layout
# (shape and dtype) of the raw sentences
train_docs = np.empty_like(sentence_train)
train_docs[:] = [clean_data(document) for document in sentence_train]

In [None]:
# Vocabulary cap handed to the Keras Tokenizer: only the most frequent words
# (indices below num_words) are emitted by texts_to_sequences.
num_words = 2000

# Fit the tokenizer on the cleaned training documents and encode them as
# lists of integer word indices
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_docs)
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# Show a small sample only; printing every encoded document floods the
# notebook output and bloats the saved file.
encoded_docs[:3]

[[9, 389, 3, 8, 187, 17, 146, 182, 7, 41, 2, 73, 89, 90, 6, 24, 1588, 6, 66, 243, 22, 210, 10, 23, 1206], [621, 110, 38, 2, 841, 22, 12, 96, 316, 622, 165, 286, 9, 44, 7, 303], [390, 147, 127, 524, 711, 257, 270, 7, 41], [55, 29, 27, 9, 410, 1146, 968, 1147, 4, 204, 1589, 5, 390, 1042, 119, 6, 375, 173, 4, 204, 346, 11], [842, 22, 12, 423, 79, 8, 23, 9, 1, 165, 347, 17, 330, 1, 205, 7, 1389], [331, 5, 139, 3, 8, 17, 371, 882, 48, 300, 2, 15, 10, 7], [83, 228, 4, 190, 459, 737, 1, 105, 191, 691, 419, 15, 6, 136, 129, 7, 41, 2, 5, 290, 129, 218, 229, 119, 4, 151, 1268, 7, 568], [1483, 20, 19, 2, 71, 220, 53, 1713, 1590, 1855, 187, 9, 6, 337, 1148, 7, 41, 134, 363, 15, 5, 1856, 813, 1484, 304, 159, 5, 1084, 7, 41], [1207, 140, 67, 814, 1, 969, 206, 67, 218, 1857, 1, 969, 1858, 206, 66, 298, 206, 738, 1, 70, 277, 1859, 1, 969, 1858, 206, 738, 206, 66, 298, 1, 70, 277, 1, 969, 1714, 2, 1715, 4, 3, 220, 140, 7, 87], [637, 3, 9, 4, 312, 80, 1149, 815, 73, 166, 1329, 7, 87, 2, 692, 346, 5, 71,

In [None]:
# Longest cleaned training document, measured in whitespace-separated tokens;
# used as the common padded sequence length
tokens_per_doc = (len(document.split()) for document in train_docs)
max_length = max(tokens_per_doc)
max_length

263

In [None]:
# Bring every encoded training sequence to max_length, padding at the end
x_train_padded = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)

In [None]:
# Sanity check: the second padded training sequence
x_train_padded[1, :]

array([621, 110,  38,   2, 841,  22,  12,  96, 316, 622, 165, 286,   9,
        44,   7, 303,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [None]:
# Size of the Embedding lookup table (its `input_dim`).
#
# Keras needs input_dim == largest index + 1. Index 0 is the padding value and
# the tokenizer numbers words from 1, so a full vocabulary needs
# len(word_index) + 1 rows. Using len(word_index) alone is one row short
# whenever the vocabulary is smaller than the num_words cap. When the cap is
# active, texts_to_sequences only emits indices below num_words, so anything
# beyond the cap is just unused embedding rows.
index_upper_bound = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or index_upper_bound, index_upper_bound)

In [None]:
# Bidirectional-LSTM sentence classifier: embedding -> BiLSTM over the whole
# sequence -> max over time -> dense head -> 5-way softmax
model_blstm = Sequential([
    Embedding(vocab_size, 300, input_length=max_length),
    Bidirectional(LSTM(300, return_sequences=True, name='lstm_layer')),
    GlobalMaxPool1D(),
    Dropout(0.25),
    Dense(300, activation="relu"),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])

In [None]:
# Training hyper-parameters read by the fit call
batch_size_blstm, epochs_blstm = 64, 5

# Multi-class set-up: categorical cross-entropy on the one-hot labels
model_blstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=["categorical_accuracy"],
)

model_blstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 263, 300)          1829400   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 263, 600)          1442400   
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 600)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 300)               180300    
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                

In [None]:
# Train on the full training set; the returned History is kept in hist_blstm
hist_blstm = model_blstm.fit(
    x=x_train_padded,
    y=categorical_label_train,
    batch_size=batch_size_blstm,
    epochs=epochs_blstm,
    shuffle=True,
)

Train on 7023 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Load the held-out test split.
# `read_excel` has no `encoding` parameter: .xlsx workbooks carry their own
# text encoding, and pandas >= 0.25 raises TypeError when the keyword is passed.
test = pd.read_excel('digikala_test.xlsx')

# Review text and sentiment label columns of the test split
x_test = test['text']
y_test = test['label']

In [None]:
# Report how many test sentences and labels were loaded
for caption, column in (
    ('Number of testing sentence: ', x_test),
    ('Number of testing label: ', y_test),
):
    print(caption, column.shape)

Number of testing sentence:  (1854,)
Number of testing label:  (1854,)


In [None]:
# Swap the pandas columns for plain numpy arrays
x_test, y_test = (np.asarray(column) for column in (x_test, y_test))

In [None]:
# Run every test sentence through the same clean_data pipeline as the
# training data, keeping the array layout of the raw sentences
test_docs = np.empty_like(x_test)
test_docs[:] = [clean_data(document) for document in x_test]

In [None]:
# Encode the cleaned test documents with the tokenizer fitted on the training
# data, then pad them to the training length.
# NOTE: this rebinds `encoded_docs`, which previously held the training sequences.
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test_padded = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)

In [None]:
# One-hot encode the test labels with the same layout as the training labels.
# Labels are {-2, -1, 0, 1, 2}; the modulo makes the column mapping explicit
# (0->0, 1->1, 2->2, -2->3, -1->4) instead of relying on numpy's negative-index
# wraparound inside to_categorical. The resulting encoding is unchanged.
categorical_y_test = to_categorical(y_test % 5, 5)

In [None]:
# Score the trained model on the test set; evaluate returns the loss followed
# by the compiled metric (categorical accuracy)
loss_blstm, acc_blstm = model_blstm.evaluate(
    x=x_test_padded,
    y=categorical_y_test,
    verbose=1,
)
print('Test Accuracy: %f' % (100 * acc_blstm))

Test Accuracy: 65.372169


In [None]:
# Predicted class index per test sentence.
# `Sequential.predict_classes` is deprecated and removed in later TensorFlow
# releases; for a softmax output it is exactly argmax over the last axis of
# `predict`, which works on every version.
y_pred_blstm = np.argmax(model_blstm.predict(x_test_padded), axis=-1)

In [None]:
# Build the per-sentence prediction table and the coarse sentiment counts.
#
# Class indices follow the one-hot encoding of the raw labels
# {0, 1, 2, -2, -1}: negative labels wrap around in to_categorical, so
# class 3 is label -2 and class 4 is label -1. With 1 = "Happy" and
# 2 = "Delighted", the stronger negative label (-2) is "Furious" and the
# milder one (-1) is "Angry"; the previous chain had these two names swapped.
class_names = {0: "Neutral", 1: "Happy", 2: "Delighted", 3: "Furious", 4: "Angry"}
# Coarse bucket each class contributes to
class_groups = {0: "neutral", 1: "happy", 2: "happy", 3: "angry", 4: "angry"}

text = []
true_label = []
pred_label = []
group_counts = {"angry": 0, "happy": 0, "neutral": 0}

for i, predicted_class in enumerate(y_pred_blstm):
    predicted_class = int(predicted_class)  # numpy integer -> plain dict key
    text.append(x_test[i])
    true_label.append(y_test[i])  # raw label value, not a class index
    pred_label.append(class_names[predicted_class])
    group_counts[class_groups[predicted_class]] += 1

# Keep the individual counters that the percentage cell reads
angry = group_counts["angry"]
happy = group_counts["happy"]
neutral = group_counts["neutral"]

In [None]:
# Share of each coarse sentiment among the predictions, in percent.
# The denominator is taken from the predictions themselves instead of a
# hard-coded test-set size, so the numbers stay right if the test file changes.
total_predictions = len(y_pred_blstm)
print("angry: " , (angry/total_predictions)*100)
print("happy: " , (happy/total_predictions)*100)
print("neutral: " , (neutral/total_predictions)*100)

angry:  12.351672060409925
happy:  55.71736785329018
neutral:  31.93096008629989


In [None]:
# Collect the raw text, the true label and the predicted label name per row
prediction_columns = {
    "text": text,
    "true label": true_label,
    "prediction label": pred_label,
}
dataFrame = pd.DataFrame(prediction_columns)

In [None]:
# Write the prediction table to an Excel workbook, without the row index
dataFrame.to_excel(excel_writer="prediction2.xlsx", index=False)