##clone repo

In [None]:
!git clone https://github.com/HamidrezaHayatjou/Persian_Sentiment_Analysis.git

##install and import libs

In [None]:
!pip install hazm
!pip install clean-text
# after installing please Restart Runtime

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats as s
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.metrics import (accuracy_score, precision_score, recall_score,f1_score)
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from statistics import mode

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Dense, GRU, Embedding, LSTM, Dropout,Conv1D
from tensorflow.keras.layers import BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import FreqDist

from cleantext.clean import clean
from hazm import *
import hazm
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


##Reading and exploring digikala comment dataset

In [4]:
cd Persian_Sentiment_Analysis

/content/Persian_Sentiment_Analysis


In [5]:
df = pd.read_excel('digikala_comment_dataset.xlsx')
df = df[["recommend", "comment"]]
df.rename(columns = {'recommend':'LABEL'}, inplace = True)
df.rename(columns = {'comment':'COMMENT'}, inplace = True)
print(df.shape) #(100000, 12)

(100000, 2)


In [None]:
print(df['LABEL'].value_counts())
print("-------------------------")
print(df.isnull().sum())

In [7]:
df.dropna(subset=['LABEL', 'COMMENT'], inplace=True)
print(df.isnull().sum())
print("-------------------------")
print(df.shape)

LABEL      0
COMMENT    0
dtype: int64
-------------------------
(99883, 2)


In [None]:
df = df[~df.LABEL.isin(["\\N"])]
print(df.shape)

df.reset_index(drop=True, inplace=True)
print(df.shape)

In [9]:
LABEL_names = df.LABEL.unique()
LABEL_names

array(['recommended', 'not_recommended', 'no_idea'], dtype=object)

In [10]:
label2id= {'recommended': 0, 'not_recommended': 1,'no_idea': 2}
id2label= {0: 'recommended', 1: 'not_recommended', 2:'no_idea'}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'recommended': 0, 'not_recommended': 1, 'no_idea': 2}
id2label: {0: 'recommended', 1: 'not_recommended', 2: 'no_idea'}


In [None]:
df['LABEL'] = df.LABEL.replace(label2id)
df.head()

In [None]:
Sentiment_labels = ['recommended', 'not_recommended','no_idea']
label_counts = [0 for e in Sentiment_labels]
labelcount = df['LABEL'].value_counts()
label_counts = label_counts

plt.bar(Sentiment_labels, labelcount)
plt.xlabel('Sentiments')
plt.ylabel('Number of comments for each label')
plt.show()

##preproces

In [13]:
df["comment_len_by_words"] = np.nan
for i in tqdm(range (0, len(df['COMMENT']))):
  df['comment_len_by_words'][i] = len(hazm.word_tokenize(str(df['COMMENT'][i])))

#df['comment_len_by_words'] = df['COMMENT'].apply(lambda t: len(hazm.word_tokenize(str(t))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['comment_len_by_words'][i] = len(hazm.word_tokenize(str(df['COMMENT'][i])))
100%|██████████| 63586/63586 [00:27<00:00, 2306.34it/s]


In [14]:
min_len = df["comment_len_by_words"].min()
max_len = df["comment_len_by_words"].max()
print(f'Minimum: {min_len} \tMaximum: {max_len}')

Minimum: 1.0 	Maximum: 985.0


In [15]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):

    data_length = data[col].values
    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])
    data_glt_rate = (data_glt / len(data_length)) * 100
    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

data_gl_than(df, 256, 3)

Texts with word length of greater than 3 and less than 256 includes 94.49% of the whole!


In [16]:
# remove comments with the length of fewer than three words
minlim, maxlim = 3, 256
df['comment_len_by_words'] = df['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
df = df.dropna(subset=['comment_len_by_words'])
df = df.reset_index(drop=True)

In [None]:
fig1 = go.Figure()
fig1.add_trace(go.Histogram(x=df['comment_len_by_words']))
fig1.update_layout(title_text='Distribution of word counts within comments', xaxis_title_text='Word Count', yaxis_title_text='Frequency', bargap=0.2, bargroupgap=0.2)

fig2 = go.Figure()
groupby_rate = df.groupby('LABEL')['LABEL'].count()
fig2.add_trace(go.Bar(x=list(sorted(groupby_rate.index)), y=groupby_rate.tolist(), text=groupby_rate.tolist(), textposition='auto'))
fig2.update_layout(title_text='Distribution of rate within comments', xaxis_title_text='Rate', yaxis_title_text='Frequency',bargap=0.2, bargroupgap=0.2)

fig1.show()
fig2.show()

**Due to data imbalance, we throw away part of the data.**

In [19]:
sorted_df=df.sort_values(by=['LABEL'])
sorted_df.reset_index(drop=True, inplace=True)
# print(sorted_df.shape)
# sorted_df
sorted_df['LABEL'].value_counts()

0    34708
1    15306
2    10069
Name: LABEL, dtype: int64

In [20]:
print(len(sorted_df['LABEL'][0:10069]))
print(len(sorted_df['LABEL'][34708:44777]))
print(len(sorted_df['LABEL'][50014:]))

10069
10069
10069


In [None]:
data = pd.concat([sorted_df[0:10069], sorted_df[34708:44777], sorted_df[50014:]])
data = shuffle(data)
data.reset_index(drop=True, inplace=True)
print(len(data))
data

**cleaning data**

In [18]:
#this function borrowed from https://colab.research.google.com/github/hooshvare/parsbert/blob/master/notebooks/Taaghche_Sentiment_Analysis.ipynb
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text, fix_unicode=True, to_ascii=False, lower=True, no_line_breaks=True, no_urls=True, no_emails=True,
                 no_phone_numbers=True, no_numbers=False, no_digits=False, no_currency_symbols=True, no_punct=False,
                 replace_with_url="", replace_with_email="", replace_with_phone_number="", replace_with_number="", replace_with_digit="0",
                 replace_with_currency_symbol="",)

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing wierd patterns
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)

    return text

It takes about 7 hours to run this cell. Therefore, you can import the cleared data by running the next cell.

In [None]:
#It takes about 7 hours to run this cell. Therefore, you can import the cleared data by running the next cell.
data["cleaned_comment"] = np.nan
for i in tqdm(range (0, len(data['COMMENT']))):
  data['cleaned_comment'][i] = cleaning(str(data['COMMENT'][i]))

data = data[['cleaned_comment', 'LABEL']]
data.columns = ['COMMENT', 'LABEL']
data.to_excel("cleaned_digikala_dataset.xlsx", index=False)
# data.head()

In [None]:
data = pd.read_excel('/content/Persian_Sentiment_Analysis/cleaned_digikala_dataset.xlsx')
data.head()

In [None]:
fig = go.Figure()
groupby_label = data.groupby('LABEL')['LABEL'].count()
fig.add_trace(go.Bar(x=list(sorted(groupby_label.index)), y=groupby_label.tolist(), text=groupby_label.tolist(), textposition='auto'))
fig.update_layout(title_text='Distribution of label within comments', xaxis_title_text='Label', yaxis_title_text='Frequency', bargap=0.2, bargroupgap=0.2)
fig.show()

##tokenizing, vectorizing, splitting data

In [None]:
idx = np.random.randint(0, len(data))
sample_comment = data.iloc[idx]['COMMENT']
sample_label = data.iloc[idx]['LABEL']

print(f'Sample: \n{sample_comment}\n{sample_label}')

In [None]:
target = data['LABEL']
text = data['COMMENT']
Xtrain = text
Ytrain= target
print(Xtrain.shape)

In [29]:
Xtrain, Ytrain = data['COMMENT'], data['LABEL']
print(Xtrain.shape)

(30207,)


In [32]:
all_words=' '.join(Xtrain)
all_words=word_tokenize(all_words)
dist=FreqDist(all_words)
num_unique_word=len(dist)
print ('number unique word:',num_unique_word)

number unique word: 37005


In [None]:
max_tokens = 100
num_words = num_unique_word
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(Xtrain)

x_train_tokens = tokenizer.texts_to_sequences(Xtrain)

#print(Xtrain[100])
print(x_train_tokens[100])
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding='post')


idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))
def tokens_to_string(tokens):

    words = [inverse_map[token] for token in tokens if token!=0]
    text = ' '.join(words)
    return text
print('train shape',x_train_pad.shape)

In [48]:
txt_train, txt_test, lbl_train, lbl_test = train_test_split(x_train_pad, Ytrain,test_size=0.1)
txt_train, txt_valid, lbl_train, lbl_valid = train_test_split(txt_train, lbl_train, test_size=0.2)

x_train, y_train = txt_train.tolist(), lbl_train.tolist()
x_valid, y_valid = txt_valid.tolist(), lbl_valid.tolist()
x_test, y_test = txt_test.tolist(), lbl_test.tolist()

##Model definition and training

In [49]:
embedding_size = 250

model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_size, input_length=max_tokens, name='embedding_layer'))
# model.add(Dropout(0.25))
model.add(Conv1D(256,kernel_size=3,padding='same',activation='elu',strides=1))
model.add(BatchNormalization())
model.add(GlobalMaxPooling1D())
model.add(Dense(200, activation='elu'))
model.add(Dropout(0.25))
model.add(Dense(3,activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(x_train, y_train, batch_size=256, epochs=10, validation_data=(x_valid, y_valid), verbose=1)

##predicting

In [51]:
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=-1)



In [52]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred , average="micro")
precision = precision_score(y_test, y_pred , average="micro")
f1 = f1_score(y_test, y_pred, average="micro")

print("accuracy:","%.2f" %(accuracy*100))
print("racall:","%.2f" %(recall*100))
print("precision:","%.2f" %(precision*100))
print("f1score:","%.2f" %(f1*100))

accuracy: 62.83
racall: 62.83
precision: 62.83
f1score: 62.83


In [55]:
print(id2label)

{0: 'recommended', 1: 'not_recommended', 2: 'no_idea'}


In [54]:
for i in range (0,5):
  a = x_test[i]
  print(a)
  text = tokenizer.sequences_to_texts([a])
  print(text)
  print('true label is : ', y_test[i])
  print('pred label is : ', y_pred[i])

[9, 289, 1190, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['خیلی بدرد نخور بود']
true label is :  1
pred label is :  1
[381, 1, 1730, 417, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['قشنگه و مث عکسش بود']
true label is :  2
pred label is :  2
[123, 33699, 4, 85, 80, 348, 31, 153, 2068, 3723, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 