In [23]:
import numpy as np
import pandas as pd
import hazm as hz
from hazm import Normalizer, word_tokenize,stopwords_list,Stemmer
import re
from keras.preprocessing import sequence
import tensorflow as tf
from keras.models import Sequential
from tensorflow import keras
from tensorflow.keras import layers
import itertools    
from keras.layers import Embedding, LSTM, Dense, Dropout,Masking
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [13]:
df=pd.read_csv('snappfood.csv', on_bad_lines='skip' , delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


In [14]:
#remove nan values
df=df[['comment','label_id']]
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69480 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   comment   69480 non-null  object 
 1   label_id  69480 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.6+ MB


In [15]:
def preprocess(text):
    text = re.sub(r"[\{\}\؛\*\=\-\_\+\/\n]"," ",str(text))
    text = re.sub("[ ]+"," ",text)
    text = re.sub("\!+","!",text)
    text = re.sub("[؟]+","؟",text)
    text = re.sub("[.]+","",text)
    text = re.sub("[،]+","",text)
    # replace Finglish words with an empty string
    finglish_pattern = r"[a-zA-Z]+"
    if finglish_pattern in text:   
        text = re.sub(finglish_pattern, "", text)
    for c in "..آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئ":
        text = re.sub(f"[{c}]+", c, text)
    # \u200c:separate two characters that should not be connected,\r\n:remove line break
    text=text.replace('\u200c', '').replace('\r\n',' ').replace('|',' ')
    #normalize the text
    text = normalizer.normalize(text)
    words = []
    words.append(hz.word_tokenize(text))
    return words

train_data = df['comment'].apply(preprocess)
df['comment'] = list(itertools.chain(*train_data))

In [16]:
# Remove stopwords
stopwords=stopwords_list()
df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x if word not in stopwords]))

In [17]:
#remove english comments
english_text=df[df.comment.str.contains(r'[a-zA-Z]+')]
idx=english_text.index
df=df.drop(idx).reset_index()


In [18]:
# find the stemm of words
stemmer = hz.Stemmer()
def stem_comment(comment):
    return ' '.join([stemmer.stem(word) for word in comment.split()])

# Apply stemming to 'comment' column
df['comment'] = df['comment'].apply(stem_comment)


In [19]:
df=df[['comment','label_id']]

In [20]:
df.head()

Unnamed: 0,comment,label_id
0,واقعا حیف وق بنویس سرویس دهیتون افتضاح,1.0
1,قرار ۱ ساعته برسه ن ساع زود موقع چقدر پلاک خفن...,0.0
2,قیم مدل اصلا کیفیت سازگار نداره ظاهر فریبنده د...,1.0
3,درس اندازه کیف امیداور کیفیتون باشه مشتر همیشگ بش,0.0
4,شیرین وانیل مدل,0.0


In [24]:
# Preprocessing
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df['comment'])
print(dict(list(tokenizer.word_index.items())[0:3]))
#transforms each text in texts to a sequence of integers
X = tokenizer.texts_to_sequences(df['comment'])
#adding padding to comments
X = pad_sequences(X, maxlen=100)
# Splitting data into training and testing set
y = df['label_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

{'غذا': 1, 'کیف': 2, 'سفار': 3}


array([[  0,   0,   0, ..., 212,  88,  63],
       [  0,   0,   0, ...,  41, 202,  42],
       [  0,   0,   0, ..., 100,  10,   8],
       ...,
       [  0,   0,   0, ..., 837,   4, 246],
       [  0,   0,   0, ..., 100,   6,  28],
       [  0,   0,   0, ...,  54, 334,   4]], dtype=int32)

In [22]:
def get_model():
    model = Sequential()
    model.add(Masking(mask_value=0))
    model.add(Embedding(10000, 128, input_length=100))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model=get_model()
# Training model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))

# Evaluate model
score = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (score[1]*100))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 82.39%
