In [None]:
from __future__ import unicode_literals
from hazm import *
import tensorflow as tf
from keras.models import Sequential
import pandas as pd
from keras.layers import Dense
import numpy as np

import re
from urlextract import URLExtract
import emojis

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
import np_utils

from tqdm import tqdm

In [None]:
# Load the raw Digikala reviews. Rows pandas cannot parse are silently
# skipped (on_bad_lines='skip') instead of raising an error.
data = pd.read_csv('data/Digikala.csv', on_bad_lines='skip')

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Distribution of the raw 'Suggestion' labels, before any remapping.
data['Suggestion'].value_counts()

In [None]:
def replace_values(value):
    """Fold the labels 2 and 3 into 0; every other value is returned untouched."""
    return 0 if value in (2, 3) else value

# Apply the merge element-wise, overwriting the 'Suggestion' column.
data['Suggestion'] = data['Suggestion'].apply(replace_values)

In [None]:
# Label distribution after 2 and 3 were folded into 0.
data['Suggestion'].value_counts()

In [None]:
def switch_values(value):
    """Swap the labels 0 and 1; any other value passes through unchanged."""
    return 1 if value == 0 else (0 if value == 1 else value)

# Swap 0 <-> 1 in 'Suggestion', overwriting the column.
# Not idempotent: running this cell a second time swaps the labels back.
data['Suggestion'] = data['Suggestion'].apply(switch_values)

In [None]:
# Label distribution after the 0/1 swap.
data['Suggestion'].value_counts()

In [None]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    """Convert Persian and Arabic-Indic digits to ASCII digits.

    Both digit blocks are handled: Persian (U+06F0-U+06F9) and Arabic-Indic
    (U+0660-U+0669). All other characters, including '.', are left as they are.

    Args:
        input_str: Value to convert; it is passed through ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The string with every Persian/Arabic-Indic digit replaced by 0-9.
    """
    table = {}
    # Each block holds its ten digits consecutively, starting at its zero.
    for zero in (0x06F0, 0x0660):
        for digit in range(10):
            table[zero + digit] = str(digit)
    return str(input_str).translate(table)


def convert_ar_characters(input_str):
    """Replace Arabic letter variants with the letter form used for Persian text.

    The substitution itself is delegated to ``_multiple_replace``.
    """
    # (source letter, replacement) pairs, in the order they are matched.
    pairs = [
        ('ك', 'ک'),
        ('ى', 'ی'),
        ('ي', 'ی'),
        ('ئ', 'ی'),
        ('إ', 'ا'),
        ('أ', 'ا'),
        ('ة', 'ه'),
        ('ؤ', 'و'),
    ]
    return _multiple_replace(dict(pairs), input_str)


def preprocess(text):
    """Normalise one raw review into lowercase, punctuation-free text.

    Steps, in order: URLs -> '<URL>', emojis -> '<emoji>', Persian digits ->
    ASCII, Arabic letter variants -> Persian, ASCII smileys -> '<smiley>',
    lowercasing, punctuation -> spaces, whitespace runs -> one space, and
    characters repeated three or more times -> a single occurrence.

    Because '<' and '>' are part of the stripped punctuation, the placeholders
    end up as the plain tokens 'url', 'emoji' and 'smiley'.

    Args:
        text: The raw review. Non-string values (e.g. NaN from an empty cell)
            are converted with ``str()`` before processing.

    Returns:
        The cleaned text without leading/trailing whitespace, or the literal
        string 'None' when the result holds no character from the
        Arabic/Persian block U+0600-U+06FF.
    """
    if not isinstance(text, str):
        text = str(text)

    # Build the URL extractor once and reuse it: this function runs per row.
    extractor = getattr(preprocess, '_url_extractor', None)
    if extractor is None:
        extractor = URLExtract()
        preprocess._url_extractor = extractor

    for url in extractor.gen_urls(text):
        text = text.replace(url, '<URL>')
    for emoji in emojis.get(text):
        text = text.replace(emoji, '<emoji>')

    text = convert_fa_numbers(text)
    text = convert_ar_characters(text)
    # Replace ASCII smileys; done before lowercasing so ':D' / ':P' still match.
    text = re.sub(r"(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:\s?D|8-\)|:\s?\||;\s?\)|:-\*|:-\||:-\(|:\s?P|:-P|:-p|:-b|:-O|:-o|:-0|:-\@|:\$|:-\^|:-&|:-\*|:-\+|:-\~|:-\`|:-\>|:-\<|:-\}|:-\{|\[:\s?\]|\[:\s?\]|:\s?\]|:\s?\[|:\s?\}|:\s?\{)",'<smiley>',text)
    text = text.lower()
    text = re.sub(r'[<>#.:()"\'!?؟،,@$%^&*_+\[\]/]', ' ', text)
    text = re.sub(r'[\s]{2,}', ' ', text)
    # Collapse elongated words (3+ repeats of one character).
    # NOTE(review): \w also matches digits, so '1000' becomes '10' - confirm
    # that this is acceptable for the downstream features.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    # Strip last: punctuation at either end has just been turned into spaces.
    text = text.strip()

    if re.search(r'[\u0600-\u06FF]', text):
        return text
    return 'None'

In [None]:
# Register tqdm with pandas so that .progress_apply() is available.
tqdm.pandas()

In [None]:
# Clean every review (with a progress bar) into a new 'cleaned' column;
# the original 'Text' column is kept.
data['cleaned'] = data['Text'].progress_apply(preprocess)

In [None]:
 # Preview the frame again, now including the 'cleaned' column.
 # NOTE(review): this line starts with a stray space; plain Python rejects
 # that as an unexpected indent - confirm and remove it.
 data.head()

In [None]:
# Drop every row that has a missing value in any column.
# preprocess() marks reviews without Persian text with the *string* 'None',
# which is not a missing value, so those rows are NOT removed here.
data = data.dropna()

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows.
data.info()

In [None]:
# Persist the cleaned frame so the slow preprocessing need not be repeated.
# The row index is written as well (no index=False); presumably this is the
# source of the 'Unnamed: 0' column that is dropped after reloading.
data.to_csv('data/digikala_preprocessed.csv')

In [None]:
# Reload the preprocessed reviews written by the previous cell.
data = pd.read_csv('data/digikala_preprocessed.csv', on_bad_lines='skip')
# preprocess() stores the placeholder string 'None' for reviews without
# Persian text. Recent pandas versions treat 'None' as a missing value when
# parsing CSV, so those rows come back as NaN and would make the text
# vectorizers fail. Drop them explicitly.
data = data.dropna(subset=['cleaned'])

In [None]:
# Distribution of the numeric 'Score' values, before thresholding.
data['Score'].value_counts()

In [None]:
def replace_values_with_status(df, column_name):
    """Relabel a numeric column: values <= 40 become 'UNSAT', all others 'SAT'.

    The column of ``df`` is overwritten in place and the same frame is returned.
    """
    def to_status(value):
        if value <= 40:
            return 'UNSAT'
        return 'SAT'

    df[column_name] = df[column_name].apply(to_status)
    return df

# Turn the numeric 'Score' into 'UNSAT' / 'SAT' labels.
# Not idempotent: once the column holds strings, re-running this cell
# compares a string with 40 and raises TypeError.
data = replace_values_with_status(data, 'Score')

In [None]:
# Inspect the relabelled 'Score' column.
data['Score']

In [None]:
# Class balance of the 'UNSAT' / 'SAT' labels.
data['Score'].value_counts()

In [None]:
# Preview the frame with the relabelled 'Score' column.
data.head()

In [None]:
def set_suggestion(row):
    """Binary label for one row: 1 when its 'Score' is 'UNSAT', otherwise 0."""
    return 1 if row['Score'] == 'UNSAT' else 0
    
# Recompute 'Suggestion' row by row from the thresholded 'Score'
# (1 = UNSAT, 0 = SAT). This replaces whatever the column held before.
data['Suggestion'] = data.apply(set_suggestion, axis=1)

In [None]:
# Remove the index column that came back from the CSV round-trip.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Final look at the modelling frame.
data.head()

In [None]:
# Bag-of-words features: one column per vocabulary term, raw term counts.
count_vectorizer = CountVectorizer()
# .toarray() yields a plain ndarray; .todense() would return the discouraged
# np.matrix type. The dense copy holds n_documents x vocabulary_size values,
# so memory use grows quickly with the corpus.
X_count_vectorized = count_vectorizer.fit_transform(data.cleaned).toarray()

In [None]:
# TF-IDF features: terms must occur in at least 2 documents, and at most
# 10000 terms are kept.
vectorizer = TfidfVectorizer(min_df=2, max_features=10000)
# .toarray() yields a plain ndarray; .todense() would return the discouraged
# np.matrix type.
X_tfidf_vectorized = vectorizer.fit_transform(data.cleaned).toarray()

In [None]:
# Target vector as a NumPy array (1 = UNSAT, 0 = SAT).
labels = data['Suggestion'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the count features; random_state pins the shuffle.
# NOTE(review): the vectorizer was fitted on all rows before this split, so
# the vocabulary has already seen the test documents.
X_train, X_test, y_train, y_test = train_test_split(X_count_vectorized, labels, test_size=0.2, random_state=42) 

In [None]:
# Same 80/20 split for the TF-IDF features. test_size and random_state match
# the count-feature split, so both feature sets are partitioned identically.
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf_vectorized, labels, test_size=0.2, random_state=42)

In [None]:
# Despite the name this is the full shape tuple of the training matrix;
# element [1] (the column count) is what sizes the network's input layer.
input_dim = X_tfidf_train.shape

In [None]:
# Show the shape of the TF-IDF training matrix.
print(input_dim)

In [None]:
# Baseline 1: logistic regression on raw term counts.
# Every split is passed through np.asarray before it reaches scikit-learn.
X_train, y_train, X_test, y_test = map(
    np.asarray, (X_train, y_train, X_test, y_test)
)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)  # mean accuracy on the held-out 20%
print("Accuracy:", score)

In [None]:
# Baseline 2: logistic regression on TF-IDF features.
# Every split is passed through np.asarray before it reaches scikit-learn.
X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = map(
    np.asarray, (X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test)
)
clf = LogisticRegression()
clf.fit(X_tfidf_train, y_tfidf_train)
tfidf_score = clf.score(X_tfidf_test, y_tfidf_test)  # mean accuracy on the held-out 20%
print("Accuracy:", tfidf_score)

In [None]:
# Hyper-parameters of the dense network defined below.
# nb_classes sets the width of the softmax output layer.
nb_classes = 2
# Samples per gradient update and number of passes over the training set.
batch_size = 32
nb_epochs = 10

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the training labels. num_classes is passed explicitly so the
# encoded width always equals the model's output layer (nb_classes) instead of
# being inferred from the largest label present in the training split.
y_tfidf_train_cat = to_categorical(y_tfidf_train, num_classes=nb_classes)

In [None]:
# Feed-forward classifier on the TF-IDF vectors:
# 1000 -> 500 -> 50 ReLU units, each followed by 50% dropout, then a softmax
# over nb_classes outputs.
model = Sequential()

# First hidden layer also declares the input width (number of TF-IDF columns).
model.add(Dense(1000, input_shape=(input_dim[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

for hidden_units in (500, 50):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

# Track accuracy too, so the per-epoch training log reports more than the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Make TensorFlow run tf.function code eagerly instead of as compiled graphs.
# NOTE(review): this is normally a debugging aid and slows training;
# presumably a workaround for an error - confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
# Train on the TF-IDF features against the one-hot labels; verbose=2 prints
# one summary line per epoch. No validation data is passed.
model.fit(X_tfidf_train, y_tfidf_train_cat, batch_size=batch_size, epochs=nb_epochs,verbose=2)

In [None]:
# Predicted class = index of the largest softmax output in each row.
y_test_pred = model.predict(X_tfidf_test)
y_test_predclass = np.argmax(y_test_pred, axis=1)
# Train-set predictions, to compare against the test score for overfitting.
y_train_pred = model.predict(X_tfidf_train)
y_train_predclass = np.argmax(y_train_pred, axis=1)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy as a percentage. Rounding happens after scaling so the printed
# value has no floating-point tail (e.g. 87.53999999999999).
test_accuracy = accuracy_score(y_tfidf_test, y_test_predclass)
train_accuracy = accuracy_score(y_tfidf_train, y_train_predclass)
print("\nDeep Neural Network - Test accuracy:", round(test_accuracy * 100, 2))
print("\nDeep Neural Network - Train accuracy:", round(train_accuracy * 100, 2))

In [None]:
from tensorflow.keras.models import save_model

# Persist the trained network to a .h5 file under model/.
# NOTE(review): save_model is imported above, but model.save() is what runs.
model.save('model/digikala_keras_model.h5')

In [None]:
# Smoke test on one hand-written review: clean it, then project it onto the
# fitted TF-IDF vocabulary. .toarray() gives a plain (1, n_features) ndarray
# where .todense() would give an np.matrix.
X_pred = vectorizer.transform([preprocess('اصلا پیشنهاد نمیشود')]).toarray()

In [None]:
# Softmax output for the sample review: one row with nb_classes probabilities.
# Presumably column 1 is the probability of label 1 (UNSAT) - verify against
# the label encoding.
model.predict(X_pred)