In [2]:
import re
from hazm import *
import time
import numpy as np 
import pandas as pd
import pickle
import sklearn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import defaultdict
from keras.layers import LSTM,Dropout,Dense,Input,Activation,TimeDistributed
from tensorflow.keras.models import Model
from keras.callbacks import ModelCheckpoint

In [4]:
def get_data(direction):
    df = pd.DataFrame()
    df = pd.read_csv(direction,sep = "\t", error_bad_lines=False)
    df = df[['text','category']]
    df.fillna(value='نامحتوا', inplace=True)
    df = dict(df)
    texts= list(df['text'])
    return texts

def normalize_document(content):
    not_valid_characters = r'[^بهگزارشودیخنجظمستحلعثصپکف;ئطض‌آق۲۵۰چء\^ذغ۱۷۹۶۸ژ۴۳1496327805؟. ‌]'
    normalizer = Normalizer()
    content = normalizer.normalize(content)
    content = re.sub('\$[^ا-ی]+',"",content)
    content = re.sub('\r\n',' ',content)
    content = re.sub('\d+/\d+|\d+\.\d+|\d+:\d+','^',content)
    content = re.sub(not_valid_characters,'',content)
    content = re.sub('[۰-۹]+',' N ',content)
    content = re.sub('[\d]+',' N ', content)
    content = re.sub('[\^]+',' N ',content)
    content = re.sub(' +', ' ', content)
    return content
 
    
def clean(list_of_news):
    cleaned_news_list, char_set, total_char_tokens, avg_count = [], set(), 0, 0
    #list_of_news, avg_count = remove_outliers(list_of_news, test)
    for content,index in zip(list_of_news,range(len(list_of_news))):
        normalized_content = normalize_document(content)
        tokenized_content, total_char_tokens = Tokenize_Document(normalized_content,char_set, total_char_tokens)
        cleaned_news_list.append(tokenized_content)
    char_set.add('\t')
    char_set.add('\n')
    return cleaned_news_list,char_set, avg_count, total_char_tokens

def reform_sentences(news, cluster_count):
    clustered_news, cluster = [],[]
    for index,document in enumerate(news):
        for char in document:
            cluster.append(char)
            if len(cluster) > cluster_count:
                clustered_news.append(cluster[:cluster_count])
                cluster = cluster[cluster_count:]
    if len(cluster):
        clustered_news.append(cluster)
    return clustered_news
    

def map_chars2index(char_set):
    chars = sorted(list(char_set))
    char2index, index2char = defaultdict(lambda: -1), defaultdict(lambda: -1)
    for index, char in enumerate(chars):
        char2index[char], index2char[index] = index, char
    i2c_pickle = open("index2char.pickle","wb")
    pickle.dump(dict(index2char),i2c_pickle)
    i2c_pickle.close()
    c2i_pickle = open("char2index.pickle","wb")
    pickle.dump(dict(char2index),c2i_pickle)
    c2i_pickle.close()
    return char2index, index2char

def char_level_converter(char_level_news, one_hot_encoder, char2index):
    encoded_news = []
    for document in char_level_news:
        encoded_document = []
        for char in document:
            encoded_document.append(one_hot_encoder[char2index[char]])
        encoded_news.append(encoded_document)
    return encoded_news

In [3]:
texts = get_data('train.csv')

In [4]:
a = time.time()
cleaned_news, char_set, avg_count, total_char_tokens = clean(texts, False)
b= time.time()
print(b-a)

158.09116315841675


In [5]:
del(texts)

In [6]:
a = time.time()
clustered_news = reform_sentences(cleaned_news,1500)
b= time.time()
print(b-a)

32.47899913787842


In [7]:
print(total_char_tokens, len(char_set))

162686231 43


In [8]:
del(cleaned_news)
del(avg_count)
del(total_char_tokens)

In [9]:
char2index, index2char = map_chars2index(char_set)

In [11]:
one_hot_encoder = [[0 for i in range(len(char2index))] for j in range(len(char2index))]
for i, row in enumerate(char2index):
    one_hot_encoder[i][i] = 1
    one_hot_encoder[i] = np.asarray(one_hot_encoder[i])

In [12]:
a = time.time()
encoded_news = char_level_converter(clustered_news,one_hot_encoder,char2index)
b = time.time()
print(b-a)

20.320264101028442


In [13]:
del(clustered_news)

In [14]:
len(encoded_news)

108458

In [15]:
encoded_news = encoded_news[:108456]

In [17]:
t1 = time.time()
X_data, y_data = [], []
for index,sentence in enumerate(encoded_news):
    X_data.append(sentence[:-1])
    y_data.append(sentence[1:])
t2 = time.time()
print(t2-t1)

8.222413063049316


In [18]:
X_data = np.array(X_data)
y_data = np.array(y_data)

In [19]:
X_data.shape

(108456, 1499, 43)

In [3]:
inputs = keras.Input(shape=(None,len(char_set)), name = "X_data")
layer, first_hidden_state, first_cell_state = LSTM(365,return_sequences=True,return_state = True)(inputs, initial_state=None)
layer = Activation('elu')(layer)
layer, second_hidden_state, second_cell_state = LSTM(365,return_sequences=True,return_state=True)(layer, initial_state=None)
layer = Activation('elu')(layer)
layer = Dense(len(char_set))(layer)
prediction = Activation('softmax', name = 'y_pred')(layer)
model = Model(inputs, [prediction,first_hidden_state, first_cell_state,second_hidden_state, second_cell_state])
model.load_weights('weights-improvement-04-1.1540.hdf5')
model.compile(optimizer="adam", loss={'y_pred':'categorical_crossentropy'})
model.summary()

NameError: name 'char_set' is not defined

In [44]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [45]:
model.fit(X_data,y_data,epochs=10, batch_size=32,verbose = 1, callbacks=callbacks_list)

Epoch 1/10
Epoch 00001: loss improved from inf to 1.64349, saving model to weights-improvement-01-1.6435.hdf5
Epoch 2/10
Epoch 00002: loss improved from 1.64349 to 1.26933, saving model to weights-improvement-02-1.2693.hdf5
Epoch 3/10
Epoch 00003: loss improved from 1.26933 to 1.19339, saving model to weights-improvement-03-1.1934.hdf5
Epoch 4/10
Epoch 00004: loss improved from 1.19339 to 1.15397, saving model to weights-improvement-04-1.1540.hdf5
Epoch 5/10
 160/3390 [>.............................] - ETA: 4:19:10 - loss: 1.1381 - y_pred_loss: 1.1381

KeyboardInterrupt: 