In [1]:
def echo_log(action, text=""):
    """Print a log line with `action` emphasised, followed by `text`.

    `action` is wrapped in the ANSI escape codes 4 (underline) and 1 (bold)
    and then reset; `text` follows after a single space.

    Args:
        action: Short label for the step being logged (must be a str).
        text: Optional detail message (must be a str). Defaults to "".
    """
    emphasis_on = "\033[4m\033[1m"
    emphasis_off = "\033[0m\033[0m "
    # str.join (like the `+` concatenation it replaces) rejects non-str parts.
    print("".join((emphasis_on, action, emphasis_off, text)))

#=======================================================================
#   Import Modules
#=======================================================================
import os
from libs import DataCleaner, Seq2SeqModel, TextSummaryWordLevelDataGenerator, CharacterLevelTokenizer
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd 
import warnings
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Show up to 200 characters per column when a DataFrame is displayed.
pd.options.display.max_colwidth = 200
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
#=======================================================================
#   Configuration (lengths are measured in characters)
#=======================================================================
max_text_len = 256      # upper bound on the length of an input text
max_summary_len = 128   # upper bound on the length of a summary
batch_size = 60         # batch size handed to the data generators
#=======================================================================

In [3]:
#=======================================================================
#   Read the dataset and perform some preprocessing
#=======================================================================
prepared_dataset_path = 'text_summary_prepared.csv'
echo_log("Searching...", f"for prepared data set on path: {prepared_dataset_path}")
if not os.path.exists(prepared_dataset_path):
    # No cached copy yet: build it from the raw CSV and store it for later runs.
    echo_log("Not Found.", f"prepared dataset not exist on: {prepared_dataset_path}")
    echo_log("reading default dateset", "on text_summary.csv")

    # Read the first 1000 rows and keep only those whose raw text and summary
    # are short enough. Two characters of the summary budget are held back
    # (presumably for start/end markers -- TODO confirm).
    df = (
        pd.read_csv("text_summary.csv", nrows=1000)
        .loc[lambda frame: (frame['text'].str.len() < max_text_len)
                           & (frame['summary'].str.len() < max_summary_len - 2)]
    )

    echo_log("Begin Cleaning")
    # Remove rows with a repeated text, then rows with any missing value.
    df = df.drop_duplicates(subset=['text']).dropna(axis=0)

    echo_log("cleaning ......")
    # Preprocessing: both columns go through DataCleaner.clean; only the
    # summary call passes remove_stop_words=False.
    df = df.assign(
        text=DataCleaner.clean(df['text'], uniform_arabic_characters=True),
        summary=DataCleaner.clean(df['summary'], remove_stop_words=False,
                                  uniform_arabic_characters=True),
    )

    # Values that became empty strings are treated as missing and dropped.
    df = df.replace('', np.nan).dropna(axis=0)

    df.to_csv(prepared_dataset_path, index=False)
    echo_log("Successfully Cleaned")
else:
    # A prepared CSV from an earlier run exists: load it as-is. The length
    # filter and cleaning above are not re-applied on this path.
    echo_log("Exist.", f"prepared dataset exist on: {prepared_dataset_path}")
    echo_log("reading dateset")
    df = pd.read_csv(prepared_dataset_path)

[4m[1mSearching...[0m[0m for prepared data set on path: text_summary_prepared.csv
[4m[1mExist.[0m[0m prepared dataset exist on: text_summary_prepared.csv
[4m[1mreading dateset[0m[0m 


In [4]:
#=======================================================================
#   Initialize tokenizer, split dataset
#=======================================================================
tokenizer_path = "./outputs/tokenizer.json"
echo_log("Searching...", f"for saved tokenizer on path: {tokenizer_path}")
if os.path.exists(tokenizer_path):
    # Reuse the tokenizer saved by a previous run.
    echo_log("Loading tokenizer ...", f"from: {tokenizer_path}")
    tokenizer = CharacterLevelTokenizer.from_json(tokenizer_path)
else:
    echo_log("Init tokenizer and fit on texts ...")
    # Fit on every summary and every text in df (the full dataset, not only
    # the training split created below).
    tokenizer = CharacterLevelTokenizer()
    tokenizer.fit_on_texts(list(df['summary']) + list(df['text']))
    echo_log("Number of Vocab:", str(len(tokenizer.token_index)))

    # Make sure the target directory exists before saving, so a fresh
    # checkout without ./outputs does not fail at this step.
    os.makedirs(os.path.dirname(tokenizer_path) or ".", exist_ok=True)

    # Save tokenizer to a JSON file (log first, then write).
    echo_log("Saving tokenizer ...", f"json_file on => {tokenizer_path}")
    # The binding is kept in case other cells read it; note that it only
    # exists when this branch runs.
    tokenizer_json = tokenizer.to_json(tokenizer_path)


# Sequential 90/10 split: with shuffle=False the rows keep their order and the
# last 10% become the validation set. random_state only takes effect if
# shuffling is enabled.
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(df['text']), np.array(df['summary']),
    test_size=0.1, random_state=0, shuffle=False,
)

# Using data generator: one per split, sharing the tokenizer and the
# length / batch-size settings from the configuration cell.
train_gen = TextSummaryWordLevelDataGenerator(data_frame=pd.DataFrame({"text": x_tr, "summary": y_tr}), tokenizer=tokenizer,
                                               max_text_len=max_text_len, max_summary_len=max_summary_len, batch_size=batch_size)
valid_gen = TextSummaryWordLevelDataGenerator(data_frame=pd.DataFrame({"text": x_val, "summary": y_val}), tokenizer=tokenizer,
                                              max_text_len=max_text_len, max_summary_len=max_summary_len, batch_size=batch_size)

[4m[1mSearching...[0m[0m for saved tokenizer on path: ./outputs/tokenizer.json
[4m[1mLoading tokenizer ...[0m[0m from: ./outputs/tokenizer.json


In [5]:
# Sanity check: print a 30-character separator line, then the ids the
# tokenizer produces for a sample sentence and for the "<SOS>" marker.
print("=" * 30 + "\n", tokenizer.texts_to_sequences(["أهلا بالعالم", "<SOS>"]))

 [[3, 74, 71, 49, 5, 50, 49, 71, 66, 49, 71, 72], [1]]


In [7]:
#=======================================================================
#   Build, Compile, Train Model
#=======================================================================
# Stop once val_loss has not improved for 5 consecutive epochs, and roll the
# model back to the weights of the best epoch. Without restore_best_weights
# the model saved at the end of this cell would carry the weights of the last
# epoch, i.e. several epochs past the best validation loss.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                   restore_best_weights=True)

# One id more than the tokenizer has entries
# (presumably id 0 is reserved for padding -- TODO confirm).
vocab_size = len(tokenizer.token_index) + 1

# Input and output share the same tokenizer, hence the same vocabulary size.
model = Seq2SeqModel(input_vocab_size=vocab_size, output_vocab_size=vocab_size,
                     max_input_length=max_text_len, max_output_length=max_summary_len,
                     latent_dim=300, embedding_dim=100)
model.build_model()

model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# epochs is only an upper bound; the EarlyStopping callback is expected to end
# training well before it is reached.
model.fit_using_data_generator(train_generator=train_gen, checkpoints_saving_path='outputs/checkpoints',
                               validation_generator=valid_gen, callbacks=[es], epochs=3000)
model.save('outputs/s2s_model.keras')


Epoch 1/100
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m4:19[0m 86s/step - loss: 4.3811