In [None]:
import numpy as np
import pandas as pd
from transformers import GPT2TokenizerFast
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers, models, optimizers, callbacks
import keras
import os
import re

In [None]:
# Hugging Face model id; only its tokenizer is loaded here and used by the later cells.
model_name = "aubmindlab/aragpt2-base"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

In [None]:
# Load the corpus; later cells read its 'text' column.
df = pd.read_csv('data.csv')
print(df.shape)

In [None]:
# df = df[:10]

In [None]:
# Hold out 10% of the 'text' column for validation; the fixed random_state keeps the split reproducible.
X_tr, X_val = train_test_split(df['text'], test_size=0.1, random_state=42)

In [None]:
class Text:
    """A piece of text paired with its subword token ids.

    Two construction modes are supported:

    * ``decode=False`` (default): ``input_text`` is a string. Unless
      ``predict`` is true it is cleaned by :meth:`preprocess`, then encoded
      with ``tokenizer.encode``.
    * ``decode=True``: ``input_text`` is a sequence of token ids that is
      turned back into a string with ``tokenizer.decode``.

    Attributes:
        tokenizer: the tokenizer object passed to the constructor.
        predict: True when preprocessing is skipped before encoding.
        content: the text (cleaned when preprocessing ran, decoded when
            ``decode=True``).
        indexed_tokens: token ids corresponding to ``content``.
    """

    def __init__(self, input_text, tokenizer, predict=False, decode=False):
        """Build the object by encoding a string or decoding token ids.

        Args:
            input_text: a string, or a sequence of token ids when ``decode`` is true.
            tokenizer: object exposing ``encode(str)`` and ``decode(ids)``.
            predict: when true, encode ``input_text`` as-is (no cleaning).
            decode: when true, treat ``input_text`` as token ids to decode.
        """
        # Use the tokenizer supplied by the caller rather than a module-level global.
        self.tokenizer = tokenizer
        # Set in both modes so the attribute always exists.
        self.predict = predict
        if decode:
            self.content = tokenizer.decode(input_text)
            self.indexed_tokens = input_text
        else:
            self.content = input_text
            self.indexed_tokens = self.tokenize()

    def __repr__(self):
        return self.content

    def __len__(self):
        """Number of token ids."""
        return len(self.indexed_tokens)

    def preprocess(self):
        """Strip punctuation, digits, Latin letters and diacritics from ``content``.

        The intermediate result is kept in ``content_preprocess``; ``content``
        is replaced by the cleaned text with whitespace runs collapsed to a
        single space.
        """
        # NOTE(review): the ``^`` sits outside the character class, so this only
        # replaces one U+FE70..U+FEFF character at the very start of the string
        # (e.g. a leading U+FEFF). Confirm whether a global removal was intended.
        self.content_preprocess = re.sub("^[\uFE70-\uFEFF]", " ", self.content)
        # NOTE(review): ``%-/`` inside the class is a range (U+0025..U+002F), so
        # characters such as & * + are removed as well. Confirm this is intended.
        self.content_preprocess = re.sub(r"[.،\"()0-9:A-Za-z,!%-/؟'ّ»ـ»'ً«'ُ'ْ'َ'ٍ{}؛'ِ'ٌ…\\|\xad”@_?<>’“\]\[éà=‘]","",self.content_preprocess)

        # split() without arguments drops empty pieces and surrounding
        # whitespace, so joining on a single space normalises all spacing.
        self.content = " ".join(self.content_preprocess.split())

    def tokenize(self):
        """Return the token ids of ``content``, cleaning it first unless ``predict`` is set."""
        if not self.predict:
            self.preprocess()
        return self.tokenizer.encode(self.content)

    def tokens_info(self):
        """Print the number of token ids."""
        print('total tokens: %d' % (len(self.indexed_tokens)))

In [None]:
class Sequences():
    """Next-subword training examples built from one tokenized text.

    Every prefix ``tokens[:i+1]`` (for ``i`` starting at 1 and advancing by
    ``step``) is left-padded/left-truncated to ``max_len`` ids. The first
    ``max_len - 1`` ids of each row form the model input and the last id is
    the target.

    Attributes:
        tokens_ind: token ids taken from ``text_object.indexed_tokens``.
        max_len: padded length of every prefix (input length + 1 target).
        step: stride between successive prefix end positions.
        sequences: 2-D array of inputs, ``max_len - 1`` columns.
        next_subwords: 1-D array of target ids, one per row of ``sequences``.
    """

    def __init__(self, text_object, max_len, step):
        self.tokens_ind = text_object.indexed_tokens
        self.max_len = max_len
        self.step = step
        self.sequences, self.next_subwords = self.create_sequences()

    def __repr__(self):
        return 'Sequence object of max_len: %d and step: %d' % (self.max_len, self.step)

    def __len__(self):
        """Number of (input, target) examples."""
        return len(self.sequences)

    def create_sequences(self):
        """Return ``(inputs, targets)`` arrays built from the token prefixes."""
        # Prefixes of length >= 2 so each one has at least one input id and a target.
        prefixes = [
            self.tokens_ind[:end + 1]
            for end in range(1, len(self.tokens_ind), self.step)
        ]

        # Pad to this instance's max_len (not a module-level global) so the
        # constructor argument is what actually determines the row width.
        padded = pad_sequences(prefixes, maxlen=self.max_len, padding='pre')

        # Last column is the subword to predict; everything before it is the input.
        return padded[:, :-1], padded[:, -1]

    def sequences_info(self):
        """Print how many examples were produced and their padded length."""
        print('number of sequences of length %d: %d' % (self.max_len, len(self.sequences)))

In [None]:
# max_len: padded length of each token prefix (max_len - 1 input ids plus 1 target id).
# step: stride between the end positions of successive prefixes.
max_len = 20
step = 1

In [None]:
# Flattened training examples across all training texts:
# one input row and one target id per example.
train_sequences = []
train_next_subwords = []

for line in X_tr:
    # Clean + tokenize the line, then expand it into padded prefix examples.
    line_encoded = Text(line, tokenizer)
    line_sequences = Sequences(line_encoded, max_len, step)
    train_sequences.extend(line_sequences.sequences.tolist())
    train_next_subwords.extend(line_sequences.next_subwords.tolist())

In [None]:
# Flattened validation examples across all held-out texts:
# one input row and one target id per example.
val_sequences = []
val_next_subwords = []

for line in X_val:
    # Clean + tokenize the line, then expand it into padded prefix examples.
    line_encoded = Text(line, tokenizer)
    line_sequences = Sequences(line_encoded, max_len, step)
    val_sequences.extend(line_sequences.sequences.tolist())
    val_next_subwords.extend(line_sequences.next_subwords.tolist())

In [None]:
class TextDataGenerator(keras.utils.all_utils.Sequence):
    """Batch generator yielding ``(inputs, one-hot targets)`` pairs.

    Args:
        sequences: list of input id rows.
        next_subwords: list of target ids, aligned with ``sequences``.
        sequence_length: stored on the instance; not used by the batching logic here.
        vocab_size: number of classes for the one-hot targets.
        batch_size: examples per batch; a trailing partial batch is dropped.
        shuffle: boolean flag; when true the example order is reshuffled
            at construction and after every epoch.
    """

    def __init__(self, sequences, next_subwords, sequence_length, vocab_size, batch_size=32, shuffle=True):
        self.sequences = sequences
        self.next_subwords = next_subwords
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Build the initial (optionally shuffled) index order.
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch."""
        return len(self.sequences) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        start = index * self.batch_size
        batch_ids = self.indexes[start:start + self.batch_size]

        X = np.array([self.sequences[k] for k in batch_ids])
        # Targets are expanded to one-hot rows of width vocab_size.
        y = keras.utils.np_utils.to_categorical(
            [self.next_subwords[k] for k in batch_ids],
            num_classes=self.vocab_size,
        )

        return X, y

    def on_epoch_end(self):
        """Reset the example order, shuffling it when requested."""
        self.indexes = np.arange(len(self.sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:
batch_size = 512

# Keyword arguments shared by the training and validation generators.
# NOTE(review): vocab_size is tokenizer.vocab_size + 1; presumably the extra
# slot is meant for padding, confirm against the tokenizer's id range.
params = dict(
    sequence_length=max_len,
    vocab_size=tokenizer.vocab_size + 1,
    batch_size=batch_size,
    shuffle=True,
)

train_generator = TextDataGenerator(train_sequences, train_next_subwords, **params)
val_generator = TextDataGenerator(val_sequences, val_next_subwords, **params)

In [None]:
def LSTM_model(sequence_length, vocab_size, layer_size):
    """Build an (uncompiled) Embedding -> LSTM -> softmax next-subword model.

    Args:
        sequence_length: padded example length; the model input has
            ``sequence_length - 1`` ids (the last id is the target).
        vocab_size: size of the embedding table and of the softmax output.
        layer_size: number of LSTM units.

    Returns:
        A Keras ``Sequential`` model.
    """
    return Sequential([
        Embedding(vocab_size, 128, input_length=sequence_length - 1, trainable=True),
        # Dropout options (recurrent_dropout=0.1, dropout=0.1) are currently not enabled.
        LSTM(layer_size),
        Dense(vocab_size, activation='softmax'),
    ])

In [None]:
# Vocabulary size mirrors the value given to the data generators so the
# softmax width matches the one-hot targets.
model = LSTM_model(
    sequence_length=max_len,
    vocab_size=tokenizer.vocab_size + 1,
    layer_size=256,
)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; Keras substitutes the epoch number for "{epoch}"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) using the path template above
checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Train for 20 epochs, writing a checkpoint through the callback and
# evaluating on the validation generator.
model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=20,
    callbacks=[checkpoint_callback],
    validation_data=val_generator,
)

In [None]:
def generate_seq(model, seq_length, seed_text, n_subwords):
    """Greedily extend ``seed_text`` by ``n_subwords`` predicted subwords.

    Args:
        model: model exposing ``predict`` that returns one score per vocabulary id.
        seq_length: padded example length used in training; the model is fed
            the last ``seq_length - 1`` ids.
        seed_text: prompt string; encoded without preprocessing.
        n_subwords: number of subwords to append.

    Returns:
        The decoded text of the prompt followed by the generated subwords.
    """
    token_ids = Text(seed_text, tokenizer, predict=True).indexed_tokens
    window = seq_length - 1

    for _ in range(n_subwords):
        # Keep only the most recent `window` ids (left-padded when shorter).
        model_input = pad_sequences([token_ids], maxlen=window, truncating='pre')
        scores = model.predict(model_input, verbose=0)
        # Greedy choice: the highest-scoring id becomes the next subword.
        token_ids.append(np.argmax(scores, axis=1)[0])

    return Text(token_ids, tokenizer, decode=True).content

In [None]:
# Seed prompt; ask the trained model to append 40 subwords to it and print the result.
txt = 'قررت المحكمة'
print(generate_seq(model, max_len, txt, 40))