In [1]:
import pandas as pd
import numpy as np
import keras
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the preprocessed tweets from a tab-separated, UTF-8 encoded file.
df = pd.read_csv(
    '../PreprocessedData/preprocessed_data.csv',
    sep='\t',
    encoding='utf-8',
)

In [3]:
# Peek at five randomly chosen rows of the freshly loaded frame.
df.sample(n=5)

Unnamed: 0,id,text,dialect,preprocessed_text
7565,980073078909632384,@war7 مظغوط بويمن انفضحت ههههه,IQ,[مستخدم] مظغوط بويمن انفضحت هه
71742,1126485283833249792,@Nor__201 الشرهة واللوم على اللي يدفعون.,QA,[مستخدم] الشرهة واللوم على اللي يدفعون .
74632,1078416878035484800,@moazot2015 @imankais1 هل هذا يعقل؟ ع قولة معز...,QA,[مستخدم] [مستخدم] هل هذا يعقل ؟ ع قولة معزبكم ...
193426,842385563193905152,عزيزتي البنت \nقبل ماتطلبين طلب من اخوك قولي ل...,SA,عزيزتي البنت قبل ماتطلبين طلب من اخوك قولي له ...
419286,897409645056327680,@Bash9987 سؤال انته بكامل قواك العقليه؟!اذا كل...,AE,[مستخدم] سؤال انته بكامل قواك العقليه ؟ ! اذا ...


In [4]:
# Remove rows whose 'preprocessed_text' is missing before they reach the
# tokenizer. Note: this modifies df in place.
df.dropna(
    subset=['preprocessed_text'],
    inplace=True,
)

In [5]:
# Identifier of the pretrained checkpoint; only its tokenizer is loaded here.
model_name = "aubmindlab/bert-base-arabertv02-twitter"
arabert_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

In [6]:
def tokenize(text):
    """Return the token ids that ``arabert_tokenizer`` produces for ``text``.

    The text is handed to ``arabert_tokenizer.encode`` with that method's
    default arguments and its result is returned unchanged.
    """
    return arabert_tokenizer.encode(text)

In [7]:
# Tokenize every preprocessed tweet; the function is passed directly since a
# lambda wrapper around it adds nothing.
df['tokenized_text'] = df['preprocessed_text'].apply(tokenize)

In [8]:
# The longest tokenized tweet defines the common sequence length.
max_len = max(len(token_ids) for token_ids in df['tokenized_text'])

# Left-pad every id list with the tokenizer's pad id so that all rows end up
# with exactly max_len entries (padding goes in front of the real tokens).
df['indexed_text'] = [
    [arabert_tokenizer.pad_token_id] * (max_len - len(token_ids)) + token_ids
    for token_ids in df['tokenized_text']
]

In [9]:
# Replace the 'dialect' labels with the integer codes produced by a
# LabelEncoder. The column is overwritten; `le` stays in scope and holds the
# fitted label-to-code mapping.
le = LabelEncoder()
df['dialect'] = le.fit_transform(df['dialect'])

In [10]:
# Spot-check five random rows now that the token-id and label columns exist.
df.sample(n=5)

Unnamed: 0,id,text,dialect,preprocessed_text,tokenized_text,indexed_text
367007,1079201073137680384,@Q8Ping @latifaalsaeedan @danaalthuwaikh الله ...,6,[مستخدم] [مستخدم] [مستخدم] الله يخليكم تعلموا ...,"[2, 64, 8465, 66, 64, 8465, 66, 64, 8465, 66, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
123550,499858668712513600,"بيحكو عنك ,,, وهما ناسيين حالهم ,, \n\nاتذكر ح...",11,"بيحكو عنك , , وهما ناسيين حالهم , , اتذكر حالك...","[2, 1268, 16907, 185, 21619, 18, 18, 7416, 754...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
136690,845236338198806528,شو الواحد بدو يعلق على هاد الغبي https://t.co/...,15,شو الواحد بدو يعلق على هاد الغبي [رابط],"[2, 9016, 4332, 41954, 19306, 323, 9237, 580, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
338424,1040596257365024768,@Hameeda_W حسسني انهم علماء ورواة احاديث ومتخص...,6,[مستخدم] حسسني انهم علماء ورواة احاديث ومتخصصي...,"[2, 64, 8465, 66, 63202, 5014, 6838, 5844, 761...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
75548,968320729426092032,@Fares_s44 @DemaQtr @hassanalishaq73 @neno_515...,12,[مستخدم] [مستخدم] [مستخدم] [مستخدم] ياخي سبحان...,"[2, 64, 8465, 66, 64, 8465, 66, 64, 8465, 66, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Hold out 10% of the rows for validation, stratified on the encoded dialect;
# the fixed random_state makes the split repeatable.
X_tr, X_val = train_test_split(
    df,
    test_size=0.1,
    stratify=df['dialect'],
    random_state=42,
)

In [12]:
# (rows, columns) of the training and validation frames.
(X_tr.shape, X_val.shape)

((412342, 6), (45816, 6))

In [13]:
# Renumber both splits 0..n-1 and discard the old row labels.
# drop=True does this in one step: the previous index is never materialised
# as a column, and it also works when the index carries a name (in which case
# the generated column would not be called 'index').
X_tr = X_tr.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

In [14]:
class TextDataGenerator(keras.utils.all_utils.Sequence):
    """Keras ``Sequence`` yielding (token-id batch, one-hot label batch) pairs.

    Parameters
    ----------
    sequences : indexable collection
        One token-id list per sample. Items are always looked up by position,
        so a pandas Series with a non-default index behaves like a list.
    preds : indexable collection
        Integer class label per sample, aligned with ``sequences``.
    sequence_length : int
        Stored on the instance; not used by the generator itself.
    vocab_size : int
        Stored on the instance; not used by the generator itself.
    num_classes : int
        Width of the one-hot label vectors.
    batch_size : int, default 32
        Number of samples per batch; must be positive.
    shuffle : bool, default True
        When truthy, a new random sample order is drawn at construction time
        and every time ``on_epoch_end`` is called.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """

    def __init__(self, sequences, preds, sequence_length, vocab_size, num_classes, batch_size=32, shuffle=True):
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")
        self.batch_size = batch_size
        self.sequences = sequences
        self.preds = preds
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.shuffle = shuffle
        # Build the initial sample order right away.
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, counting a trailing partial batch."""
        # Ceiling division: flooring would silently drop the last
        # len % batch_size samples and report 0 batches for small datasets.
        return -(-len(self.sequences) // self.batch_size)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)``.

        ``X`` is ``np.array`` of the selected token-id lists and ``y`` is the
        one-hot encoding (``num_classes`` wide) of the selected labels. The
        final batch of an epoch may hold fewer than ``batch_size`` samples.
        """
        batch_positions = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]

        X = np.array(self._take(self.sequences, batch_positions))
        y = keras.utils.np_utils.to_categorical(
            self._take(self.preds, batch_positions),
            num_classes=self.num_classes,
        )

        return X, y

    def on_epoch_end(self):
        """Rebuild the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    @staticmethod
    def _take(data, positions):
        """Return the items of ``data`` at the given integer positions as a list."""
        # Plain [] on a pandas Series is label-based and only coincides with
        # positions for a 0..n-1 index; .iloc is positional. Objects without
        # .iloc (lists, arrays) are indexed directly.
        positional = getattr(data, 'iloc', data)
        return [positional[k] for k in positions]

In [15]:
batch_size = 512
# Take the class count from the fitted label encoder rather than hard-coding
# it, so the one-hot width and the softmax layer always match the labels that
# are actually present in the data.
num_classes = len(le.classes_)

# Keyword arguments shared by the training and validation generators.
params = {
  'sequence_length': max_len,
  'vocab_size': arabert_tokenizer.vocab_size,
  'num_classes':num_classes,
  'batch_size': batch_size,
  'shuffle': True
}

train_generator = TextDataGenerator(X_tr['indexed_text'], X_tr['dialect'], **params)
# Validation batches are served in a fixed order: shuffling adds nothing to
# evaluation, and a stable order keeps val_loss comparable between epochs,
# which matters when a checkpoint is selected on it.
val_generator = TextDataGenerator(
    X_val['indexed_text'],
    X_val['dialect'],
    **{**params, 'shuffle': False},
)

In [16]:
def LSTM_model(sequence_length, vocab_size, num_classes, layer_size):
    """Build an uncompiled Embedding -> LSTM -> softmax classifier.

    Args:
        sequence_length: passed to the Embedding layer as ``input_length``.
        vocab_size: first argument of the Embedding layer (input dimension).
        num_classes: number of units in the softmax output layer.
        layer_size: number of units in the LSTM layer.

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    embedding = Embedding(vocab_size, 128, input_length=sequence_length, trainable=True)
    # Optional, currently not applied: recurrent_dropout=0.1, dropout=0.1
    recurrent = LSTM(layer_size)
    classifier = Dense(num_classes, activation='softmax')
    return Sequential([embedding, recurrent, classifier])

In [17]:
# Build the classifier with a 128-unit LSTM and compile it for multi-class
# training on one-hot labels, reporting accuracy as 'acc'.
model = LSTM_model(
    sequence_length=max_len,
    vocab_size=arabert_tokenizer.vocab_size,
    num_classes=num_classes,
    layer_size=128,
)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [18]:
# Write to the checkpoint file only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    '../models/lstm_wts.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

In [19]:
# Train for four epochs, evaluating on the validation generator after each
# epoch and letting the checkpoint callback decide when to save.
model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=4,
    steps_per_epoch=len(train_generator),
    callbacks=[checkpoint],
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2a88e7f1c48>