In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from preprocessing_utlis import normalize

import pickle

# Load Cleaned Data

In [13]:
df_clean = pd.read_csv('./Names web dataset/all_data_clean.csv')

# Splitting Data

In [14]:
# shuffle Data
df_clean = shuffle(df_clean)

train, test = train_test_split(df_clean, test_size=0.3,random_state=42, stratify=df_clean['label'])

In [15]:
vocab_sz = 800
tok = Tokenizer(num_words=vocab_sz, oov_token='UNK')
tok.fit_on_texts(train['clean_tweet'])

In [17]:
X_train = tok.texts_to_sequences(train['Names'])
X_test = tok.texts_to_sequences(test['Names'])


In [18]:
maxlen = max([len(t) for t in train['Names']])
maxlen

26

In [19]:
X_train_padded = np.array(pad_sequences(X_train,
                          maxlen=maxlen,
                          padding='post',
                          truncating='post'))

X_test_padded = np.array(pad_sequences(X_test,
                          maxlen=maxlen,
                          padding='post',
                          truncating='post'))

X_train_padded.shape, X_test_padded.shape

((3620, 26), (1552, 26))

In [20]:
y_train = np.asarray(train['label']).astype('float32')
y_test = np.asarray(test['label']).astype('float32')

In [21]:
embedding_size = 100

model = Sequential()
model.add(Embedding(vocab_sz+1, embedding_size, mask_zero=True, input_length=maxlen))
model.add(Bidirectional(LSTM(units = 32)))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [22]:
callbacks_lst = [EarlyStopping(monitor='val_accuracy', mode='max', patience=2)]
# Training
batch_size = 128
epochs = 5
print('Train...')
history = model.fit(X_train_padded, y_train,
          batch_size=batch_size,
          epochs=epochs,          
          validation_data=(X_test_padded, y_test),
        #   callbacks=callbacks_lst
                   )

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
class_names = ['Fake Name' , 'Valid Name']
def classify(sentence):

  sentence = [normalize(sent) for sent in sentence]

  sequence = tok.texts_to_sequences(sentence)
  sequence = pad_sequences(sequence, maxlen=maxlen, padding='post', truncating='post')
  pred = model.predict(sequence)[0][0]
  print(class_names[np.round(pred).astype('int')], pred)

In [29]:
tst_sent = ['محمد السعيد خليفه']
classify(tst_sent)
classify(['باسمم وحةد السد'])
classify(['باسم وحيد السيد'])
classify(['يشسبة كخسيشبىن سيبش'])

Valid Name 0.98991346
Fake Name 0.07966373
Valid Name 0.99031186
Fake Name 0.07966373


In [30]:
model.save('../saved_model/tria2.h5')
with open('../saved_model/tokenizer2.pickle', 'wb') as f:
    pickle.dump(tok, f, protocol=pickle.HIGHEST_PROTOCOL)
