In [1]:
import pandas as pd
import os
import numpy as np
from keras import utils

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
fasttext_model = KeyedVectors.load_word2vec_format('drive/My Drive/Data/wiki.ar.vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# read data 
data_path = '/content/drive/My Drive/'
stu_answers= pd.read_csv(os.path.join(data_path, 'stu-answers.csv'), encoding='utf-8')
stu_answers = stu_answers.replace(np.nan, '', regex=True)
stu_answers

Unnamed: 0,question,stu_answer,grade
0,18,ز,0
1,18,جبرائيل,1
2,18,جبريل,2
3,18,جبريل عليه السلام,2
4,18,سيدنا جبريل عليه السلام,2
...,...,...,...
1255,1,الجنه,2
1256,1,جزاءه الجنة و رضى الله,2
1257,1,"‏كما قال الرسول صلى الله عليه وسلم : "" صبرا آل...",2
1258,1,جزاءه الجنة,2


In [5]:
# preprocessing 


import nltk
nltk.download('stopwords')
# stop words
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
nltk.download('wordnet')

from nltk.stem.arlstem import ARLSTem
stemmmer = ARLSTem()

def remove_stowords(elements):
  corps = []
  for string in elements :
    string = string.strip()
    string = string.split()
    string = [ stemmmer.stem(word) for word in string if not word in arb_stopwords ]
    string = ' '.join(string)
    corps.append(string)
  return corps

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
answers = stu_answers['stu_answer'].tolist()
scores = stu_answers['grade'].tolist()
scores = utils.to_categorical(scores)
corps = remove_stowords(answers)
scores.shape,len(corps)

((1260, 3), 1260)

In [7]:
fasttext_model.most_similar('جبريل')

  if np.issubdtype(vec.dtype, np.int):


[('بجبريل', 0.8251434564590454),
 ('وجبريل', 0.7967433333396912),
 ('جبريل،', 0.7817331552505493),
 ('جبريلُ', 0.7257490158081055),
 ('لجبريل', 0.7167991399765015),
 ('جبرئيل', 0.5911530256271362),
 ('الرجوب،', 0.5503920912742615),
 ('فسجد', 0.5416853427886963),
 ('الرجوب', 0.535503625869751),
 ('بالنبي', 0.5288292169570923)]

In [8]:

# tokenization
from keras.preprocessing.text import Tokenizer,text_to_word_sequence , one_hot , text_to_word_sequence
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(filters=''''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''''' )
tokenizer.fit_on_texts(corps)
sequences = tokenizer.texts_to_sequences(corps)
max_sequence_length = max(len(s) for s in sequences)
sequences = pad_sequences(sequences,max_sequence_length)
word2idx = tokenizer.word_index
vocab_size = len(word2idx) + 1

# word embedding
from keras.layers import Embedding
import numpy as np
EMBEDDING_DIM = 300
num_words = len(word2idx) + 1
# prepare embedding matrix
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, idx in word2idx.items():
    if (word in fasttext_model) :
        embedding_matrix[idx] = fasttext_model.get_vector(word)
    else :
      #embedding_matrix[idx] = fasttext_model.get_vector("unk")
      print("  word not exist in voca ---> " + word)    


# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)


  word not exist in voca ---> جهري
  word not exist in voca ---> فاصدع
  word not exist in voca ---> 3
  word not exist in voca ---> 13
  word not exist in voca ---> ؤمر
  word not exist in voca ---> تعديب
  word not exist in voca ---> جزاؤ
  word not exist in voca ---> اهنة
  word not exist in voca ---> يذء
  word not exist in voca ---> رضو
  word not exist in voca ---> بالل
  word not exist in voca ---> 13سن
  word not exist in voca ---> 3سنو
  word not exist in voca ---> سخري
  word not exist in voca ---> بمتل
  word not exist in voca ---> تتك
  word not exist in voca ---> بمك
  word not exist in voca ---> 3سن
  word not exist in voca ---> ندو
  word not exist in voca ---> فموعد
  word not exist in voca ---> بدعو
  word not exist in voca ---> اقربين
  word not exist in voca ---> عرضو
  word not exist in voca ---> 4
  word not exist in voca ---> اثن
  word not exist in voca ---> 3افراد
  word not exist in voca ---> بجرا
  word not exist in voca ---> قتصر
  word not exist in voca --->

In [None]:

# train model
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
print('Build model...')
model = Sequential()
model.add(embedding_layer)
#model.add(Embedding(vocab_size,50))
model.add(LSTM(16, activation='relu'))
model.add(Dense(3, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(sequences, scores,
          batch_size=1, epochs=100)                           
model.save('islamic_model.h5')

Epoch 2/100
Epoch 3/100

In [None]:

def preprocces_input(input_ans):
  input_ans = remove_stowords(input_ans)
  input_ans = tokenizer.texts_to_sequences(input_ans)
  input_seq= pad_sequences(input_ans, maxlen=max_sequence_length)
  return input_seq

def predict(input_ans) :
  input_ans = [input_ans]
  input_ans = preprocces_input(input_ans)
  pred = model.predict_classes(input_ans)
  return pred[0]

for ans in stu_answers['stu_answer'].tolist() :
  print(ans, predict(input_ans))

AttributeError: ignored