### Installing pyarabic package

In [1]:
!pip install pyarabic



### Importing necessary Libraries 

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time
import glob
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional, BatchNormalization, Flatten, Reshape
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
tf.compat.v1.enable_eager_execution()

### Reading the data and Checking it

In [4]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,labels,data
0,8,أَنا الفقير وباللَه العظيم غني # لئن فقدتك في ...
1,10,وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْ...
2,11,فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ ا...
3,9,وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَا...
4,8,أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد...


In [14]:
df.shape

(20000, 2)

In [6]:
df.isna().sum()

labels    0
data      0
dtype: int64

In [7]:
df.labels.value_counts()

8     2016
7     1973
10    1972
9     1944
11    1927
2     1924
0     1924
13    1914
1     1908
4     1572
6      432
5      198
3      162
12     134
Name: labels, dtype: int64

### Helper function to clean the data

In [8]:
# Read, then decode for py2 compat.
def extract_data(df, thresh = 70, on_shatrs = False):
  global vocab
  
  text = ""
  
  X = []
  y = []
    
    
  # remove some exteranous chars 
  execluded = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
  out = ""
  for i in df["data"]:
    i = araby.strip_tashkeel(i)
    for char in i:
      if char in execluded:
        i = i.replace(char, "")
      
  i=0
  for line in df["data"]:
    if len(line) <= 1:
      continue

    label = int(df.labels[i])
    bait  = line.strip()
    if line.find("#"):
      shatrs = bait.split('#')
      for shatr in shatrs:
        X.append(shatr.strip())
        y.append(label)
    else:
      X.append(bait.strip())
      y.append(label)
  
  #create the vocab 
    i += 1
  vocab = sorted(set(' '.join(X)))  
  
  #shuffle the data 
  X, y = shuffle(X, y)
  return X, y

### Dividing the data into train and test sets

In [9]:
X, y = extract_data(df, True)

In [13]:
pd.Series(y).value_counts()

8     4032
7     3946
10    3944
9     3888
11    3854
2     3848
0     3848
13    3828
1     3816
4     3144
6      864
5      396
3      324
12     268
dtype: int64

In [10]:
X_train, X_valid , y_train, y_valid = train_test_split(X, y, test_size = 0.15, random_state = 41)

### Data pre-processing (Word Embedding)

In [15]:
# Creating a mapping from unique characters to indices
char2idx = {u:i+1 for i, u in enumerate(vocab)}

def to_sequences(X):
  X = [[char2idx[char] for char in line] for line in X]
  X = pad_sequences(X, padding='post', value=0, maxlen = 100)
  return X
 
X_train = to_sequences(X_train)
X_valid = to_sequences(X_valid)

y_train = np.array(y_train)
y_valid = np.array(y_valid)

## Deep Learning Part
### Building the model

In [16]:
model = Sequential()
model.add(Input((100,)))
model.add(Embedding(len(char2idx)+1, 256))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(14, activation = 'softmax'))
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 256)          14336     
                                                                 
 bidirectional (Bidirectiona  (None, 100, 512)         789504    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 512)         1182720   
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 512)              1182720   
 nal)                                                            
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                        

In [18]:
model(tf.zeros((10, 100))).shape

TensorShape([10, 14])

In [19]:
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001)]
callbacks += [tf.keras.callbacks.ModelCheckpoint('full_verse.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')]

### Training the Model

In [20]:
model.fit(X_train, y_train, validation_data= (X_valid, y_valid), epochs = 5, batch_size= 128, shuffle = True, callbacks=callbacks)

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.55667, saving model to full_verse.h5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.55667 to 0.82600, saving model to full_verse.h5
Epoch 3/5
Epoch 00003: val_accuracy improved from 0.82600 to 0.86867, saving model to full_verse.h5
Epoch 4/5
Epoch 00004: val_accuracy did not improve from 0.86867
Epoch 5/5
Epoch 00005: val_accuracy improved from 0.86867 to 0.88683, saving model to full_verse.h5


<keras.callbacks.History at 0x7f55a5140210>

### Saving the model

---



In [25]:
model = tf.keras.models.load_model('full_verse.h5')

### Helper Functions to preprocess the test data

In [26]:
with open('labels.txt', 'r') as f:
  label2name = f.readlines()
  label2name = [name.replace('\n', '') for name in label2name]

In [43]:
def processing(sent):
  sent = araby.strip_tashkeel(sent)
  execluded = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
  out = ""
  
  for char in sent:
    if char not in execluded:
      out += char
  bait  = out.strip()
  bait = bait.replace("#", "")
  return bait

### Predicting 

In [44]:
def classify(sentence):
#   sentence = process_review(sentence)
  sentence = processing(sentence)
  sequence = [char2idx[char] for char in sentence]
  sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)

  pred = model.predict(sequence)[0]
  return pred

In [46]:
df_test = pd.read_csv("test.csv")
df_submit = pd.read_csv("sample_submission.csv")
df_submit.head()

Unnamed: 0,id,labels
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [49]:
output = []
for i in range(0, df_test.shape[0]):
  out = classify(df_test["data"][i])
  output.append(out)

  #Printing the first 10 rows
  if i < 10:
    print(label2name[np.argmax(out, 0).astype('int')])

baseet
baseet
baseet
baseet
baseet
kamel
baseet
baseet
baseet
khafeef


In [57]:
for i in range(df_submit.shape[0]):
  df_submit.labels[i] = np.argmax(output[i])

In [58]:
df_submit.to_csv("submission.csv", index=False)