## Machathon 2.0 Qualification Round
### Arabic Poetry Meter

In [1]:
import os
import io
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
from random import shuffle
from pyarabic import araby
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
!pip install pyarabic



### Reading Data

In [2]:
# Load the training CSV; its first row is consumed as the header.
train_frame = pd.read_csv('../input/poemdatasett/train.csv', header=0)
# Round-trip through an in-memory, header-less CSV so the frame is re-read
# without any header row.
headerless_csv = train_frame.to_csv(header=False, index=False)
df = pd.read_csv(io.StringIO(headerless_csv), header=None)
# Write a space-separated text file with neither header nor index column.
df.to_csv("trainFile.txt", sep=' ', header=False, index=False, mode='w')

In [3]:
# Read the class names, one per line; a name's list position is used as its
# integer label id when labels are looked up later.
with open('../input/poemdatasett/labels.txt', 'r') as labels_file:
    data_labels = [raw_line.replace('\n', '') for raw_line in labels_file]

### Preprocessing Arabic Data

In [4]:
def splitting(path, thresh = 70, on_shatrs = False):
    """Load the labelled training file and return shuffled samples and labels.

    Every line longer than one character is parsed as ``"<int label> <text>"``.
    Before parsing, diacritics are stripped with ``araby.strip_tashkeel`` and a
    fixed set of punctuation / formatting characters is deleted from the text.

    Side effect: rebinds the module-level ``ALL_WORDS`` to the sorted list of
    distinct *characters* (despite the name) occurring in the samples once they
    are joined with single spaces.

    Args:
        path: Path of the text file to read.
        thresh: Unused; kept so existing callers keep working.
        on_shatrs: When True, each line's text is split on ``'#'`` and every
            piece becomes its own sample carrying the line's label. When False
            the whole text is a single sample.

    Returns:
        ``(X, y)`` exactly as returned by ``shuffle(X, y)``.

    Raises:
        ValueError: If a line longer than one character contains no space, or
            its first field is not an integer.
    """
    global ALL_WORDS

    # Characters deleted from the text (duplicates in the literal are harmless).
    removed = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'

    # Use a context manager so the handle is closed, and decode explicitly as
    # UTF-8 (the file is produced by DataFrame.to_csv, whose default encoding
    # is UTF-8) instead of relying on the platform's locale.
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # One translate() pass replaces the original char-by-char concatenation.
    deletion_table = str.maketrans('', '', removed)
    cleaned = araby.strip_tashkeel(raw_text).translate(deletion_table)

    X = []
    y = []
    for line in cleaned.split('\n'):
        if len(line) <= 1:
            # Blank (or single-character) line: nothing to parse.
            continue
        label_field, verse = line.split(' ', 1)
        label = int(label_field)
        verse = verse.strip()
        if on_shatrs:
            for shatr in verse.split('#'):
                X.append(shatr.strip())
                y.append(label)
        else:
            X.append(verse)
            y.append(label)

    # Character vocabulary, computed before shuffling (order-independent).
    ALL_WORDS = sorted(set(' '.join(X)))
    # No seed is passed, so the resulting order differs from run to run.
    X, y = shuffle(X, y)

    return X, y

In [5]:
# Load whole verses (no '#' split). The call also sets the ALL_WORDS global
# that the character-index mapping is built from.
X, y = splitting("./trainFile.txt", on_shatrs=False)

### Showing the First Five Abyat (Verses)

In [6]:
# Print the first five samples next to their meter names.
for row in range(5):
    verse, label_id = X[row], y[row]
    print(verse, ' ', data_labels[label_id])

"اذا هبط القوم المباغي سمت به # مآرب شتى ما يسف رفيعها"   mutakareb
"وانتقال الطبع شيء معوز # مثل ما أعوز لين الجلمد"   ramal
"وللهجاء نجوم # ترى سماءك أرضا"   mujtath
"مبتدا الحسن صيغ منها ومنها # فرق الحسن في جميع العباد"   khafeef
"لا توحش الوحدة أصحابها # إن سهيلا وحده فارد"   saree


### Splitting to train & validation data

In [7]:
# Hold out 18% of the samples for validation. random_state pins the split only
# for a given input order; X and y were shuffled earlier without a seed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.18,
    random_state=30,
)

### Converting Arabic Characters to Index

In [8]:
# Character -> integer id, numbered from 1 so that 0 stays free for padding.
convert_char_idx = {char: idx for idx, char in enumerate(ALL_WORDS, start=1)}

### Making Sequences

In [9]:
def padding(X):
    """Encode strings as character-id sequences of fixed length 100.

    Each character is looked up in the module-level ``convert_char_idx``; the
    resulting id lists are handed to ``pad_sequences`` with ``maxlen=100`` and
    zeros appended at the end ('post' padding).
    """
    encoded = []
    for line in X:
        encoded.append([convert_char_idx[char] for char in line])
    return pad_sequences(encoded, padding='post', value=0, maxlen = 100)

In [10]:
# Encode and pad both splits, then turn the label lists into NumPy arrays.
X_train, X_validation = padding(X_train), padding(X_validation)
y_train, y_validation = np.array(y_train), np.array(y_validation)

### Building The Model

In [11]:
# Character-level classifier: embedding -> four stacked bidirectional GRUs ->
# small dense head with dropout -> softmax over the meter classes.
classifier = Sequential([
    Input((100,)),
    # +1 because character ids start at 1 and 0 is the padding value.
    Embedding(len(convert_char_idx)+1, 256),
    Bidirectional(GRU(units = 512, return_sequences=True)),
    Bidirectional(GRU(units = 256, return_sequences=True)),
    Bidirectional(GRU(units = 256, return_sequences=True)),
    # Last recurrent layer returns only its final state.
    Bidirectional(GRU(units = 256)),
    Dense(64, activation = 'relu'),
    Dropout(0.3),
    Dense(len(data_labels), activation = 'softmax'),
])
# Labels are plain integer ids, hence the sparse cross-entropy loss.
classifier.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 256)          10240     
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 1024)         2365440   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 512)          1969152   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 512)          1182720   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               1182720   
_________________________________________________________________
dense (Dense)                (None, 64)                32832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0

In [13]:
# Sanity check: run a dummy batch of 10 all-zero sequences of length 100
# through the model and inspect the output shape.
classifier(tf.zeros((10, 100))).shape

TensorShape([10, 14])

In [14]:
CB = [
    # Multiply the learning rate by 0.1 when val_loss has not improved for
    # 2 epochs, never going below 1e-4.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=2,
        min_delta=0.0001,
        min_lr=0.0001,
    ),
    # Save to 'MAAN_Model' only when val_accuracy reaches a new maximum.
    tf.keras.callbacks.ModelCheckpoint(
        'MAAN_Model',
        monitor='val_accuracy',
        verbose=1,
        save_best_only=True,
        mode='max',
    ),
]

### Training The Model

In [15]:
# Train for 17 epochs, evaluating on the held-out split after each epoch.
classifier.fit(
    X_train,
    y_train,
    validation_data=(X_validation, y_validation),
    epochs=17,
    batch_size=64,
    shuffle=True,
    callbacks=CB,
)

Epoch 1/17

Epoch 00001: val_accuracy improved from -inf to 0.18278, saving model to MAAN_Model
Epoch 2/17

Epoch 00002: val_accuracy improved from 0.18278 to 0.34194, saving model to MAAN_Model
Epoch 3/17

Epoch 00003: val_accuracy improved from 0.34194 to 0.60500, saving model to MAAN_Model
Epoch 4/17

Epoch 00004: val_accuracy improved from 0.60500 to 0.74944, saving model to MAAN_Model
Epoch 5/17

Epoch 00005: val_accuracy improved from 0.74944 to 0.80556, saving model to MAAN_Model
Epoch 6/17

Epoch 00006: val_accuracy improved from 0.80556 to 0.84278, saving model to MAAN_Model
Epoch 7/17

Epoch 00007: val_accuracy improved from 0.84278 to 0.85694, saving model to MAAN_Model
Epoch 8/17

Epoch 00008: val_accuracy improved from 0.85694 to 0.87333, saving model to MAAN_Model
Epoch 9/17

Epoch 00009: val_accuracy improved from 0.87333 to 0.88528, saving model to MAAN_Model
Epoch 10/17

Epoch 00010: val_accuracy improved from 0.88528 to 0.89194, saving model to MAAN_Model
Epoch 11/17


<tensorflow.python.keras.callbacks.History at 0x7f9db46ad150>

### Saving The Model

In [16]:
# Save the model in its current (end-of-training) state. The best-val_accuracy
# checkpoint is written separately to 'MAAN_Model' by the ModelCheckpoint callback.
classifier.save('MAAN_Model_Final.h5')

### Loading Model

In [17]:
# Reload the model that was just saved, replacing the in-memory `classifier`.
classifier = tf.keras.models.load_model('MAAN_Model_Final.h5')

### Classifier's Preprocessing Function

In [18]:
def classifier_preprocess_data(text, thresh = 70, on_shatrs = False):
    """Clean one raw verse for prediction.

    Diacritics are stripped with ``araby.strip_tashkeel``, a fixed set of
    punctuation / formatting characters is deleted, and surrounding whitespace
    is trimmed.

    Args:
        text: The raw verse string.
        thresh: Unused; kept so existing callers keep working.
        on_shatrs: When True, only the part before the first ``'#'`` is
            returned (stripped); when False the whole cleaned verse is.

    Returns:
        The cleaned string.
    """
    # Characters deleted from the text (duplicates in the literal are harmless).
    removed = 'ـ!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'

    # One translate() pass replaces the original char-by-char concatenation.
    deletion_table = str.maketrans('', '', removed)
    cleaned = araby.strip_tashkeel(text).translate(deletion_table).strip()

    if on_shatrs:
        # split() always yields at least one piece, so index 0 is safe.
        return cleaned.split('#')[0].strip()
    return cleaned

In [19]:
# Map label id -> meter name. Guarded so that re-running the cell does not
# enumerate the dict's own keys and replace every name with its id.
if not isinstance(data_labels, dict):
    data_labels = {i: name for i, name in enumerate(data_labels)}

### Prediction Function

In [20]:
def main_classify_func(sentence):
    """Predict the meter class id for one verse and print its name and score.

    The verse is cleaned with ``classifier_preprocess_data``, encoded with the
    module-level ``convert_char_idx``, padded with trailing zeros to the width
    of ``X_train`` and passed to ``classifier.predict``.

    Args:
        sentence: The raw verse string.

    Returns:
        The index of the highest-scoring class.
    """
    sentence = classifier_preprocess_data(sentence, on_shatrs=False)
    # Skip characters that are not in the training vocabulary; the original
    # lookup raised KeyError on the first such character.
    sequence = [convert_char_idx[char] for char in sentence if char in convert_char_idx]
    sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)
    pred = classifier.predict(sequence)[0]
    # Compute the winning class once and reuse it for both print and return.
    label_id = np.argmax(pred, 0).astype('int')
    print(data_labels[label_id], np.max(pred))
    return label_id

### Reading and Classifying Test Data

In [21]:
# Load the test set and classify its first verse as a smoke test.
test_data_df = pd.read_csv('../input/finaltestdata/test (2).csv')
main_classify_func(test_data_df["data"][0])

baseet 0.99600405


8

In [22]:
# Classify every test verse, keyed by its row position.
test_data_dictionary = {
    row: main_classify_func(test_data_df["data"][row])
    for row in range(len(test_data_df))
}
list(test_data_dictionary)[0]

baseet 0.99600405
mutakareb 0.9970693
mutakareb 0.9817545
taweel 0.9956863
wafer 0.98380107
kamel 0.994706
saree 0.8387061
saree 0.9994566
madeed 0.7971848
khafeef 0.9997937
saree 0.99575776
kamel 0.93433625
saree 0.9965758
munsareh 0.9897233
mujtath 0.99999654
saree 0.9213738
saree 0.6055784
kamel 0.53376234
khafeef 0.7701035
saree 0.9938929
kamel 0.79809475
saree 0.4680685
saree 0.6139803
wafer 0.9309135
ramal 0.999918
khafeef 0.9999877
saree 0.9868356
baseet 0.83856905
saree 0.58719015
saree 0.99949217
kamel 0.967025
mutadarak 0.5601708
mutakareb 0.99957377
rajaz 0.9241643
ramal 0.99997294
ramal 0.98809487
rajaz 0.732059
munsareh 0.9997434
khafeef 0.863121
saree 0.99003875
wafer 0.9994646
mutakareb 0.92215896
munsareh 0.9965994
khafeef 0.5245455
baseet 0.558357
kamel 0.99857295
rajaz 0.9303459
munsareh 0.83004344
kamel 0.99332726
wafer 0.86682224
khafeef 0.87944204
ramal 0.95822936
taweel 0.9896988
baseet 0.7793427
munsareh 0.9998994
mutakareb 0.99791414
kamel 0.6457696
wafer 0.9991

0

In [23]:
# Turn the {row id: predicted label} mapping into a two-column frame.
# Note that this rebinds test_data_df, replacing the loaded test set.
prediction_rows = list(test_data_dictionary.items())
test_data_df = pd.DataFrame(prediction_rows, columns=['id', 'labels'])
test_data_df.head()

Unnamed: 0,id,labels
0,0,8
1,1,2
2,2,2
3,3,10
4,4,11


### Saving Final Predictions to CSV File

In [24]:
# Write the submission file: header row included, index column omitted.
test_data_df.to_csv(
    'MAAN_Final.csv',
    index=False,
    header=True,
)