In [1]:
pip install openpyxl

In [2]:
import pandas as pd
import seaborn as sns
import fasttext
from nltk.util import ngrams
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
import numpy as np
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Embedding,GRU,LSTM,Bidirectional,Dropout,Conv1D,MaxPooling1D,GlobalAveragePooling1D,Flatten,Input
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import accuracy_score,classification_report
from keras.callbacks import EarlyStopping,ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import gensim
from tqdm import tqdm
from gensim.models import word2vec
from sklearn import metrics


### Import the Excel file for the Maghreb countries

In [3]:
# Load the dialect dataset: the first spreadsheet column becomes the index and
# both data columns are read as strings.
df = pd.read_excel(
    "../input/data-arabic-dialect/data_maghreb.xlsx",
    index_col=0,
    dtype=dict(tokens=str, dialect=str),
)
# Per-column count of missing values (shown as the cell output).
df.isna().sum()

In [4]:
# Replace the index read from the spreadsheet with a fresh 0..n-1 RangeIndex.
# drop=True discards the old index directly, so this works whether or not that
# index has a name (reset_index() followed by drop('index') raises KeyError
# when the index column had a header) and avoids the in-place drop.
df = df.reset_index(drop=True)

### Count the number of sentences that belong to each country

In [5]:
# Number of sentences per dialect, most frequent first.
x = df['dialect'].value_counts()
print(x.index)
# seaborn >= 0.12 accepts the data vectors only as keyword arguments; passing
# them positionally raises TypeError.
ax = sns.barplot(x=x.index, y=x.values)
ax.set(xlabel='dialect', ylabel='number of sentences');

### Compute the maximum number of words per sentence (max_length)

In [6]:
def splitolist(data):
    """Split ``data`` on single space characters and return the pieces as a list.

    The separator is the literal ``" "``, not arbitrary whitespace, so leading,
    trailing or repeated spaces produce empty-string entries in the result.
    """
    return data.split(" ")

In [7]:
# Store each sentence as a list of space-separated tokens in a new column.
df['tokens1'] = df['tokens'].map(splitolist)

In [8]:
# Length (in tokens) of the longest sentence; reused as the padded sequence length.
max_length = df['tokens1'].str.len().max()
print("max length is", max_length)


In [9]:
# Show the first sentence. Selecting the 'tokens' label directly avoids the
# deprecated positional lookup (`series[0]`) on a label-indexed Series.
df.loc[0, 'tokens']

### create a corpus of the entire dataset in a txt file

In [10]:
# Write one sentence per line; this file is the training corpus for fastText.
with open(r'./corpus.txt', 'w', encoding='utf-8') as txtfile:
    # Iterate over the column values instead of looking rows up with
    # df.loc[i, ...]: it is faster and does not require the index to be
    # exactly 0..len(df)-1.
    for line in df['tokens']:
        txtfile.write(line + '\n')

### train skipgram model on the corpus to get word embeddings

In [11]:
# Dimensionality of the word vectors. The Keras embedding matrix is allocated
# with the same constant, so the fastText `dim` must be taken from it rather
# than repeated as a literal.
EMBED_SIZE = 100

# Train skip-gram embeddings on the exported corpus
# (minn / maxn: minimum and maximum character n-gram length).
model = fasttext.train_unsupervised('./corpus.txt',
                                    minCount=5,
                                    model='skipgram',
                                    minn=2,
                                    maxn=5,
                                    dim=EMBED_SIZE,
                                    lr=0.1,
                                    epoch=10)

In [12]:
# Collect every distinct whitespace-separated word found in the corpus file.
with open(r'./corpus.txt', 'r', encoding="utf-8") as txtfile:
    corpus_sentences = txtfile.readlines()

# Flatten all sentences into one word list, then de-duplicate it.
corpus_words = [word_ for sent in corpus_sentences for word_ in sent.split()]
corpus_unique_words = list(set(corpus_words))

### convert text to numbers using tokenizer

In [13]:
# Keep every word the tokenizer finds (no num_words cap). A cap derived from a
# whitespace-split word count does not match the tokenizer's own vocabulary
# (Tokenizer also lowercases and strips punctuation), so it could silently drop
# the rarest words from the sequences. The embedding is sized from
# tokenizer.word_index, so the full vocabulary always fits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tokens'])
# Replace every sentence by the list of integer ids of its words.
sequences = tokenizer.texts_to_sequences(df['tokens'])

In [14]:
# Mapping from each word to the integer id assigned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % (len(word_index),))
# Pad at the end (padding='post') so every row has exactly max_length entries.
data = pad_sequences(sequences, maxlen=max_length, padding='post')

### create embedding matrix from the trained fasttext model

In [15]:
# One row per tokenizer id plus one extra row: the Keras Tokenizer reserves
# id 0 and never assigns it to a word, so row 0 stays all zeros.
vocab_size = 1 + len(tokenizer.word_index)
embeddings_matrix = np.zeros((vocab_size, EMBED_SIZE))

# Row i receives the fastText vector of the word whose tokenizer id is i.
for word, index in tqdm(tokenizer.word_index.items()):
    embeddings_matrix[index, :] = model.get_word_vector(word)

In [16]:
# One-hot encode the dialect labels (one column per dialect). The dtype is
# requested explicitly because the default differs between pandas versions
# (uint8 before 2.0, bool from 2.0 on) and the labels are fed to Keras as
# numeric targets.
data_y = pd.get_dummies(df['dialect'], dtype='float32')

In [17]:
# Preview the first ten one-hot label rows.
data_y.head(10)

In [26]:
# Hold out 10% of the sentences as a test set, stratified on the one-hot labels
# so both splits keep the same dialect proportions. random_state pins the
# shuffle, making the split (and every metric computed from it) reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    data_y,
    test_size=0.1,
    stratify=data_y,
    shuffle=True,
    random_state=42,
)

In [27]:
def create_model():
    """Build the (uncompiled) bidirectional-LSTM dialect classifier.

    Layers, in order: an embedding initialised from ``embeddings_matrix`` and
    left trainable, a bidirectional LSTM with 128 units, dropout of 0.5, a
    5000-unit ReLU dense layer, and a softmax layer with one unit per distinct
    value of ``df['dialect']``.

    Reads the notebook globals ``vocab_size``, ``EMBED_SIZE``,
    ``embeddings_matrix``, ``max_length`` and ``df``.
    """
    n_classes = len(np.unique(df['dialect']))
    return Sequential([
        Embedding(vocab_size, EMBED_SIZE,
                  weights=[embeddings_matrix],
                  input_length=max_length,
                  trainable=True),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(5000, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ])

# Reset Keras' global state *before* building the model, so nothing from an
# earlier run of this cell lingers and the fresh model is not affected by the
# reset.
tf.keras.backend.clear_session()
model1 = create_model()

# Stop once the training loss has failed to improve for 3 consecutive epochs.
# A loss improves by going DOWN, so mode must be 'min': with mode='max' every
# epoch in which the loss decreases counts as "no improvement" and training is
# cut off after `patience` epochs of normal progress.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', patience=3, verbose=1)
# Stricter alternative (patience=2); defined for convenience, not used by the
# training cell.
early_stopping = EarlyStopping(monitor='loss', patience=2)

opt = Adam(learning_rate=0.0001)

# One-hot targets + softmax output -> categorical cross-entropy.
model1.compile(opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
# Sorted list of the distinct dialect labels.
np.unique(df['dialect'].to_numpy()).tolist()

In [29]:
# Fit the classifier. x_test / y_test are evaluated after every epoch as
# validation data, and early_stop may end the run before the 50-epoch limit.
history = model1.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stop],
)

In [30]:
# Persist the trained model to an HDF5 file in the working directory.
model1.save(filepath="maghreb.h5")

In [31]:
# Class names taken from the one-hot label columns, so position k in
# target_names is by construction the class that argmax index k refers to
# (instead of relying on a separately computed sort order matching it).
target_names = data_y.columns.tolist()
target_names

In [32]:
# Preview the held-out one-hot labels; show a bounded sample rather than the
# whole frame.
y_test.head()

In [33]:
# Integer class id of every test sentence (position of the 1 in its one-hot row).
y_test.to_numpy().argmax(axis=1)

### classification report for test data

In [34]:
# Class probabilities predicted for the held-out sentences.
pred = model1.predict(x_test)
# Reduce one-hot labels and probability rows to integer class ids before scoring.
print(classification_report(
    y_true=np.array(y_test).argmax(axis=1),
    y_pred=pred.argmax(axis=1),
    target_names=target_names,
))

### classification report for train data

In [35]:
# Class probabilities predicted for the training sentences.
pred_train = model1.predict(x_train)
# Reduce one-hot labels and probability rows to integer class ids before scoring.
print(classification_report(
    y_true=np.array(y_train).argmax(axis=1),
    y_pred=pred_train.argmax(axis=1),
    target_names=target_names,
))

In [40]:
# Encode one sample sentence the same way as the training data.
a = "توحشتك"
l = splitolist(a)
# Each word is passed as its own text, so seq holds one id list per word.
seq = tokenizer.texts_to_sequences(l)
# Flatten the per-word id lists into a single sequence.
b = []
for sublist in seq:
    b.extend(sublist)
print(b)
pad_seq = pad_sequences([b], maxlen=max_length, padding='post')

In [41]:
# Show the token list, the per-token id lists and the padded model input.
print(l,seq,pad_seq)

In [42]:
# Index of the most probable class for each row of the padded input.
out = np.argmax(model1.predict(pad_seq), axis=1)
out[0]

In [43]:
# Translate the predicted class index back into its dialect name.
country_pred = target_names[int(out[0])]
country_pred