In [None]:
#import libraries
import numpy as np
import pandas as pd
import pickle
import re
pd.options.display.max_rows = 100
from pyarabic.araby import *
from itertools import groupby
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import matplotlib.pyplot as plt

In [None]:
#constants
# Output pickle paths for the index-encoded train / validation / test frames;
# they are passed to DataFrame.to_pickle when the encoded splits are saved.
TRAIN_PATH = 'preprocessed_train_data.pkl'
VALID_PATH = 'preprocessed_valid_data.pkl'
TEST_PATH  = 'preprocessed_test_data.pkl'

In [None]:
pip install pickle5

In [None]:
import pickle5 as pickle

In [None]:
# Load the dataset frame from a local pickle file; later cells read its
# 'text' and 'dialect' columns. `pickle` here is the pickle5 alias imported above.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
with open('fetched_dialect_dataset.pkl','rb') as dt:
    dataset = pickle.load(dt)
#dataset = pd.read_pickle('/kaggle/input/dialect/fetched_dialect_dataset.pkl')

In [None]:
dataset.head()

In [None]:
#preprocessing for dataset

#removing arabic stopwords
stop_words = stopwords.words('arabic')
# frozenset copy of the list above: membership is tested once per token,
# so use constant-time lookups instead of scanning the list each time
_stop_word_set = frozenset(stop_words)

def filter_text(text):
    """
    Clean a single text sample before vectorization.

    Steps, in order:
      1. drop space-separated tokens that exactly match an entry of `stop_words`
      2. keep only runs of characters in the U+0600-U+06FF block, joined by single spaces
      3. delete Arabic-Indic digits and the Arabic question mark / comma, collapse whitespace
      4. map the alef variants (إ أ آ) to bare alef (ا)
      5. collapse every run of one repeated character to a single character
         (this also shortens legitimately doubled letters)
      6. apply strip_tashkeel, strip_lastharaka and strip_tatweel
      7. collapse whitespace again and trim both ends

    Parameters:
      * text(string): raw text to clean

    Return text(string): cleaned text; the empty string when nothing survives the filters
    """
    kept_words = [word for word in text.split(' ') if word not in _stop_word_set]
    filtered_text = " ".join(kept_words)
    filtered_text = " ".join(re.findall('[\u0600-\u06ff]+',filtered_text))
    filtered_text = re.sub(r'\s+',' ',re.sub('[٠-٩؟،]','',filtered_text))
    filtered_text = re.sub("[إأآا]", "ا", filtered_text)
    filtered_text = "".join(c for c, _ in groupby(filtered_text))
    filtered_text = strip_tashkeel(filtered_text)
    filtered_text = strip_lastharaka(filtered_text)
    filtered_text = strip_tatweel(filtered_text)
    # Tokens made only of removed characters (digits, punctuation, tatweel) leave
    # leading/trailing or doubled spaces behind; normalise them so split(' ') never
    # yields empty tokens and fully-emptied samples compare equal to ''.
    filtered_text = re.sub(r'\s+', ' ', filtered_text).strip()

    return filtered_text
    

In [None]:
#copy dataset
data = dataset.copy()

In [None]:
# clean every sample; the result is written to the copy so `dataset` keeps the raw text
data['text'] = dataset['text'].apply(filter_text)

In [None]:
#before filter
dataset['text'][0]

In [None]:
#after filter
data['text'][0]

In [None]:
#remove empty strings
# index labels of the rows whose cleaned text is exactly ''
empty_text_rows = data.index[data['text'] == '']
data.drop(index=empty_text_rows, inplace=True)

In [None]:
# Split the cleaned data into train (80%) and test (20%) frames.
# random_state=0 fixes the shuffle so the split is the same on every run.

train_data,test_data = train_test_split(data,test_size=0.2,random_state=0)

In [None]:
print('The size of training dataset  : ',len(train_data))
print('The size of testing  dataset  : ',len(test_data))

**************** Preprocessing data for ML Model************

In [None]:
#convert dataset into Count vectorization to be used in ML Model
#train_data
# The vectorizer is fitted on the training texts only; the same fitted object
# is reused to transform the test texts and is saved for later use.
vector = CountVectorizer()
X_train = vector.fit_transform(train_data['text'])
y_train = train_data['dialect']

In [None]:
#test_data
# transform (not fit_transform): reuse the vectorizer already fitted on the training texts
X_test = vector.transform(test_data['text'])
y_test = test_data['dialect']

In [None]:
#save data into pickle files
# Files are written with joblib.dump to the current working directory.
#training
joblib.dump(X_train,'train_text.pkl')
joblib.dump(y_train,'train_labels.pkl')

#testing
joblib.dump(X_test,'test_text.pkl')
joblib.dump(y_test,'test_labels.pkl')

In [None]:
X_test[0].shape

In [None]:
#save countervectorizer
joblib.dump(vector,'count_vector.pkl')

**************** Preprocessing data for DL Model************

In [None]:
#check the length of sentences to define the best number of the length
# Count space-separated tokens rather than characters: the sequence length chosen
# from this plot is applied to text.split(' ') tokens when sentences are encoded.
text_length = [len(sent.split(' ')) for sent in data['text'].to_list()]
plt.hist(text_length)
plt.xlabel('tokens per sentence')
plt.ylabel('number of sentences')
plt.show()

In [None]:
#split data into training,validation and testing
# 80% goes to training; the remaining 20% is halved into validation and testing.
# random_state pins both shuffles (same seed as the ML split) so the saved
# splits are reproducible across runs.
training_data,holdout_data = train_test_split(data,test_size=0.2,random_state=0)
valid_data,testing_data = train_test_split(holdout_data,test_size=0.5,random_state=0)

# Work on independent copies: the 'text' column of each split is overwritten later,
# and assigning into a frame derived from `data` can trigger pandas' chained-assignment warning.
training_data = training_data.copy()
valid_data = valid_data.copy()
testing_data = testing_data.copy()

In [None]:
# Report the shapes of the three-way split made in this section
# (training_data / valid_data / testing_data), not the earlier two-way ML split.
print(training_data.shape)
print(valid_data.shape)
print(testing_data.shape)

* Convert words into indices after defining unique words

In [None]:
# Gather every token of the training split that is encoded below. The vocabulary
# must come from training_data (not the ML split's train_data) so that validation
# and test sentences contribute no words to it.
# Must run before the cell that replaces training_data['text'] with index vectors.
words_ls = []
for sentence in training_data['text']:
  words_ls.extend(sentence.split(' '))

In [None]:
# sorted() gives the vocabulary a stable order: plain set iteration order for strings
# can change between interpreter runs, which would change every word's index.
unique_words = sorted(set(words_ls))

In [None]:
# Word indices start at 1, following the order of unique_words; 0 is never assigned to a word.
word2idx = dict(zip(unique_words, range(1, len(unique_words) + 1)))
# reverse lookup: index -> word
idx2word = {index: word for word, index in word2idx.items()}

In [None]:
max_sequence_len = 100

In [None]:
def convert_text_to_vector(text, vocab=None, max_len=None):
  """
  Encode a sentence as a fixed-length list of word indices.

  The text is split on single spaces; each token is looked up in `vocab`
  (unknown tokens map to 0). Sentences shorter than `max_len` are padded with
  0 on the right; longer sentences are truncated to the first `max_len`
  tokens instead of raising an IndexError.

  Parameters:
    * text(string): space-separated sentence
    * vocab(dict): word -> index mapping; defaults to the notebook-level word2idx
    * max_len(int): output length; defaults to the notebook-level max_sequence_len

  Return list of float values of length `max_len`
  """
  # Resolve defaults at call time so the notebook-level names are only needed
  # when the caller does not pass explicit values.
  if vocab is None:
    vocab = word2idx
  if max_len is None:
    max_len = max_sequence_len

  vector = np.zeros(max_len)
  # keep only as many tokens as there are slots in the vector
  tokens = text.split(' ')[:max_len]
  for position, token in enumerate(tokens):
    vector[position] = vocab.get(token, 0)

  return list(vector)

In [None]:
#save the dictionary to be used in the deployment
joblib.dump(word2idx,'dict_word2idx.sav')

In [None]:
training_data['text'] = training_data['text'].apply( convert_text_to_vector)

In [None]:
# One column per token slot. The column count follows max_sequence_len so it always
# matches the length of the vectors produced by convert_text_to_vector.
token_columns = ['col'+str(i) for i in range(1, max_sequence_len + 1)]
train_data_split = pd.DataFrame(training_data['text'].to_list(), columns = token_columns)
train_data_split['dialect'] = training_data['dialect'].to_list()

In [None]:
valid_data['text'] = valid_data['text'].apply(convert_text_to_vector)

In [None]:
# One column per token slot. The column count follows max_sequence_len so it always
# matches the length of the vectors produced by convert_text_to_vector.
token_columns = ['col'+str(i) for i in range(1, max_sequence_len + 1)]
valid_data_split = pd.DataFrame(valid_data['text'].to_list(), columns = token_columns)
valid_data_split['dialect'] = valid_data['dialect'].to_list()

In [None]:
testing_data['text'] = testing_data['text'].apply(convert_text_to_vector)

In [None]:
# One column per token slot. The column count follows max_sequence_len so it always
# matches the length of the vectors produced by convert_text_to_vector.
token_columns = ['col'+str(i) for i in range(1, max_sequence_len + 1)]
test_data_split = pd.DataFrame(testing_data['text'].to_list(), columns = token_columns)
test_data_split['dialect'] = testing_data['dialect'].to_list()

In [None]:
train_data_split.head()

In [None]:
#saving datasets into pickle files
# Each frame holds the index-encoded token columns plus a 'dialect' label column.
train_data_split.to_pickle(TRAIN_PATH)
valid_data_split.to_pickle(VALID_PATH)
test_data_split.to_pickle(TEST_PATH)