In [1]:
#import libraries
import numpy as np
import pandas as pd
import pickle
import re
pd.options.display.max_rows = 100
from pyarabic.araby import *
from itertools import groupby
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
#constants
TRAIN_PATH = 'preprocessed_train_data.pkl'
VALID_PATH = 'preprocessed_valid_data.pkl'
TEST_PATH  = 'preprocessed_test_data.pkl'

In [3]:
dataset = pd.read_pickle('fetched_dialect_dataset.pkl')

In [4]:
dataset.head()

Unnamed: 0,text,dialect
0,@Nw8ieJUwaCAAreT ŸÑŸÉŸÜ ÿ®ÿßŸÑŸÜŸáÿßŸäÿ© .. ŸäŸÜÿ™ŸÅÿ∂ .. Ÿäÿ∫Ÿäÿ± .,IQ
1,@7zNqXP0yrODdRjK ŸäÿπŸÜŸä Ÿáÿ∞ÿß ŸÖÿ≠ÿ≥Ÿàÿ® ÿπŸÑŸâ ÿßŸÑÿ®ÿ¥ÿ± .. ÿ≠...,IQ
2,@KanaanRema ŸÖÿ®ŸäŸÜ ŸÖŸÜ ŸÉŸÑÿßŸÖŸá ÿÆŸÑŸäÿ¨Ÿä,IQ
3,@HAIDER76128900 Ÿäÿ≥ŸÑŸÖŸÑŸä ŸÖÿ±Ÿàÿ±ŸÉ Ÿàÿ±Ÿàÿ≠ŸÉ ÿßŸÑÿ≠ŸÑŸàŸáüíê,IQ
4,@hmo2406 ŸàŸäŸÜ ŸáŸÑ ÿßŸÑÿ∫Ÿäÿ®Ÿá ÿßÿÆ ŸÖÿ≠ŸÖÿØ üå∏üå∫,IQ


In [5]:
#preprocessing for dataset

#removing arabic stopwords
stop_words = stopwords.words('arabic') 
def filter_text(text):
    
    """
    Function to filter text
    
    Parameters:
      * text(string): text that is filtered from special characters and others
      
    Return text(string): filtered text using regular expression
    """
    #filter text by removing special characters -> not arabic words or numbers -> trim spcases -> removing stopwords and others
    filtered_text = " ".join([word for word in text.split(' ') if word not in stop_words])
    filtered_text = " ".join(re.findall('[\u0600-\u06ff]+',filtered_text))
    filtered_text = re.sub('\s+',' ',re.sub('[Ÿ†-Ÿ©ÿüÿå]','',filtered_text))
    filtered_text = re.sub("[ÿ•ÿ£ÿ¢ÿß]", "ÿß", filtered_text)
    filtered_text = "".join(c for c, _ in groupby(filtered_text))
    filtered_text = strip_tashkeel(filtered_text)
    filtered_text = strip_lastharaka(filtered_text)
    filtered_text = strip_tatweel(filtered_text)

    return filtered_text
    

In [6]:
#copy dataset
data = dataset.copy()

In [7]:
data['text'] = dataset['text'].apply(lambda x:filter_text(x))

In [8]:
#before filter
dataset['text'][0]

'@Nw8ieJUwaCAAreT ŸÑŸÉŸÜ ÿ®ÿßŸÑŸÜŸáÿßŸäÿ© .. ŸäŸÜÿ™ŸÅÿ∂ .. Ÿäÿ∫Ÿäÿ± .'

In [9]:
#after filter
data['text'][0]

'ÿ®ÿßŸÑŸÜŸáÿßŸäÿ© ŸäŸÜÿ™ŸÅÿ∂ Ÿäÿ∫Ÿäÿ±'

In [10]:
#remove empty strings
data.drop(index=data[data['text']==''].index.to_list(),inplace=True)

In [11]:
#split dataset into training validation and testing datasets and transform only training dataset into vectors using dict

train_data,valid_data = train_test_split(data,test_size=0.2,random_state=0)
valid_data,test_data = train_test_split(valid_data,test_size=0.5,random_state=0)

In [12]:
print('length of training dataset  : ',len(train_data))
print('length of validation dataset: ',len(valid_data))
print('length of testing dataset   : ',len(test_data))

length of training dataset  :  366476
length of validation dataset:  45809
length of testing dataset   :  45810


In [13]:
def build_freqs(texts,dialects):
    """
    Function to build frequencies of each word in tweet according to its dialect
    
    Parameters:
      * texts(pd.Series): contains all tweets
      * dialects(pd.Series): contains all dialects
      
    Return vocab_dict(dict): a dictionary contains frequency of each word for each dialect
    """
    vocab_dict = {}
    for y,text in zip(dialects,texts):
        for word in text.split(' '):
            pair = (y,word)
            if pair in vocab_dict:
                vocab_dict[pair] += 1
            else:
                vocab_dict[pair] = 1
                
    return vocab_dict

In [14]:
vocab = build_freqs(train_data['text'],train_data['dialect'])

* we notice that we have built vocab on only training dataset until we handle words that are out of vocab when testing, we can check the model if it works well 

In [15]:
def extract_features(text, freqs, classes, num_classes=18):
    """
    Function to build a vector using frequencies of each word of each dialect
    
    Parameters:
      * text(string): tweet text
      * freqs(dict): dictionary of frequencies of words and their dialects
      
    Return(array): vector contains of frequencies
    """
   
    x = np.zeros((1, num_classes)) 
        
    #loop through each word in the list of words
    for word in text.split(' '):
        for i in range(num_classes):
            
            x[0,i] += freqs.get((classes[i],word),0)
            
            
    return x

In [16]:
#dialects
classes = list(np.unique(dataset['dialect']))

In [17]:
#test vector
extract_features(train_data['text'][0],vocab,classes)

array([[ 22.,  23.,  10.,  28.,  12.,  29.,  42., 205.,  23.,   2.,  15.,
         35.,  29.,  20.,   6.,  17.,   3.,   5.]])

In [18]:
#convert training data from text into vectors
train_data['text'] = train_data['text'].apply(lambda x:list(extract_features(x,vocab,classes)[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data['text'].apply(lambda x:list(extract_features(x,vocab,classes)[0]))


In [19]:
#convert validation data from text into vectors
valid_data['text'] = valid_data['text'].apply(lambda x:list(extract_features(x,vocab,classes)[0]))

In [20]:
#convert testing data from text into vectors
test_data['text'] = test_data['text'].apply(lambda x:list(extract_features(x,vocab,classes)[0]))

In [21]:
#split column vector into multiple columns
train_data_split = pd.DataFrame(train_data['text'].to_list(), columns = ['col'+str(i) for i in range(1,19)])
train_data_split['dialect'] = train_data['dialect'].to_list()

In [22]:
#split column vector into multiple columns
valid_data_split = pd.DataFrame(valid_data['text'].to_list(), columns = ['col'+str(i) for i in range(1,19)])
valid_data_split['dialect'] = valid_data['dialect'].to_list()

In [23]:
#split column vector into multiple columns
test_data_split = pd.DataFrame(test_data['text'].to_list(), columns = ['col'+str(i) for i in range(1,19)])
test_data_split['dialect'] = test_data['dialect'].to_list()

In [24]:
train_data_split.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,dialect
0,604.0,607.0,583.0,6442.0,554.0,691.0,1060.0,589.0,1066.0,465.0,467.0,1328.0,671.0,509.0,2721.0,224.0,425.0,259.0,EG
1,21.0,50.0,138.0,494.0,15.0,381.0,67.0,96.0,96.0,162.0,14.0,619.0,25.0,41.0,24.0,43.0,69.0,39.0,SY
2,195.0,202.0,99.0,722.0,81.0,277.0,304.0,242.0,419.0,84.0,136.0,508.0,244.0,157.0,77.0,117.0,73.0,85.0,LY
3,5048.0,4078.0,1280.0,12296.0,1644.0,4083.0,8301.0,2405.0,6862.0,1295.0,2848.0,6594.0,6054.0,5108.0,965.0,1878.0,1942.0,1371.0,EG
4,4171.0,2828.0,2908.0,6596.0,253.0,3872.0,4608.0,3378.0,4507.0,1859.0,1181.0,2954.0,3586.0,2412.0,1139.0,1624.0,3044.0,624.0,MA


In [26]:
#save datasets into pickle files to be used in the model
train_data_split.to_pickle(TRAIN_PATH)
valid_data_split.to_pickle(VALID_PATH)
test_data_split.to_pickle(TEST_PATH)