In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import  Dropout, Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('/content/drive/MyDrive/clean_dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,dialect,text,clean_text,tokenizer,region_dialect
0,0,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,بالنهايه ينتفض يغير,"['بالنهايه', 'ينتفض', 'يغير']",Gulf
1,1,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,يعني محسوب علي البشر حيونه وحشيه وتطلبون الغرب...,"['يعني', 'محسوب', 'علي', 'البشر', 'حيونه', 'وح...",Gulf
2,2,IQ,@KanaanRema مبين من كلامه خليجي,مبين كلامه خليجي,"['مبين', 'كلامه', 'خليجي']",Gulf
3,3,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,يسلملي مرورك وروحك الحلوه,"['يسلملي', 'مرورك', 'وروحك', 'الحلوه']",Gulf
4,4,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,وين الغيبه اخ محمد,"['وين', 'الغيبه', 'اخ', 'محمد']",Gulf
...,...,...,...,...,...,...
458192,458192,BH,@Al_mhbaa_7 مبسوطين منك اللي باسطانا😅,مبسوطين منك الي باسطانا,"['مبسوطين', 'منك', 'الي', 'باسطانا']",Gulf
458193,458193,BH,@Zzainabali @P_ameerah والله ماينده ابش يختي,واله ماينده ابش يختي,"['واله', 'ماينده', 'ابش', 'يختي']",Gulf
458194,458194,BH,@Al_mhbaa_7 شو عملنا لك حنا تهربي مننا احنا مس...,شو عملنا حنا تهربي منا احنا مساكين ليش بتعملي ...,"['شو', 'عملنا', 'حنا', 'تهربي', 'منا', 'احنا',...",Gulf
458195,458195,BH,@haneenalmwla الله يبارك فيها وبالعافيه 😋😋😋,اله يبارك وبالعافيه,"['اله', 'يبارك', 'وبالعافيه']",Gulf


In [3]:
df.dropna(axis=0,inplace=True)

In [5]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df.clean_text.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 384180 unique tokens.


In [6]:
X = tokenizer.texts_to_sequences(df.clean_text.values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (457952, 250)


In [7]:
Y = pd.get_dummies(df['dialect']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (457952, 18)


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42,stratify=Y)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(412156, 250) (412156, 18)
(45796, 250) (45796, 18)


In [9]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(18, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 18)                1818      
                                                                 
Total params: 5,082,218
Trainable params: 5,082,218
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
history = model.fit(X_train, Y_train, 
                    epochs=5,
                    batch_size=100,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [13]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.713
  Accuracy: 0.505
