In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [2]:
# Location of the preprocessed tweets.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere without editing it. Only unpickle files from a trusted source.
PROCESSED_DATA_PATH = "C:/Users/HP/AIM_Dialect_Prediction/processed_data.pkl"
data = pd.read_pickle(PROCESSED_DATA_PATH)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,dialect,Tweets
0,1175358310087892992,IQ,بالنهاية ينتفض يغير
1,1175416117793349632,IQ,يعني محسوب البشر حيونه ووحشيه وتطلبون الغرب ي...
2,1175450108898565888,IQ,مبين كلامه خليجي
3,1175471073770573824,IQ,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,وين الغيبه اخ محمد


In [4]:
# Tweets per dialect label. The recorded output ranges from 57,636 (EG) down to
# 9,246 (TN), so the classes are noticeably imbalanced.
data.dialect.value_counts()

EG    57636
PL    43742
KW    42109
LY    36499
QA    31069
JO    27921
LB    27617
SA    26832
AE    26296
BH    26292
OM    19116
SY    16242
DZ    16183
IQ    15497
SD    14434
MA    11539
YE     9927
TN     9246
Name: dialect, dtype: int64

In [5]:
def print_report(pipe, x_test, y_test):
    """Print a per-class classification report followed by the overall accuracy.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        x_test: Inputs handed to ``pipe.predict``.
        y_test: Ground-truth labels the predictions are scored against.

    Returns:
        None. Both the report and the accuracy line are written to stdout.
    """
    predictions = pipe.predict(x_test)
    print(metrics.classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("accuracy: {:0.3f}".format(accuracy))

In [6]:
# Hold out 5% of the tweets for validation, stratified by dialect so both splits
# keep the same class proportions. The fixed random_state makes the split (and
# every metric computed on it) reproducible across kernel restarts.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    data.Tweets,
    data.dialect,
    test_size=0.05,
    stratify=data.dialect,
    random_state=42,
)

# Classical ML baselines

In [7]:
# Disabled experiment: bag-of-words (CountVectorizer) + LogisticRegression baseline.
# Uncomment to re-run it; it fits on the same training split as the TF-IDF pipeline.
# vec = CountVectorizer()
# clf = LogisticRegression()
# pipe = make_pipeline(vec, clf)
# pipe.fit(train_sentences, train_labels);

In [8]:
# Disabled: evaluation of the CountVectorizer baseline on the 95/5 stratified split.
# print_report(pipe, val_sentences, val_labels) #95 5 stratify

In [9]:
# Word uni- and bi-gram TF-IDF features feeding a linear SVM.
vec = TfidfVectorizer(ngram_range=(1, 2))
# Seed the solver so repeated fits on the same split give the same model.
clf = LinearSVC(random_state=42)
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('linearsvc', LinearSVC())])

In [10]:
# Score the TF-IDF + LinearSVC pipeline on the held-out 5% (95/5 stratified split).
print_report(pipe_tfidf, val_sentences, val_labels) #95 5 stratify

              precision    recall  f1-score   support

          AE       0.46      0.46      0.46      1315
          BH       0.45      0.35      0.39      1315
          DZ       0.64      0.56      0.59       809
          EG       0.70      0.88      0.78      2882
          IQ       0.68      0.53      0.59       775
          JO       0.45      0.37      0.41      1396
          KW       0.51      0.64      0.57      2105
          LB       0.62      0.71      0.66      1381
          LY       0.67      0.72      0.69      1825
          MA       0.75      0.63      0.68       577
          OM       0.48      0.35      0.40       956
          PL       0.50      0.57      0.54      2187
          QA       0.48      0.53      0.50      1553
          SA       0.47      0.49      0.48      1342
          SD       0.78      0.60      0.67       722
          SY       0.53      0.33      0.41       812
          TN       0.71      0.48      0.57       462
          YE       0.50    

# Deep learning model (BiLSTM)

In [11]:
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from joblib import dump, load

In [12]:
def counter_word(text_col):
    """Count whitespace-separated tokens across a column of texts.

    Args:
        text_col: Object exposing ``.values`` that yields strings
            (e.g. a pandas Series of tweets).

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    # Stream every token of every text straight into the Counter.
    return Counter(
        token
        for sentence in text_col.values
        for token in sentence.split()
    )

In [13]:
# Token frequencies over the training tweets only (validation is not counted).
counter = counter_word(train_sentences)

In [14]:
# Number of distinct whitespace-separated tokens in the training split.
# Used later as the Tokenizer's num_words and as the Embedding input dimension.
num_unique_words = len(counter)
num_unique_words

506175

In [15]:
# Five most frequent training tokens with their counts.
counter.most_common(5)

[('اللي', 56360),
 ('الله', 47033),
 ('مش', 29466),
 ('انا', 28836),
 ('والله', 23421)]

In [16]:
# Convert the sentence Series to plain NumPy arrays.
train_sentences = train_sentences.to_numpy()
val_sentences = val_sentences.to_numpy()

# One-hot encode the dialect labels. The validation frame is re-aligned to the
# training columns so both matrices share the same class order and width even if
# a class were missing from one split, and both are returned as NumPy arrays
# (previously val_labels was left as a DataFrame while train_labels was an array).
train_label_frame = pd.get_dummies(train_labels)
val_label_frame = pd.get_dummies(val_labels).reindex(
    columns=train_label_frame.columns, fill_value=0
)
train_labels = train_label_frame.values
val_labels = val_label_frame.values

In [17]:
# Sanity check: number of samples in each split.
train_sentences.shape, val_sentences.shape

((435287,), (22910,))

In [18]:
# Sanity check: (samples, classes) of the one-hot label matrices.
train_labels.shape, val_labels.shape

((435287, 18), (22910, 18))

In [19]:
# Tokenize
# Vectorize the text corpus by turning each text into a sequence of integers.
# NOTE(review): num_unique_words comes from a plain str.split() count, while the
# Keras Tokenizer presumably applies its own filtering, so the two vocabularies
# may not match exactly - confirm this cap is what is intended.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [20]:
# Now each word has a unique index.
# Keep a handle on the fitted tokenizer's word index for inspection.
word_index = tokenizer.word_index
#word_index

In [21]:
# Encode both splits with the tokenizer that was fitted on the training split.
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(split)
    for split in (train_sentences, val_sentences)
)

In [22]:
# Spot-check: show a few raw tweets followed by their integer encodings.
preview = slice(10, 15)
for sample in (train_sentences, train_sequences):
    print(sample[preview])

[' اه والله فعلا ربنا يسترها' ' أمين ويبارك فيك وفي عمرك يارب 🤲'
 ' هاي كلمات بيكا حسن شاكوش ايش فهمنا احنا '
 ' خطاب ريح روحك راه نديرلهن منشن'
 'تقصقصيش جناحك عشان يوسعك الصندوق إطلعي منّو سما كبيرة بتستنّاكِ ']
[[208, 5, 148, 56, 12720], [3316, 3084, 58, 228, 267, 33, 1379], [109, 2190, 18858, 608, 49334, 189, 5273, 27], [4324, 3166, 959, 514, 136607, 2045], [202244, 87521, 9, 136608, 8039, 105695, 28878, 3785, 635, 202245]]


In [23]:
# Bring every sequence to the same length: shorter ones are padded with 0 at the
# end, longer ones are cut at the end.
max_length = 15  # arbitrary number

pad_options = dict(maxlen=max_length, padding="post", truncating="post")
train_padded = pad_sequences(train_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)

In [24]:
# Check: both padded matrices should be (samples, max_length).
train_padded.shape, val_padded.shape

((435287, 15), (22910, 15))

In [25]:
# Width of the softmax layer, read from the one-hot label matrix instead of a
# hard-coded 18 so the model always matches the label encoding.
num_classes = train_labels.shape[1]

model = keras.models.Sequential()
# 128-dimensional embedding per token id; the id range is capped at num_unique_words.
# NOTE(review): sequences are post-padded with 0 but the embedding does not mask
# that id, so the LSTMs also read the padding - confirm whether mask_zero=True is wanted.
model.add(layers.Embedding(num_unique_words, 128, input_length=max_length))

# Two stacked bidirectional LSTMs: the first returns the full sequence so the
# second can consume it; the second returns only its final state.
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(64)))
# model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(num_classes, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

history = model.fit(
    train_padded,
    train_labels,
    batch_size=128,
    epochs=2,
    validation_data=(val_padded, val_labels),
)

2022-03-13 10:53:44.791971: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-13 10:53:44.895026: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-13 10:53:44.896183: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-13 10:53:44.899269: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 128)           64790400  
_________________________________________________________________
bidirectional (Bidirectional (None, 15, 128)           98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 18)                2322      
Total params: 64,990,354
Trainable params: 64,990,354
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2


2022-03-13 10:53:48.003256: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-03-13 10:53:53.448349: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/2


In [26]:
# Persist the fitted TF-IDF + LinearSVC pipeline with joblib.
# NOTE(review): absolute, machine-specific path. The recorded cell output shows
# './svm_pipe.joblib', which does not match this path, so that output is stale.
dump(pipe_tfidf, 'C:/Users/HP/AIM_Dialect_Prediction/svm_pipe.joblib') 

['./svm_pipe.joblib']