In [None]:
!gdown --id 1MactbX3-I43bJx_e_X2144bpOlBaaF7L
!gdown --id 1aa3VagcFPBoiHjA_Dwfh9KzsnUjIXNX5
!gdown --id 1IHHuiFuEc2TNKC_PRwVPKJ5yv1EYXKcN

Downloading...
From: https://drive.google.com/uc?id=1MactbX3-I43bJx_e_X2144bpOlBaaF7L
To: /content/test.csv
100% 2.05M/2.05M [00:00<00:00, 217MB/s]
Downloading...
From: https://drive.google.com/uc?id=1aa3VagcFPBoiHjA_Dwfh9KzsnUjIXNX5
To: /content/DL3.h5
100% 9.21M/9.21M [00:00<00:00, 47.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IHHuiFuEc2TNKC_PRwVPKJ5yv1EYXKcN
To: /content/tokenizer.pkl
100% 10.5M/10.5M [00:00<00:00, 40.6MB/s]


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.metrics import confusion_matrix , accuracy_score
import pickle


In [None]:
# Load the evaluation set downloaded above. Later cells read its 'text'
# column (model input) and its 'dialect' column (ground-truth label).
test_df = pd.read_csv('test.csv')

In [None]:
# List the distinct dialect codes present in the test set; every code shown
# here needs an entry in label_map, which is indexed by these values.
test_df['dialect'].unique()

array(['LY', 'LB', 'MA', 'EG', 'SD'], dtype=object)

In [None]:
# Raw model inputs, coerced to strings so the tokenizer receives text only.
# NOTE(review): astype(str) would turn a missing value into the literal
# string 'nan' — confirm test.csv has no empty 'text' rows.
texts = test_df['text'].values.astype(str)
# Ground-truth dialect codes, still as strings at this point.
labels = test_df['dialect'].values

In [None]:
# Map string dialect codes to integer class ids.
# NOTE(review): the id given to each dialect must match the encoding used
# when DL3.h5 was trained — confirm against the training notebook.
label_map = {'EG': 0, 'LB': 1, 'MA': 2, 'SD': 3, 'LY': 4}
# Read the codes from test_df instead of from `labels` so the cell is
# idempotent: once `labels` has been rebound to integers, re-running the
# original comprehension raised KeyError. An unknown dialect code still
# raises KeyError naming the offending code.
labels = [label_map[dialect] for dialect in test_df['dialect']]

In [None]:
# Restore the pickled tokenizer that was downloaded alongside test.csv and
# DL3.h5. The path is relative, like the other two artifacts, so the cell
# does not depend on Colab's /content working directory.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# run this on a tokenizer.pkl obtained from a trusted source.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Convert each text into its sequence of integer token ids
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Bring every token sequence to the fixed length the network takes as input
# (the model summary shows an input length of 61).
max_len = 61
labels = np.asarray(labels)
data = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',      # Keras default, spelled out: zeros are added in front
    truncating='pre',   # Keras default, spelled out: overlong texts lose leading tokens
)

In [None]:
# One-hot encode the integer class ids. num_classes is pinned to the size of
# label_map so there is always one column per known dialect; without it,
# to_categorical infers the width from max(labels) + 1 and would produce a
# narrower matrix if the highest-numbered class were absent from the file.
labels = tf.keras.utils.to_categorical(labels, num_classes=len(label_map))

In [None]:
# Inspect the one-hot encoded ground truth: one row per sample, with a single
# 1.0 in the column of that sample's class id.
labels

array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [None]:
# Load the saved model from the HDF5 file (DL3.h5) fetched in the download
# cell; it is only used for inference in this notebook.
model = keras.models.load_model('DL3.h5')

In [None]:
# Print the layer stack. The sequence length reported for the embedding
# output should equal max_len used when padding, and the final Dense width
# should equal the number of dialect classes.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 61, 128)           640000    
                                                                 
 conv1d (Conv1D)             (None, 61, 64)            24640     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 30, 64)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 764,101
Trainable params: 764,101
Non-trainable params: 0
__________________________________________________

In [None]:
# Sanity-check the first padded sample: leading zeros are padding, followed
# by the token ids produced by the tokenizer.
data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,  203, 1432,    5, 1562,    3,   62,    7,    1,
       1609,    5,   18,  586, 3219,  852], dtype=int32)

In [None]:
# Run the model over the whole padded test set. The result holds one row per
# sample with one score per class; it is reduced to class ids further down.
y_pred = model.predict(data)



In [None]:
# Inspect the raw model outputs (one score per class for each sample).
# NOTE(review): presumably softmax probabilities — confirm the activation of
# the final Dense layer.
y_pred

array([[8.2307030e-05, 4.5853420e-05, 8.4044850e-03, 6.2933053e-05,
        9.9140447e-01],
       [2.5017568e-04, 9.9825841e-01, 2.6571625e-04, 6.0494238e-04,
        6.2085706e-04],
       [3.5216097e-02, 9.1949189e-01, 4.8590139e-03, 7.1695447e-03,
        3.3263542e-02],
       ...,
       [9.9509132e-01, 5.2229234e-04, 1.6647789e-03, 9.2292245e-04,
        1.7986781e-03],
       [7.2153729e-01, 1.9226962e-01, 1.1872552e-02, 8.4717432e-03,
        6.5848641e-02],
       [4.4887530e-04, 9.9444604e-01, 1.2335139e-03, 2.2520435e-04,
        3.6464219e-03]], dtype=float32)

In [None]:
# Collapse per-class scores and one-hot rows to class ids. The ndim guards
# make the cell safe to re-run: y_pred and labels are rebound to 1-D arrays
# here, so a second unconditional argmax over axis=1 raised an AxisError.
if y_pred.ndim > 1:
    y_pred = np.argmax(y_pred, axis=1)
if labels.ndim > 1:
    labels = np.argmax(labels, axis=1)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(labels, y_pred)
print(cm)

[[5156  155   45  110  311]
 [ 205 2228   27   45  265]
 [ 158   63  719   55  173]
 [ 312  114   28  848  180]
 [ 505  174  124   79 2684]]


In [None]:
# Share of test samples whose predicted class id equals the true class id.
accuracy = accuracy_score(y_true=labels, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7881189460136828


In [None]:
from sklearn import metrics


def print_report(y_pred, y_test):
    """Print a per-class classification report followed by overall accuracy.

    Note the argument order: predictions come first and ground truth second,
    which is the reverse of the scikit-learn functions called inside.

    Args:
        y_pred: Predicted class ids, one per sample.
        y_test: True class ids, one per sample.
    """
    print(metrics.classification_report(y_test, y_pred))
    acc = metrics.accuracy_score(y_test, y_pred)
    print("accuracy: {:0.3f}".format(acc))


print_report(y_pred, labels)

              precision    recall  f1-score   support

           0       0.81      0.89      0.85      5777
           1       0.81      0.80      0.81      2770
           2       0.76      0.62      0.68      1168
           3       0.75      0.57      0.65      1482
           4       0.74      0.75      0.75      3566

    accuracy                           0.79     14763
   macro avg       0.78      0.73      0.75     14763
weighted avg       0.79      0.79      0.78     14763

accuracy: 0.788
