In [None]:
# Download the training CSV from Google Drive by file ID; the recorded run saved it as train.csv.
# NOTE(review): newer gdown releases reportedly deprecate `--id` in favour of a positional ID — confirm against the installed version.
!gdown --id 1UhIF3TqmBxWGJIj6vNsYUx0iLiNnOd0q

Downloading...
From: https://drive.google.com/uc?id=1UhIF3TqmBxWGJIj6vNsYUx0iLiNnOd0q
To: /content/train.csv
100% 18.3M/18.3M [00:00<00:00, 93.2MB/s]


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM, Flatten
from tensorflow.keras.models import Sequential
import pickle


In [None]:
# Load the downloaded dataset; the preview below shows an 'Unnamed: 0' index column plus 'text' and 'dialect'.
train_df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to check the column layout.
train_df.head()

Unnamed: 0,text,dialect
0,قولتله اصلي عياده الدكتور اهيه فوقينا اتكسف...,EG
1,متحاولش تبقي مبهور بحد مشفتوش وهو بياكل وهو صا...,EG
2,كنك متردش عليا ؟ والله كنت ماسك النقال وشفت ...,LY
3,بتزيح او بقعد بحضنك,LB
4,مفيش حد كامل من كل شء مفيش حد مش بيغلط مغيش حد...,EG


In [None]:
# Pull the raw inputs and targets out of the DataFrame as NumPy arrays.
# Casting to str guarantees every entry handed to the tokenizer is a string.
texts = train_df['text'].to_numpy().astype(str)
labels = train_df['dialect'].to_numpy()

In [None]:
# List the distinct dialect codes so the label mapping can cover all of them.
train_df['dialect'].unique()


array(['EG', 'LY', 'LB', 'SD', 'MA'], dtype=object)

In [None]:

# Map string dialect codes to integer class ids.
label_map = {'EG': 0, 'LB': 1, 'MA': 2, 'SD': 3, 'LY': 4}
# Read the codes from the DataFrame rather than from `labels` itself, so this
# cell can be re-run safely: rebuilding `labels` from its own previous value
# raised KeyError on the second run because the entries were already integers.
# An unknown dialect code still raises KeyError, as before.
labels = [label_map[dialect] for dialect in train_df['dialect']]


In [None]:
# Fit a Keras tokenizer on the whole corpus, with the vocabulary capped via num_words=5000.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)

# word_index holds every distinct token seen during fitting, not only the
# capped vocabulary (the recorded run reports 211859 entries).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)


Found 211859 unique tokens.


In [None]:
# Persist the fitted tokenizer so the same word-to-id mapping can be reloaded later.
tokenizer_path = 'tokenizer.pkl'
with open(tokenizer_path, mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Bring every sequence to the length of the longest one so they stack into a
# single 2-D array, and turn the label list into a NumPy array for indexing.
labels = np.asarray(labels)
max_sequence_length = max(map(len, sequences))
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [None]:
# Split data into train and test sets (80% / 20%).
# A seeded generator makes the split reproducible across kernel restarts.
rng = np.random.default_rng(42)
indices = rng.permutation(data.shape[0])
num_validation_samples = int(0.2 * data.shape[0])

# Slice with a positive boundary: with negative slicing, a hold-out size of 0
# would make `[:-0]` empty and `[-0:]` the whole array.
split_at = data.shape[0] - num_validation_samples
train_indices = indices[:split_at]
test_indices = indices[split_at:]

# Index through the permutation instead of reshuffling `data`/`labels` in
# place, so re-running this cell always yields the same split.
x_train = data[train_indices]
y_train = labels[train_indices]
x_test = data[test_indices]
y_test = labels[test_indices]


In [None]:
# Convert integer labels to one-hot encoded vectors.
# num_classes is pinned to the size of label_map so both splits get the same
# width even if one of them happens to lack the highest class id; without it,
# to_categorical infers the width from the largest label present in each array.
num_classes = len(label_map)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

In [None]:
# Build model: embedding -> 1-D convolution -> max pooling -> LSTM -> softmax.
model = Sequential([
    # Input dimension 5000 matches the tokenizer's num_words setting.
    Embedding(5000, 128, input_length=max_sequence_length),
    Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per dialect in label_map.
    Dense(5, activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 3 epochs. The held-out split is passed as validation data, and the
# same x_test/y_test arrays are used again by the evaluation cell.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_data=(x_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1bc01ad7b0>

In [None]:
# Score the trained model on the held-out split and report accuracy as a percentage.
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=0)
print('Accuracy: %f' % (100 * accuracy))

Accuracy: 78.688091


In [None]:
# Save the trained model to 'DL3.h5' in the working directory.
model.save('DL3.h5')