In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, concatenate, Bidirectional
from keras.layers import Embedding, SpatialDropout1D, BatchNormalization
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, LSTM, GRU
from keras.callbacks import EarlyStopping
from importlib import import_module
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences
from keras import models
from keras import layers
from keras import losses
from keras import metrics
from keras import optimizers

In [105]:
# Fetch the NLTK 'punkt' resource into the local nltk_data directory.
# NOTE(review): no cell visible in this notebook calls into nltk afterwards —
# presumably it was needed by preprocessing done elsewhere; confirm or drop.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [106]:
# Mount Google Drive at /content/drive so the dataset path used by the
# following read_csv cell resolves. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [107]:
# Location of the input CSV on the mounted Drive; later cells read its
# 'cleaned_tweet' and 'dialect' columns.
dataset_path = "/content/drive/MyDrive/balanced_dataset.csv"
dataset = pd.read_csv(dataset_path)

In [108]:
# Text-encoding hyper-parameters shared by the cells below.
vocab_size = 40000  # passed to Tokenizer(num_words=...) and to the Embedding layer
mx_len = 250        # length every sequence is padded to
emd_dim = 100       # embedding vector size

# Build the word index from every cleaned tweet in the dataset.
tweets = dataset['cleaned_tweet'].values
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(tweets)

word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print('Found %s unique tokens.' % n_unique_tokens)

Found 241352 unique tokens.


In [109]:
# Encode each tweet as a list of integer word ids, then pad them all to
# mx_len so they stack into a single 2-D array.
tweet_sequences = tokenizer.texts_to_sequences(dataset['cleaned_tweet'].values)
X = pad_sequences(tweet_sequences, maxlen=mx_len)
print('Shape of data:', X.shape)

Shape of data: (166428, 250)


In [110]:
# One-hot encode the dialect labels.
# dtype is set explicitly because pandas >= 2.0 returns boolean columns by
# default; a float matrix is what categorical cross-entropy expects.
dialect_onehot = pd.get_dummies(dataset['dialect'], dtype='float32')

# Keep the column order so a predicted class index can be mapped back to its
# dialect label later (class_names[i] is the label of output unit i).
class_names = dialect_onehot.columns.tolist()

y = dialect_onehot.values
print('Shape of label:', y.shape)

Shape of label: (166428, 18)


In [111]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2022)
X_train, X_test, y_train, y_test = split_parts

# Report (features, labels) shapes for the train split, then the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(133142, 250) (133142, 18)
(33286, 250) (33286, 18)


In [112]:
# Architecture: Embedding -> bidirectional GRU -> Conv1D -> global max pool
# -> BatchNorm/Dense head -> softmax over the dialect classes.
# (A parallel BiLSTM + Conv1D branch concatenated with the GRU branch used to
# sit here as commented-out code; it was not part of the built model.)

# One output unit per one-hot label column instead of a hard-coded count.
num_classes = y.shape[1]

inp = Input(shape=(mx_len,))
# input_length uses mx_len so it always agrees with the Input shape above.
x = Embedding(vocab_size, emd_dim, input_length=mx_len)(inp)

# Recurrent encoder followed by a width-2 convolution over its outputs.
x_gru = Bidirectional(GRU(128, return_sequences=True))(x)
x_gru_c1d = Conv1D(64, kernel_size=2, padding='valid', activation='tanh')(x_gru)
x_gru_c1d_gp = GlobalMaxPooling1D()(x_gru_c1d)

# Classification head.
x_f = BatchNormalization()(x_gru_c1d_gp)
x_f = Dense(128, activation='tanh')(x_f)
x_f = BatchNormalization()(x_f)

x_f = Dense(64, activation='tanh')(x_f)
x_f = Dense(num_classes, activation='softmax')(x_f)
model = Model(inputs=[inp], outputs=x_f)

In [113]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# `learning_rate` is the supported keyword; the old `lr` alias only triggers a
# deprecation warning.
# NOTE(review): `decay` is a legacy-optimizer argument — newer TensorFlow
# releases reject it on tf.keras.optimizers.Adam; confirm the pinned version.
my_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.99, decay=0.01)

# Cut the learning rate by 10x when validation accuracy stalls for 3 epochs.
# Monitoring a validation metric (rather than training accuracy) ties the
# schedule to generalisation; it requires validation data to be given to fit().
reduceLR = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=3, verbose=1)

model.compile(loss='categorical_crossentropy', optimizer=my_optimizer, metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would append a stray "None" line).
model.summary()

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 250)]             0         
                                                                 
 embedding_23 (Embedding)    (None, 250, 100)          4000000   
                                                                 
 bidirectional_26 (Bidirecti  (None, 250, 256)         176640    
 onal)                                                           
                                                                 
 conv1d_17 (Conv1D)          (None, 249, 64)           32832     
                                                                 
 global_max_pooling1d_17 (Gl  (None, 64)               0         
 obalMaxPooling1D)                                               
                                                                 
 batch_normalization_20 (Bat  (None, 64)               256

  super(Adam, self).__init__(name, **kwargs)


In [114]:
# Train for 4 epochs with mini-batches of 512, evaluating on the held-out
# split after every epoch.
# NOTE(review): the test split doubles as validation data here, so the final
# test score is not an untouched estimate — consider a separate validation split.
history = model.fit(
    X_train, y_train,
    batch_size=512,
    epochs=4,
    shuffle=True,
    # Keras documents validation_data as a tuple (x_val, y_val); a list is
    # not guaranteed to be unpacked the same way across versions.
    validation_data=(X_test, y_test),
    # The learning-rate scheduler created alongside the optimizer was never
    # passed to fit(), so it had no effect; wire it in.
    callbacks=[reduceLR],
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [115]:
# Persist the trained network as an HDF5 file, relative to the working directory.
# NOTE(review): the fitted tokenizer is not saved in any visible cell; it is
# needed to encode new text for this model — confirm it is persisted elsewhere.
model_path = "DL_model.h5"
model.save(model_path)