In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import *

## ML Model (linear classifier trained with SGD)

In [3]:
# Load the pickled train/test features and labels used by the classical ML model.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
X_train, y_train = (
    joblib.load(pkl_path) for pkl_path in ('train_text.pkl', 'train_labels.pkl')
)
X_test, y_test = (
    joblib.load(pkl_path) for pkl_path in ('test_text.pkl', 'test_labels.pkl')
)

In [4]:
# Scale features with MaxAbsScaler.
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits so the test data never influences the scaling.
scale = MaxAbsScaler().fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [5]:
# Fit an SGD-trained linear classifier (default hyper-parameters) on the scaled
# training data.
# SGD shuffles the training samples, so the seed is pinned: without it every
# "Restart & Run All" yields a different model and different accuracy numbers.
clf = SGDClassifier(random_state=42)
clf.fit(X_train, y_train)

In [6]:
# Predict on each split and compute its accuracy, then report both.
y_pred_train = clf.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)

y_pred_test = clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)

print('Train Accuracy: ', acc_train)
print('Test Accuracy: ', acc_test)

In [7]:
# Confusion matrix of the test predictions; rows and columns follow the order
# of the classes learned by the classifier (clf.classes_).
class_labels = clf.classes_
cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Draw onto an explicitly sized axes so the labels stay readable.
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(ax=ax)
plt.show()

In [8]:
# Persist the fitted classifier together with the fitted scaler, so the same
# scaling can be re-applied wherever the classifier is loaded.
joblib.dump(value=clf, filename='dialect_ml_model.pkl')
joblib.dump(value=scale, filename='scaler.pkl')


## DL Model (bidirectional LSTM classifier)

In [10]:
import tensorflow as tf

In [11]:
# Load the preprocessed train / validation / test tables for the deep-learning model.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
train_data_2, valid_data_2, test_data_2 = (
    joblib.load(pkl_path)
    for pkl_path in (
        'preprocessed_train_data.pkl',
        'preprocessed_valid_data.pkl',
        'preprocessed_test_data.pkl',
    )
)

In [12]:
def _split_xy(frame):
    """Return (features, labels): every column but the last, and the last column, as arrays.

    Assumes `frame` supports pandas-style `.iloc` indexing (e.g. a DataFrame) — TODO confirm.
    """
    return frame.iloc[:, :-1].values, frame.iloc[:, -1].values


# Split each table into features and labels.
# NOTE: this rebinds X_train / y_train / X_test / y_test used earlier by the ML section.
X_train, y_train = _split_xy(train_data_2)
X_valid, y_valid = _split_xy(valid_data_2)
X_test, y_test = _split_xy(test_data_2)


In [13]:
# Encode the labels as ordinal codes.
# The encoder is fitted on the training labels only; validation and test labels
# reuse that mapping. OrdinalEncoder works on 2-D input, hence the reshape to a
# single column.
# NOTE: the name `ord` shadows Python's built-in ord(); it is kept because the
# next cell pickles the encoder under this name.
ord = OrdinalEncoder().fit(y_train.reshape(-1, 1))
y_train, y_valid, y_test = (
    ord.transform(labels.reshape(-1, 1)) for labels in (y_train, y_valid, y_test)
)

In [14]:
# Persist the fitted label encoder so the same label-to-code mapping can be reloaded later.
joblib.dump(value=ord, filename='encoder.pkl')

In [None]:
# Vocabulary size, taken from the number of unique words.
VOCAB_SIZE = 369_251

In [None]:
# Sequence classifier: embedding -> bidirectional LSTM -> dense ReLU -> 18-way softmax.
model = tf.keras.models.Sequential()
# Embedding table has VOCAB_SIZE + 1 rows
# (presumably one extra index is reserved, e.g. for padding — TODO confirm).
model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# One output unit per class; 18 is hard-coded
# (presumably the number of distinct labels — verify against the encoder).
model.add(tf.keras.layers.Dense(18, activation='softmax'))

In [None]:
# Configure training: sparse categorical cross-entropy (labels are class indices,
# not one-hot vectors), Adam with its default settings, and accuracy as the metric.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
adam = tf.keras.optimizers.Adam()
model.compile(optimizer=adam, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Training callbacks.
#
# ModelCheckpoint expects a file path, not a directory. With './' older TF
# releases spill a SavedModel (saved_model.pb, variables/, ...) into the working
# directory, and Keras 3 rejects any path that does not end in '.keras' or '.h5'.
# The best model is therefore written to its own HDF5 file, the same format this
# notebook uses for the final saved model.
checkpoint_path = './dialect_dl_checkpoint.h5'

# Stop once validation accuracy has not improved for 3 consecutive epochs and
# restore the weights from the best epoch. mode='max' is spelled out so both
# callbacks agree that higher accuracy is better.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                            mode='max',
                                            patience=3,
                                            restore_best_weights=True)

# Save the model only when validation accuracy reaches a new maximum.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_best_only=True,
                                                 monitor='val_accuracy',
                                                 mode='max',
                                                 verbose=1)

In [None]:
# Train for up to 7 epochs (early stopping may end sooner), validating on the
# validation split after every epoch.
#
# Training runs unconditionally: gating fit() on GPU availability would leave the
# model untrained and `history` undefined on CPU-only machines, which breaks the
# evaluation and plotting cells that follow.
gpus = tf.config.list_physical_devices('GPU')

# NOTE(review): training is pinned to the CPU even when a GPU is detected. That
# placement is kept as-is here — confirm whether GPU training was intended.
with tf.device('/CPU:0'):
    history = model.fit(X_train, y_train, epochs=7, batch_size=128,
                        validation_data=(X_valid, y_valid),
                        callbacks=[callback, cp_callback])

In [None]:
# Evaluate on the held-out test split; the cell output is the loss followed by
# the compiled metric (accuracy).
model.evaluate(x=X_test, y=y_test)

In [None]:
# Persist the trained network to a single HDF5 file.
model.save(filepath='dialect_dl_model.h5')

In [None]:
import matplotlib.pyplot as plt
# Plot training vs. validation accuracy for every completed epoch.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Epochs are numbered from 0; early stopping may make this shorter than the
# requested number of epochs.
epochs = range(len(acc))

# Use one explicit figure/axes pair. Opening another figure before plt.show()
# would render an extra empty plot underneath the real one.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r', label='Training accuracy')
ax.plot(epochs, val_acc, 'b', label='Validation accuracy')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc=0)

plt.show()