In [1]:
pip install pandas scikit-learn joblib keras tensorflow

Collecting keras
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.3.3
    Uninstalling keras-3.3.3:
      Successfully uninstalled keras-3.3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.[0m[31m
[0mSuccessfully installed keras-2.15.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# main_dir = '/content/drive/MyDrive/NLP/data'

In [3]:
import pandas as pd
import sqlite3
import re
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model

2024-06-13 09:28:24.342516: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 09:28:24.342622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 09:28:24.472983: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Database loading
class DataLoader:
    def __init__(self, dbfile):
        self.dbfile = dbfile

    def load_data(self):
        conn = sqlite3.connect(self.dbfile)
        data_df = pd.read_sql("SELECT * FROM id_text", conn)
        type_df = pd.read_sql("SELECT * FROM id_dialect", conn)
        conn.close()
        return data_df, type_df


In [5]:
# Text Preprocessing
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.Series(X)
        return X.apply(self.clean_text)

    @staticmethod
    def clean_text(text):
        text = TextCleaner.remove_urls(text)
        text = TextCleaner.remove_mentions(text)
        text = TextCleaner.remove_english_words(text)
        text = TextCleaner.remove_unicode_bmp(text)
        text = TextCleaner.remove_emoji_shortcodes(text)
        text = TextCleaner.remove_specific_punctuation(text)
        text = TextCleaner.remove_complex_patterns(text)
        text = TextCleaner.remove_various_punctuation(text)
        text = TextCleaner.remove_numbers(text)
        text = TextCleaner.remove_extra_spaces(text)
        return text

    @staticmethod
    def remove_urls(text):
        return re.sub(r'http[s]?://\S+', ' ', text)

    @staticmethod
    def remove_mentions(text):
        return re.sub(r'@\w+', ' ', text)

    @staticmethod
    def remove_english_words(text):
        return re.sub(r'\b[a-zA-Z]+\b', ' ', text)

    @staticmethod
    def remove_unicode_bmp(text):
        return re.sub(r'[\U00010000-\U0010ffff]', ' ', text)

    @staticmethod
    def remove_emoji_shortcodes(text):
        return re.sub(r':[a-z_]+:', ' ', text)

    @staticmethod
    def remove_specific_punctuation(text):
        return re.sub(r'[*!?#@]', ' ', text)

    @staticmethod
    def remove_complex_patterns(text):
        return re.sub(r'\|\|+\\s*\d+%\s*\|\|+?[_\-\.\?]+', ' ', text)

    @staticmethod
    def remove_various_punctuation(text):
        return re.sub(r'[_\-\.\"\:\;\,\'\،\♡\\\)/(\&\؟]', ' ', text)

    @staticmethod
    def remove_numbers(text):
        return re.sub(r'\d+', ' ', text)

    @staticmethod
    def remove_extra_spaces(text):
        return ' '.join(text.split())


In [6]:
# Model Training
class TextClassificationModel:
    def __init__(self, model_type='logistic'):
        self.model_type = model_type
        self.pipeline = None

    def build_pipeline(self):
        if self.model_type == 'logistic':
            self.pipeline = Pipeline([
                ('cleaner', TextCleaner()),
                ('vectorizer', CountVectorizer()),
                ('classifier', LogisticRegression())
            ])
        elif self.model_type == 'naive_bayes':
            self.pipeline = Pipeline([
                ('cleaner', TextCleaner()),
                ('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())
            ])

    def train(self, X_train, y_train):
        self.build_pipeline()
        self.pipeline.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        y_pred = self.pipeline.predict(X_test)
        return accuracy_score(y_test, y_pred)

    def save(self, filename):
        joblib.dump(self.pipeline, filename)

    def load(self, filename):
        self.pipeline = joblib.load(filename)

    def predict(self, text):
        return self.pipeline.predict([text])

In [7]:
# Deep Learning Model
class DeepLearningModel:
    def __init__(self, max_len=100):
        self.max_len = max_len
        self.tokenizer = Tokenizer()
        self.label_encoder = LabelEncoder()
        self.model = None

    def build_model(self, input_dim, output_dim):
        self.model = Sequential()
        self.model.add(Embedding(input_dim=input_dim, output_dim=100, input_length=self.max_len))
        self.model.add(LSTM(100, return_sequences=True))
        self.model.add(LSTM(100))
        self.model.add(Dense(100, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(output_dim, activation='softmax'))
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    def preprocess_text(self, texts):
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, padding='post', maxlen=self.max_len)

    def train(self, X_train, y_train, validation_split=0.2, epochs=10, patience=3):
        self.tokenizer.fit_on_texts(X_train)
        X_train_padded = self.preprocess_text(X_train)
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        y_train_categorical = to_categorical(y_train_encoded)
        self.build_model(len(self.tokenizer.word_index)+1, len(self.label_encoder.classes_))
        early_stopping = EarlyStopping(patience=patience)
        self.model.fit(X_train_padded, y_train_categorical, validation_split=validation_split, epochs=epochs, callbacks=[early_stopping])

    def evaluate(self, X_test, y_test):
        X_test_padded = self.preprocess_text(X_test)
        y_test_encoded = self.label_encoder.transform(y_test)
        y_pred = self.model.predict(X_test_padded)
        y_pred_classes = y_pred.argmax(axis=1)
        return accuracy_score(y_test_encoded, y_pred_classes)

    def save(self, model_filename, tokenizer_filename, label_encoder_filename):
        self.model.save(model_filename)
        joblib.dump(self.tokenizer, tokenizer_filename)
        joblib.dump(self.label_encoder, label_encoder_filename)

    def load(self, model_filename, tokenizer_filename, label_encoder_filename):
        self.model = load_model(model_filename)
        self.tokenizer = joblib.load(tokenizer_filename)
        self.label_encoder = joblib.load(label_encoder_filename)

    def predict(self, text):
        cleaned_text = TextCleaner.clean_text(text)
        sequence = self.tokenizer.texts_to_sequences([cleaned_text])
        padded_sequence = pad_sequences(sequence, padding='post', maxlen=self.max_len)
        prediction = self.model.predict(padded_sequence)
        return self.label_encoder.inverse_transform(prediction.argmax(axis=1))

In [8]:
main_dir = '/kaggle/input/arabic-dialect-db'

In [9]:
dbfile = f'{main_dir}/dialects_database.db'
data_loader = DataLoader(dbfile)
data_df, type_df = data_loader.load_data()

X_train, X_test, y_train, y_test = train_test_split(data_df['text'], type_df['dialect'], test_size=0.2, random_state=42)


In [10]:
# Logistic Regression Model
logistic_model = TextClassificationModel(model_type='logistic')
logistic_model.train(X_train, y_train)
print("Logistic Regression Accuracy:", logistic_model.evaluate(X_test, y_test))
logistic_model.save('logistic_model.pkl')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.8313081739719073


In [11]:
# Naive Bayes Model
nb_model = TextClassificationModel(model_type='naive_bayes')
nb_model.train(X_train, y_train)
print("Naive Bayes Accuracy:", nb_model.evaluate(X_test, y_test))
nb_model.save('nb_model.pkl')

Naive Bayes Accuracy: 0.8337112878659672


In [12]:
# Deep Learning Model
dl_model = DeepLearningModel(max_len=100)
dl_model.train(X_train, y_train, epochs=10, patience=3)
print("Deep Learning Model Accuracy:", dl_model.evaluate(X_test, y_test))
dl_model.save('dl_model.h5', 'tokenizer.pkl', 'label_encoder.pkl')


Epoch 1/10


I0000 00:00:1718270989.116724      76 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Deep Learning Model Accuracy: 0.3886952106955492


  saving_api.save_model(


In [13]:
import numpy as np

In [14]:
# Inference Example
new_text ='ليه اوجع ايدي'

# Logistic Regression Inference
logistic_model.load('logistic_model.pkl')
print("Logistic Regression Prediction:", logistic_model.predict(new_text))

# Naive Bayes Inference
nb_model.load('nb_model.pkl')
print("Naive Bayes Prediction:", nb_model.predict(new_text))

# Deep Learning Model Inference
dl_model.load('dl_model.h5', 'tokenizer.pkl', 'label_encoder.pkl')
print("Deep Learning Model Prediction:", dl_model.predict(new_text))


Logistic Regression Prediction: ['EG']
Naive Bayes Prediction: ['EG']
Deep Learning Model Prediction: ['EG']


In [15]:
# Inference Example
new_text ='في ناس مليح اللي ما في متلن'

# Logistic Regression Inference
logistic_model.load('logistic_model.pkl')
print("Logistic Regression Prediction:", logistic_model.predict(new_text))

# Naive Bayes Inference
nb_model.load('nb_model.pkl')
print("Naive Bayes Prediction:", nb_model.predict(new_text))

# Deep Learning Model Inference
dl_model.load('dl_model.h5', 'tokenizer.pkl', 'label_encoder.pkl')
print("Deep Learning Model Prediction:", dl_model.predict(new_text))


Logistic Regression Prediction: ['LY']
Naive Bayes Prediction: ['LY']
Deep Learning Model Prediction: ['EG']
