In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, name) for name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install Arabic-Stopwords
!pip install arabic-reshaper
!pip install python-bidi
!pip install livelossplot

In [3]:
import pandas as pd


def read_csv(file_path):
    """Load the CSV at ``file_path`` into a DataFrame using pandas' default settings."""
    frame = pd.read_csv(file_path)
    return frame


def read_txt(file_path):
    """Return the distinct lines of a UTF-8 text file as a set.

    Lines keep their trailing newline, exactly as the file iterator yields them.

    Args:
        file_path: Path of the text file to read.

    Returns:
        set[str]: The unique lines of the file.
    """
    # The context manager closes the handle deterministically; the previous
    # bare open(...) left that to the garbage collector.
    with open(file_path, encoding='utf-8') as handle:
        return set(handle)


def read_arabic_csv(file_path):
    """Read a UTF-8 CSV using '\\n' as the line terminator and strip stray '\\r' characters.

    Carriage returns are removed from the column names and, via a regex
    replace, from the cell values of the returned DataFrame.
    """
    raw = pd.read_csv(file_path, lineterminator='\n', encoding='utf-8')
    renamed = raw.rename(columns=lambda name: name.replace('\r', ''))
    return renamed.replace({r'\r': ''}, regex=True)



In [4]:
import pandas as pd
import requests
from requests.packages import urllib3
import json
from os.path import join
# Silence urllib3's InsecureRequestWarning; get_tweets_api below posts with verify=False,
# which would otherwise emit this warning on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# Endpoint that get_tweets_api posts the list of tweet ids to.
url = 'https://recruitment.aimtechnologies.co/ai-tasks'


def get_tweets_api(ids_list, timeout=60):
    """POST a batch of tweet ids to ``url`` and return the decoded JSON response.

    Args:
        ids_list: JSON-serialisable list of ids; sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # NOTE(review): verify=False disables TLS certificate validation. It is kept
    # because the original request relied on it — confirm whether the endpoint's
    # certificate can be verified and drop it if so.
    response = requests.post(url,
                             headers={'Content-Type': 'application/json'},
                             data=json.dumps(ids_list),
                             verify=False,
                             timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page as data.
    response.raise_for_status()
    return response.json()


def get_dataset_df(df, save_directory_path="Dataset", batch_size=1000):
    """Fetch the text of every tweet id in ``df`` and save an Id/Dialect/Text CSV.

    The ids are sent to ``get_tweets_api`` in consecutive batches and the
    result is written to ``<save_directory_path>/csv_text_dataset.csv``.

    Args:
        df: DataFrame with an "id" and a "dialect" column.
        save_directory_path: Directory for the output CSV; created when it
            does not exist. An empty string means the current directory.
        batch_size: Number of ids sent per API request.

    Returns:
        None. The function's product is the CSV file written to disk.
    """
    id_list = []
    dialect_list = []
    text_list = []
    # Positional, half-open batches: every row is requested exactly once.
    # The previous label-based inclusive slice restarted each batch on the last
    # row of the one before it, duplicating rows and dropping the trailing ones.
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        ids_list = list(map(str, batch["id"].values))
        json_dataset = get_tweets_api(ids_list)
        id_list.extend(ids_list)
        dialect_list.extend(list(batch["dialect"].values))
        # NOTE(review): assumes the response maps every requested id to its text,
        # in request order — confirm against the API, otherwise rows misalign.
        text_list.extend(json_dataset.values())

    res_df = pd.DataFrame(list(zip(id_list, dialect_list, text_list)),
                          columns=['Id', 'Dialect', "Text"])
    # Make sure the target directory exists so the download is not lost at the very end.
    if save_directory_path:
        os.makedirs(save_directory_path, exist_ok=True)
    csv_file = join(save_directory_path, "csv_text_dataset.csv")
    res_df.to_csv(csv_file, index=False, encoding='utf-8')


In [5]:
import re
from string import punctuation
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download("stopwords")
import arabicstopwords.arabicstopwords as stp
# Extend string.punctuation with the Arabic comma, semicolon and question mark and the
# curly double quotes, so remove_punctuation strips them as well.
punctuation += '،؛؟”“'
# English stop words as a set; remove_stop_words checks membership in it for every token.
stop_words = set(stopwords.words('english'))


def remove_emoji(text):
    """Delete emoji and other symbol characters from ``text``.

    The character class is deliberately broad: besides the emoji blocks it
    spans U+24C2 through U+1F251 and every supplementary-plane code point, so
    most non-Latin scripts located above U+24C2 are removed too. Arabic
    letters (U+0600 to U+06FF) lie below these ranges and are kept.
    """
    char_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing through miscellaneous symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (repeated in the original pattern)
        u"\U000024C2-\U0001F251",  # circled letter M through enclosed ideographs
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # all supplementary planes
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",  # zero-width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",  # variation selector-16
        u"\u3030",
    )
    symbol_re = re.compile("[" + "".join(char_ranges) + "]+", flags=re.UNICODE)
    return symbol_re.sub('', text)


def remove_email(text):
    """Remove every e-mail address (``local@domain.tld``) from ``text``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each matched address deleted; surrounding
        whitespace is left untouched.
    """
    # Raw string: the previous plain literal contained '\.', which is an invalid
    # string escape and triggers a warning on newer Python versions.
    return re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', '', text)


def remove_repeated_char(text):
    """Shrink any run of three or more identical characters down to two."""
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(lambda run: run.group(1) * 2, text)


def remove_account_tag(text):
    """Strip ``@name`` mentions ('@' followed by word characters) from ``text``."""
    mention = re.compile(r'@\w+')
    return mention.sub('', text)


def remove_hashtag(text):
    """Strip ``#tag`` hashtags ('#' followed by word characters) from ``text``."""
    hashtag = re.compile(r'#\w+')
    return hashtag.sub('', text)


def remove_links(text):
    """Delete every token that starts at 'http' and runs up to the next whitespace."""
    link = re.compile(r'http\S+')
    return link.sub('', text)


def remove_spaces(text):
    """Collapse every run of whitespace (newlines, tabs, carriage returns, spaces) into one space."""
    # '\s' already covers '\n', '\t' and '\r', so a single pass yields the same
    # result as substituting each of them separately first.
    return re.sub(r"\s+", ' ', text)


def remove_tashkeel(text):
    """Strip Arabic diacritics, squeeze repeated characters and drop symbols.

    Three passes, in order:
      1. delete code points U+0617 to U+061A and U+064B to U+0652;
      2. replace every run of two or more identical characters with two;
      3. delete every character that is neither a word character nor whitespace.
    """
    without_marks = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)
    squeezed = re.sub(r'(.)\1+', r"\1\1", without_marks)
    return re.sub(r"[^\w\s]", '', squeezed)


def remove_punctuation(text):
    """Drop every character of ``text`` that occurs in the module-level ``punctuation`` string."""
    # Build the deletion table on each call so it reflects the current ``punctuation``.
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table)


def remove_stop_words(text):
    """Return ``text`` without its stop words, the remaining tokens joined by single spaces.

    A whitespace-separated token is kept only when ``stp.is_stop`` rejects it
    and it is not in the ``stop_words`` set.
    """
    kept_tokens = [
        token for token in text.split()
        if not stp.is_stop(token) and token not in stop_words
    ]
    return " ".join(kept_tokens)


def remove_less_2_characters(text):
    """Delete words of one or two characters, together with any non-word characters right before them."""
    short_word = re.compile(r"\W*\b\w{1,2}\b")
    return short_word.sub('', text)


def preprocess_text_sample(text):
    """Lower-case ``text`` and run it through the cleaning helpers in a fixed order.

    The result is stripped of leading and trailing whitespace. The order of
    the steps matters: each helper receives the output of the previous one.
    """
    cleaned = text.lower()
    cleaning_steps = (
        remove_emoji,
        remove_email,
        remove_account_tag,
        remove_hashtag,
        remove_links,
        remove_less_2_characters,
        remove_repeated_char,
        remove_punctuation,
        remove_tashkeel,
        remove_stop_words,
        remove_spaces,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.strip()


def preprocess_text_cols(df, col):
    """Overwrite ``df[col]`` with the cleaned version of each entry and return ``df``.

    The frame that is passed in is modified in place.
    """
    cleaned_column = df[col].apply(preprocess_text_sample)
    df[col] = cleaned_column
    return df


def preprocess_df(df, col="Text"):
    """Clean the text column of ``df`` and drop unusable rows.

    Rows containing NaN in any column are removed first, so that the text
    cleaning (which calls ``str.lower``) never receives a missing value. Rows
    whose text becomes empty after cleaning are removed as well.

    Args:
        df: Input DataFrame. It is not modified; the work happens on a copy.
        col: Name of the text column to clean.

    Returns:
        pandas.DataFrame: A new frame with ``col`` cleaned, keeping the
        original index labels of the surviving rows.
    """
    # Drop missing values BEFORE cleaning and take an explicit copy, so the
    # column assignment inside preprocess_text_cols neither touches the caller's
    # frame nor triggers SettingWithCopyWarning.
    cleaned = df.dropna().copy()
    cleaned = preprocess_text_cols(cleaned, col)
    return cleaned[cleaned[col] != ""].copy()

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import nltk

# Download the NLTK 'punkt' and 'wordnet' resources.
# NOTE(review): nothing in the visible code references these resources directly —
# confirm they are still needed.
nltk.download('punkt')
nltk.download('wordnet')

# Placeholder passed to the Keras Tokenizer as its out-of-vocabulary token (see get_tokenizer_obj).
oov_tok = "<oov_tok>"


def CountVectorizer_fit(X_train, ngram_range=(1,1)):
    """Create a CountVectorizer with the given n-gram range and return it fitted on ``X_train``."""
    return CountVectorizer(ngram_range=ngram_range).fit(X_train)


def CountVectorizer_transform(count_vect, X):
    """Apply the already fitted ``count_vect`` to ``X`` and return what its ``transform`` produces."""
    counts = count_vect.transform(X)
    return counts


def TfidfTransformer_fit(X_train_counts, use_idf=True):
    """Create a TfidfTransformer (with or without IDF) and return it fitted on the count matrix."""
    return TfidfTransformer(use_idf=use_idf).fit(X_train_counts)


def TfidfTransformer_transform(tf_transformer, X_counts):
    """Apply the already fitted ``tf_transformer`` to ``X_counts`` and return the result."""
    weighted = tf_transformer.transform(X_counts)
    return weighted


def fit_preprocessing_pipeline(X_train, ngram_range=(1,1), use_idf=True):
    """Build a CountVectorizer -> TfidfTransformer pipeline and return it fitted on ``X_train``.

    The steps are registered under the names "vect" and "tfidf".
    """
    steps = [
        ("vect", CountVectorizer(ngram_range=ngram_range)),
        ("tfidf", TfidfTransformer(use_idf=use_idf)),
    ]
    fitted = Pipeline(steps)
    fitted.fit(X_train)
    return fitted


def transform_preprocessing_pipeline(pipeline, X):
    """Run ``X`` through the already fitted ``pipeline`` and return the transformed features."""
    features = pipeline.transform(X)
    return features


def get_max_sequences_len(df, col):
    """Return the largest whitespace-separated token count among the strings in ``df[col]``.

    Raises ValueError when the column is empty (``max`` of an empty sequence).
    """
    token_counts = (len(entry.split()) for entry in df[col].values)
    return max(token_counts)


def get_tokenizer_obj(text_list):
    """Fit a lower-casing, space-splitting Keras Tokenizer on ``text_list``.

    Returns:
        tuple: ``(tokenizer, n_words)`` where ``n_words`` is the size of the
        tokenizer's ``word_index``.
    """
    fitted = Tokenizer(lower=True, split=" ", oov_token=oov_tok)
    fitted.fit_on_texts(text_list)
    n_words = len(fitted.word_index)
    return fitted, n_words


def tokenize_texts_to_sequences(tokenizer, text_list):
    """Convert ``text_list`` with the fitted ``tokenizer`` and return its ``texts_to_sequences`` output."""
    sequences = tokenizer.texts_to_sequences(text_list)
    return sequences


def padding_sequences(x_arr, max_len):
    """Pad the sequences in ``x_arr`` to ``max_len`` with zeros appended at the end ('post')."""
    return pad_sequences(x_arr, maxlen=max_len, value=0, padding='post')


def get_max_statment_len(df, col):
    """Return the highest number of whitespace-separated words found in any entry of ``df[col]``.

    Raises ValueError when the column has no entries.
    """
    word_counts = map(lambda statement: len(statement.split()), df[col])
    return max(word_counts)

In [7]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, \
    classification_report, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score as f1_score_rep
import keras.backend as K
from sklearn.utils import class_weight
from keras.utils.np_utils import to_categorical
from numpy import unique, newaxis
import joblib
import os


from sklearn.model_selection import train_test_split


def split_dataset(df, y_col="", test_size=0.20, with_stratify=True, shuffle=True):
    """Split ``df`` into a training and a validation frame with a fixed seed.

    Args:
        df: DataFrame to split.
        y_col: Label column; only read when ``with_stratify`` is true.
        test_size: Fraction (or absolute number) of rows for the validation part.
        with_stratify: When true, keep the class proportions of ``df[y_col]``
            in both parts. When false, perform a plain (non-stratified) split.
        shuffle: Forwarded to ``train_test_split``. scikit-learn rejects
            ``shuffle=False`` combined with stratification.

    Returns:
        tuple: ``(train, val)`` DataFrames.
    """
    # Both branches used to pass stratify=df[y_col], so with_stratify=False had
    # no effect; pass None to actually disable stratification.
    stratify_labels = df[y_col] if with_stratify else None
    train, val = train_test_split(df,
                                  test_size=test_size,
                                  random_state=1,
                                  stratify=stratify_labels,
                                  shuffle=shuffle)
    return train, val


def get_label_encoder_obj(y):
    """Return a new LabelEncoder fitted on the labels ``y``."""
    return LabelEncoder().fit(y)


def get_y_label_encoder(label_encoder, y):
    """Encode the labels ``y`` with the fitted ``label_encoder`` and return the result."""
    encoded = label_encoder.transform(y)
    return encoded


def get_nb_classes(y):
    """Return how many distinct values ``y`` contains."""
    distinct_labels = unique(y)
    return len(distinct_labels)


def one_hot_encode(y, num_classes):
    """One-hot encode the integer labels ``y`` into ``num_classes`` columns via Keras' ``to_categorical``."""
    encoded = to_categorical(y, num_classes=num_classes)
    return encoded


def f1_score(y_true, y_pred):
    """F1 metric built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before being summed over all
    elements of the tensors; ``K.epsilon()`` guards every division.
    """
    def _rounded_sum(tensor):
        # Count of elements that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_sum(y_true * y_pred)
    actual_positives = _rounded_sum(y_true)
    flagged_positives = _rounded_sum(y_pred)
    eps = K.epsilon()
    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2 * (precision * recall) / (precision + recall + eps)


def get_class_weights(y):
    """Compute 'balanced' class weights for the labels ``y``.

    Returns:
        dict: Weight per class, keyed by the position (0, 1, ...) of the class
        within ``unique(y)`` rather than by the label value itself.
    """
    balanced = class_weight.compute_class_weight('balanced',
                                                 classes=unique(y),
                                                 y=y)
    return dict(enumerate(balanced))


def print_score(y_pred, y_real, label_encoder):
    """Print classification metrics and draw a row-normalised confusion-matrix heatmap.

    Printed, in order: accuracy; micro-averaged precision, recall and F1;
    precision/recall/F-score/support for the macro, micro and weighted
    averages; the confusion-matrix heading (the heatmap goes to a new
    20x20 figure); and the classification report labelled with
    ``label_encoder.classes_``.
    """
    print("Accuracy: ", accuracy_score(y_real, y_pred))
    micro_metrics = (
        ("Precision:: ", precision_score),
        ("Recall:: ", recall_score),
        ("F1_Score:: ", f1_score_rep),
    )
    for caption, metric_fn in micro_metrics:
        print(caption, metric_fn(y_real, y_pred, average="micro"))

    averaged_sections = (
        ("Macro precision_recall_fscore_support (macro) average", "macro"),
        ("Macro precision_recall_fscore_support (micro) average", "micro"),
        ("Macro precision_recall_fscore_support (weighted) average", "weighted"),
    )
    for heading, averaging in averaged_sections:
        print()
        print(heading)
        print(precision_recall_fscore_support(y_real, y_pred, average=averaging))

    print()
    print("Confusion Matrix")
    counts = confusion_matrix(y_real, y_pred)
    # Divide each row by its total so a cell shows the share of that true class.
    row_totals = counts.sum(axis=1)[:, newaxis]
    normalised = counts.astype('float') / row_totals
    class_names = list(label_encoder.classes_)
    df_cm = pd.DataFrame(normalised, index=class_names, columns=class_names)
    plt.figure(figsize=(20, 20))
    sns.heatmap(df_cm, annot=True)

    print()
    print("Classification Report")
    print(classification_report(y_real, y_pred, target_names=label_encoder.classes_))


def get_prediction_results(y_true, y_hat, label_encoder, num_classes):
    """Plot ROC curves and print the score report for a set of predictions.

    One-dimensional ``y_true`` is treated as integer labels and both arrays
    are one-hot encoded for the ROC plot; otherwise copies of the arrays are
    used as they are. ``print_score`` always receives the original arrays.
    """
    labels_are_flat = len(y_true.shape) == 1
    if labels_are_flat:
        true_ohe = one_hot_encode(y_true, num_classes)
        pred_ohe = one_hot_encode(y_hat, num_classes)
    else:
        true_ohe = y_true.copy()
        pred_ohe = y_hat.copy()
    # NOTE(review): ROC_plot is not defined in the visible part of this notebook —
    # confirm it is provided elsewhere, otherwise this call raises NameError.
    ROC_plot(true_ohe, pred_ohe, label_encoder, num_classes)
    print_score(y_hat, y_true, label_encoder)


def predict(model, X_val):
    """Return ``model.predict`` evaluated on ``X_val``."""
    predictions = model.predict(X_val)
    return predictions


def save_model_pkl(model, path_directory, file_name):
    """Serialise ``model`` with joblib to ``<path_directory>/<file_name>``.

    NOTE(review): a later cell defines another ``save_model_pkl`` with a
    different signature, which replaces this one once that cell has run.
    """
    target_path = os.path.join(path_directory, file_name)
    joblib.dump(model, target_path)


def load_model_pkl(file_directory):
    """Load and return the joblib-serialised object stored at ``file_directory``.

    NOTE(review): a later cell defines another ``load_model_pkl`` that takes
    no arguments, which replaces this one once that cell has run.
    """
    restored = joblib.load(file_directory)
    return restored




In [8]:
import numpy as np
import time
import pickle
import os
import tensorflow as tf
import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from livelossplot import PlotLossesKeras
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig


# Files used by the pickle-based save_model_pkl / load_model_pkl helpers in this cell.
# NOTE(review): the backslash is a path separator on Windows only; on POSIX systems each
# of these is a single file name containing a backslash, created in the current
# directory — confirm the intended location.
pickle_inp_path = "Weights\\bert_inp.pkl"
pickle_mask_path = "Weights\\bert_mask.pkl"
pickle_label_path = "Weights\\bert_label.pkl"


def tokenizer_decode(bert_tokenizer, tokenized_sequence):
    """Decode the token ids of an encoded sequence back into text.

    Args:
        bert_tokenizer: Tokenizer object exposing ``decode``.
        tokenized_sequence: Mapping with an ``'input_ids'`` entry.

    Returns:
        Whatever ``bert_tokenizer.decode`` returns for those ids. The value
        was previously computed and then discarded, so callers always got None.
    """
    return bert_tokenizer.decode(tokenized_sequence['input_ids'])


def tokenizer_encode(sentences, labels, max_length):
    """Encode sentences with the 'bert-base-uncased' tokenizer into fixed-length arrays.

    Args:
        sentences: Iterable of strings to encode.
        labels: Labels aligned with ``sentences``.
        max_length: Length every encoded sequence is padded or truncated to
            (special tokens included).

    Returns:
        tuple: ``(input_ids, attention_masks, labels)`` as NumPy arrays.
    """
    input_ids = []
    attention_masks = []
    # NOTE(review): 'bert-base-uncased' is an English checkpoint — confirm it is the
    # intended tokenizer for the Arabic text this notebook feeds in.
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    for sentence in sentences:
        # Explicit padding/truncation replaces the deprecated pad_to_max_length flag
        # and the implicit truncation that max_length alone relied on.
        bert_inp = bert_tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_length,
                                              padding='max_length', truncation=True,
                                              return_attention_mask=True)
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels


def save_model_pkl(input_ids, attention_masks, labels):
    """Pickle the encoded inputs, attention masks and labels to the module-level paths.

    Writes to ``pickle_inp_path``, ``pickle_mask_path`` and
    ``pickle_label_path`` respectively and prints the three paths.

    NOTE(review): this reuses the name of the joblib-based ``save_model_pkl``
    defined in an earlier cell and replaces it once this cell has run.
    """
    targets = ((pickle_inp_path, input_ids),
               (pickle_mask_path, attention_masks),
               (pickle_label_path, labels))
    for path, payload in targets:
        # Close each file as soon as it is written; the handles were previously leaked.
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    print('Pickle files saved as ', pickle_inp_path, pickle_mask_path, pickle_label_path)


def load_model_pkl():
    """Load the pickled inputs, attention masks and labels from the module-level paths.

    Prints the shapes of the three loaded objects.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)``. The loaded objects
        were previously discarded, so callers received None.

    NOTE(review): this reuses the name of the joblib-based ``load_model_pkl``
    defined in an earlier cell and replaces it once this cell has run.
    """
    print('Loading the saved pickle files..')
    loaded = []
    # SECURITY: pickle.load can execute arbitrary code — only load files this
    # notebook wrote itself.
    for path in (pickle_inp_path, pickle_mask_path, pickle_label_path):
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    input_ids, attention_masks, labels = loaded
    print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape, attention_masks.shape,
                                                                               labels.shape))
    return input_ids, attention_masks, labels


def build_bert_model(num_classes):
    """Load 'bert-base-uncased' with a ``num_classes``-way classification head and compile it.

    The model is compiled with sparse categorical cross-entropy on logits,
    sparse categorical accuracy (reported as 'accuracy') and Adam with a
    learning rate of 2e-5.

    Args:
        num_classes: Number of output labels.

    Returns:
        The compiled model.
    """
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
    # summary() prints the table itself and returns None, so passing it to print()
    # used to append a stray "None" after the heading.
    print('\nBert Model')
    bert_model.summary()
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
    bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    return bert_model


def fit_bert_model(bert_model, train_inp, train_mask, train_label,
                   val_inp, val_mask, val_label, weights_dir,
                   batch_size=32, epochs=20, patience=5):
    """Train ``bert_model`` with checkpointing, early stopping and live loss plots.

    The weights with the lowest validation loss are saved to
    ``<weights_dir>/bert_model.h5``; TensorBoard logs go to 'tb_bert'.

    Args:
        bert_model: Compiled model to train.
        train_inp, train_mask, train_label: Training input ids, attention masks and labels.
        val_inp, val_mask, val_label: Validation counterparts.
        weights_dir: Directory in which the best weights are stored.
        batch_size: Samples per gradient step.
        epochs: Maximum number of training epochs.
        patience: Epochs without validation-loss improvement before training stops.

    Returns:
        tuple: ``(bert_model, history)`` where ``history`` is the value returned by ``fit``.
    """
    log_dir = 'tb_bert'
    model_save_path = os.path.join(weights_dir, 'bert_model.h5')
    callbacks = [ModelCheckpoint(filepath=model_save_path,
                                 save_weights_only=True,
                                 monitor='val_loss',
                                 mode='min',
                                 verbose=1,
                                 save_best_only=True),
                 EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=patience),
                 PlotLossesKeras(),
                 keras.callbacks.TensorBoard(log_dir=log_dir)]

    start_time = time.time()
    history = bert_model.fit([train_inp, train_mask], train_label,
                             batch_size=batch_size, epochs=epochs,
                             validation_data=([val_inp, val_mask], val_label),
                             callbacks=callbacks)
    duration = time.time() - start_time
    print("Model take {} S to train ".format(duration))
    return bert_model, history

In [9]:
if __name__ == '__main__':
    # --- Build the text dataset (tweet ids -> tweet texts) ---
    raw_csv_path = "../input/aimtask/dialect_dataset.csv"
    save_directory_path = ""
    csv_file_path = os.path.join(save_directory_path, "csv_text_dataset.csv")
    # Fetching every tweet is slow network work: reuse the CSV from a previous run
    # when it exists. Delete the file to force a fresh download.
    if not os.path.exists(csv_file_path):
        get_dataset_df(read_csv(raw_csv_path), save_directory_path)

    # --- Load and clean ---
    df = read_arabic_csv(csv_file_path)
    df = preprocess_df(df, col="Text")

    # --- Split and encode labels ---
    train, val = split_dataset(df, y_col="Dialect", test_size=0.06, with_stratify=True, shuffle=True)
    # Work on independent copies so the column assignments below do not trigger
    # SettingWithCopyWarning on the frames produced by the split.
    train, val = train.copy(), val.copy()

    label_encoder = get_label_encoder_obj(train["Dialect"])
    train["Dialect"] = get_y_label_encoder(label_encoder, train["Dialect"])
    val["Dialect"] = get_y_label_encoder(label_encoder, val["Dialect"])

    # NOTE(review): this counts whitespace-separated words, while the BERT tokenizer
    # produces word pieces plus special tokens — confirm it is a suitable max_length.
    max_statment_len = get_max_statment_len(train, "Text")

    # --- Tokenise ---
    train_inp, train_mask, train_label = tokenizer_encode(train["Text"], train["Dialect"], max_statment_len)
    val_inp, val_mask, val_label = tokenizer_encode(val["Text"], val["Dialect"], max_statment_len)

    # --- Build and train the model ---
    num_classes = get_nb_classes(train["Dialect"])
    bert_model = build_bert_model(num_classes)
    weights_dir = ""
    bert_model, history = fit_bert_model(bert_model, train_inp, train_mask, train_label,
                                         val_inp, val_mask, val_label, weights_dir)