In [66]:
from glob import glob
from pathlib import Path

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from utilities import *

# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500


### Params

In [21]:
# Random state shared by every train_test_split call in this notebook.
seed = 42
# Fraction of each training frame held out for the per-classifier test report.
test_size = 0.15

### Import data

In [2]:
# Corpus name -> loader function (the loaders come from the `utilities` star import).
_corpus_loaders = {
    "smadc": get_SMADC_folder_data,
    "annotated": get_annotated_data_folder_data,
    "dart": get_dart_folder_data,
    "aoc": get_arabic_dialects_dataset_folder_data,
}

# Call every loader once, in the order listed above.
# NOTE(review): `data` is not referenced by any later cell visible in this notebook.
data = {corpus: load() for corpus, load in _corpus_loaders.items()}

### Import lexicons

In [88]:
# Add dialects
# Lexicon name -> DataFrame with the "Text" and "Emotion" columns.
# The four expected names are pre-registered as None so that a missing CSV
# leaves a visible placeholder instead of a missing key.
lx = {
    "EGY_EMOTIONS": None,
    "GLF_EMOTIONS": None,
    "EGY_UNKOWN": None,
    "GLF_UNKOWN": None,
}
# Match CSV files only and read the path glob returned: rebuilding the path
# as "<stem>.csv" would try to open a non-existent file for any other entry
# in the folder (sub-directory, notes file, different extension).
# sorted() makes the load order independent of the filesystem.
for path in sorted(glob("counter_lexicon/*.csv")):
    name = Path(path).stem
    lx[name] = pd.read_csv(path)[["Text", "Emotion"]]

### Validation set

In [89]:
# Hold out 10% of each emotion lexicon as a validation set and keep the
# remaining 90% in `lx` for training.
# NOTE: this cell overwrites lx[...], so running it a second time splits the
# already-reduced frame again; re-run the lexicon import cell first.
_egy_train, EGY_EMOTIONS_VALIDATION = train_test_split(
    lx["EGY_EMOTIONS"],
    test_size=.1,
    random_state=seed,
    shuffle=True,
)
lx["EGY_EMOTIONS"] = _egy_train

_glf_train, GLF_EMOTIONS_VALIDATION = train_test_split(
    lx["GLF_EMOTIONS"],
    test_size=.1,
    random_state=seed,
    shuffle=True,
)
lx["GLF_EMOTIONS"] = _glf_train

### Training and results

In [80]:
def generate_classifier(
    df: pd.DataFrame,
    test_size: float=test_size,
    seed: int=seed
    ) -> Pipeline:
    """Fit a bag-of-words Naive Bayes classifier and print its test report.

    The frame is split into train/test parts, a ``CountVectorizer`` ->
    ``MultinomialNB`` pipeline is fitted on the training part, and a
    ``classification_report`` for the test part is printed.

    Args:
        df: Frame with a "Text" column (model input) and an "Emotion"
            column (target label).
        test_size: Fraction of ``df`` held out for the printed report.
            The default is the module-level ``test_size`` captured when this
            function was defined.
        seed: ``random_state`` for the shuffled split. The default is the
            module-level ``seed`` captured when this function was defined.

    Returns:
        The fitted pipeline.
    """
    train, test = train_test_split(df, test_size=test_size, random_state=seed, shuffle=True)

    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB()),
    ])
    clf.fit(train["Text"], train["Emotion"])

    # Report
    preds = clf.predict(test["Text"])
    # zero_division=0 keeps the 0.0 that is reported for labels the model
    # never predicts, but without the UndefinedMetricWarning that the default
    # ("warn") emits for each such label.
    print(classification_report(test["Emotion"], preds, digits=4, zero_division=0))

    return clf

###### Classifier scores alone

In [81]:
# Train on the GLF lexicon only (its validation rows were split off earlier)
# and print the report for the internal test split.
GLF_clf = generate_classifier(lx["GLF_EMOTIONS"])

              precision    recall  f1-score   support

       ANGER     0.8843    0.6903    0.7754       155
     DISGUST     1.0000    0.3793    0.5500        29
        FEAR     1.0000    0.4194    0.5909        31
         JOY     0.8827    0.9251    0.9034       187
     SADNESS     0.7985    0.8359    0.8168       256
    SURPRISE     0.5920    0.8240    0.6890       125

    accuracy                         0.7931       783
   macro avg     0.8596    0.6790    0.7209       783
weighted avg     0.8181    0.7931    0.7900       783



In [131]:
# Train on the EGY lexicon only (its validation rows were split off earlier)
# and print the report for the internal test split.
EGY_clf = generate_classifier(lx["EGY_EMOTIONS"])

              precision    recall  f1-score   support

       ANGER     0.9754    0.9957    0.9855      2349
     DISGUST     0.9508    0.8056    0.8722        72
        FEAR     0.0000    0.0000    0.0000         1
         JOY     0.9500    0.7451    0.8352        51
     SADNESS     0.9020    0.6216    0.7360        74
    SURPRISE     1.0000    0.2500    0.4000         4

    accuracy                         0.9730      2551
   macro avg     0.7964    0.5697    0.6381      2551
weighted avg     0.9717    0.9730    0.9707      2551



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###### Classifier scores combined

In [128]:
# Stack the GLF and EGY training lexicons (GLF first) and train one
# dialect-agnostic classifier on the result.
_training_frames = [lx[lexicon] for lexicon in ("GLF_EMOTIONS", "EGY_EMOTIONS")]
combined_df = pd.concat(_training_frames)
general_clf = generate_classifier(combined_df)

              precision    recall  f1-score   support

       ANGER     0.9568    0.9772    0.9669      2496
     DISGUST     0.9608    0.5269    0.6806        93
        FEAR     1.0000    0.2105    0.3478        38
         JOY     0.9158    0.8409    0.8768       220
     SADNESS     0.8060    0.7749    0.7902       311
    SURPRISE     0.5000    0.7526    0.6008        97

    accuracy                         0.9201      3255
   macro avg     0.8566    0.6805    0.7105      3255
weighted avg     0.9267    0.9201    0.9176      3255



### Scores on validation set

In [142]:
# Score both classifiers on the same held-out GLF validation rows.
_glf_val_text = GLF_EMOTIONS_VALIDATION["Text"]
_glf_val_emotion = GLF_EMOTIONS_VALIDATION["Emotion"]

general_score = general_clf.score(_glf_val_text, _glf_val_emotion)
GLF_score = GLF_clf.score(_glf_val_text, _glf_val_emotion)

print(
    f"""General emotion classifier Vs. GLF specific classifier on validation set:
    \tgeneral_score: {general_score}
    \tGLF_score: {GLF_score}
    """
)

General emotion classifier Vs. GLF specific classifier on validation set:
    	general_score: 0.764367816091954
    	GLF_score: 0.789272030651341
    


In [143]:
# Score both classifiers on the same held-out EGY validation rows.
# Note: `general_score` is reassigned here; it now refers to the EGY validation set.
_egy_val_text = EGY_EMOTIONS_VALIDATION["Text"]
_egy_val_emotion = EGY_EMOTIONS_VALIDATION["Emotion"]

general_score = general_clf.score(_egy_val_text, _egy_val_emotion)
EGY_score = EGY_clf.score(_egy_val_text, _egy_val_emotion)

print(
    f"""General emotion classifier Vs. EGY specific classifier on validation set:
    \tgeneral_score: {general_score}
    \tEGY_score: {EGY_score}
    """
)

General emotion classifier Vs. EGY specific classifier on validation set:
    	general_score: 0.9650793650793651
    	EGY_score: 0.9703703703703703
    


### Validation set CSV generation

In [118]:
def generate_output_df(
    dataset: pd.DataFrame,
    clf: "Pipeline"
) -> pd.DataFrame:
    """Tabulate a classifier's predictions and class probabilities for a dataset.

    Args:
        dataset: Frame with a "Text" column (model input) and an "Emotion"
            column (true label).
        clf: Fitted classifier exposing ``classes_``, ``predict`` and
            ``predict_proba`` (the notebook passes fitted pipelines).

    Returns:
        A frame indexed 0..n-1 with the columns "Original text",
        "True label", "Predicted label" and one "<class> probability"
        column per entry of ``clf.classes_``, in that order.
    """
    texts = dataset["Text"]

    # Build whole columns at once instead of appending one row at a time with
    # .loc, which re-allocates on every insert and leaves every column as
    # object dtype. to_numpy() discards the dataset's own index so the
    # result is numbered from 0, as before.
    labels_df = pd.DataFrame({
        "Original text": texts.to_numpy(),
        "True label": dataset["Emotion"].to_numpy(),
        "Predicted label": clf.predict(texts),
    })
    probabilities_df = pd.DataFrame(
        clf.predict_proba(texts),
        columns=[f"{label} probability" for label in clf.classes_],
    )

    return pd.concat([labels_df, probabilities_df], axis=1)

In [123]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
Path("output_csvs").mkdir(parents=True, exist_ok=True)

# One CSV per (classifier, validation set) pair: each dialect-specific model
# and the general model are evaluated on the same validation rows.
generate_output_df(GLF_EMOTIONS_VALIDATION, GLF_clf).to_csv("output_csvs/GLF_clf__GLF_validation.csv")
generate_output_df(GLF_EMOTIONS_VALIDATION, general_clf).to_csv("output_csvs/general_clf__GLF_validation.csv")
generate_output_df(EGY_EMOTIONS_VALIDATION, EGY_clf).to_csv("output_csvs/EGY_clf__EGY_validation.csv")
generate_output_df(EGY_EMOTIONS_VALIDATION, general_clf).to_csv("output_csvs/general_clf__EGY_validation.csv")

In [125]:
# Row count of the GLF lexicon left in `lx` after the validation rows were split off.
len(lx["GLF_EMOTIONS"])

4695