In [321]:
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from utilities import *

# Let long frames display up to 500 rows before pandas truncates the output.
pd.set_option('display.max_rows', 500)


### Import data

In [28]:
# Load every corpus once and keep the results keyed by a short corpus name.
data = dict(
    smadc=get_SMADC_folder_data(),
    annotated=get_annotated_data_folder_data(),
    dart=get_dart_folder_data(),
    aoc=get_arabic_dialects_dataset_folder_data(),
)

### Import lexicons

In [152]:
# Column labels of the transposed lexicon frame (the top-level keys returned
# by get_arabic_lexicon_data()).
(
    pd.DataFrame
    .from_dict(get_arabic_lexicon_data(), orient="index")
    .T
    .columns
    .values
)

array(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
      dtype=object)

Unnamed: 0,variable,value
0,joy,عبادة
1,joy,عبادة
2,joy,توقير
3,joy,تمجيد
4,joy,تبجيل
...,...,...
10692,anger,قتله
10693,anger,الضحيّة
10694,anger,الشر
10695,anger,العدوات


In [387]:
# Add dialects
# Every file in dialect_lexicon/ is named "<dialect>_<emotion>.<ext>" and is
# read as one word per line. Rows are collected in a plain list and the frame
# is built once: assigning a ragged [[words], dialect, emotion] list to a
# DataFrame triggers NumPy ragged-array warnings (an error on recent NumPy),
# and concatenating inside the loop is quadratic.
rows = []
for path in sorted(glob("dialect_lexicon/*")):  # sorted -> reproducible row order
    dialect, emotion = Path(path).stem.split("_")
    with open(path, "r", encoding="utf8") as file:
        for line in file:
            word = line.strip()
            if word:  # skip blank lines instead of storing empty "words"
                rows.append((word, dialect, emotion))

# Built in one go, so the index is already 0..n-1 (no reset_index needed);
# an empty folder still yields an empty frame with the expected columns.
lx = pd.DataFrame(rows, columns=["Word", "Dialect", "Emotion"])

# MSA
# Long-format MSA lexicon: one (Emotion, Word) row per entry. The emotion
# names are passed to melt() as a sorted list because iterating a set gives
# an order that can change between kernel sessions.
emotions = {"anger", "joy", "sadness", "disgust", "surprise", "fear"}
MSA = (
    pd.DataFrame.from_dict(get_arabic_lexicon_data(), orient="index")
    .T
    .melt(value_vars=sorted(emotions), var_name="Emotion", value_name="Word")
    .dropna()
)

# Finish lx
lx.head(30)

  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape


Unnamed: 0,Word,Dialect,Emotion
0,هكرهك,EGY,anger
1,تخرس,EGY,anger
2,اخرس,EGY,anger
3,جلنف,EGY,anger
4,بزيئ,EGY,anger
5,اندال,EGY,anger
6,ندل,EGY,anger
7,عرص,EGY,anger
8,وسخ,EGY,anger
9,تزغرط,EGY,joy


# Training utils

In [432]:
def generate_classifier(
        lx: pd.DataFrame,
        MSA: pd.DataFrame,
        emotion: str,
        add_MSA_data: bool=False,
        add_other_data: bool=False
    ) -> Pipeline:
    """Train a dialect classifier on the lexicon words of one emotion.

    The training samples are the ``Word`` values of ``lx`` whose ``Emotion``
    equals ``emotion``; the labels are the matching ``Dialect`` values.

    Args:
        lx: Dialect lexicon with ``Word``, ``Dialect`` and ``Emotion`` columns.
        MSA: MSA lexicon with ``Word`` and ``Emotion`` columns. Only read when
            ``add_MSA_data`` is true.
        emotion: Emotion to train on; must be one of anger, joy, sadness,
            disgust, surprise, fear.
        add_MSA_data: If true, the MSA words of ``emotion`` are appended once
            per dialect, each copy labelled with that dialect.
        add_other_data: If true, the ``lx`` words of every other emotion are
            appended under the label ``"OTHER"``.

    Returns:
        A fitted ``Pipeline`` (CountVectorizer -> TfidfTransformer ->
        MultinomialNB) that predicts a dialect label for a word.

    Raises:
        AssertionError: If ``emotion`` is not a known emotion.
        ValueError: If ``lx`` contains no words for ``emotion``.
    """
    emotions = {"anger", "joy", "sadness", "disgust", "surprise", "fear"}
    assert emotion in emotions, f"{emotion} is not {emotions}"

    # Select on the requested emotion (previously "anger" was hard-coded, so
    # the `emotion` argument had no effect).
    selected = lx[lx["Emotion"] == emotion]
    x, y = selected["Word"].values, selected["Dialect"].values

    if len(x) == 0:
        # Without dialect words there are no dialect labels to learn.
        raise ValueError(f"No dialect lexicon entries for emotion {emotion!r}")

    if add_MSA_data:
        # Add MSA to all dialects (Messes up results!)
        x_msa = MSA[MSA["Emotion"] == emotion]["Word"].values
        # np.unique(y) is evaluated once, before y grows inside the loop.
        for dialect in np.unique(y):
            x = np.concatenate((x, x_msa))
            y = np.concatenate((y, len(x_msa) * [dialect]))

    if add_other_data:
        # Add other data: words of every other emotion form the "OTHER" class.
        other = lx[lx["Emotion"] != emotion]["Word"].values
        x = np.concatenate((x, other))
        y = np.concatenate((y, len(other) * ["OTHER"]))

    print("Input data: ", np.unique(y, return_counts=True))

    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])

    clf.fit(x, y)
    return clf

# Results

In [436]:
# Probe words passed to predict()/predict_proba() of each classifier below.
# The first two carry no lexicon annotation here; the last two are noted as
# coming from a specific dialect lexicon.
test = [
    "غضب",
    "شخص",
    "نذل", # This word is in the GLF lexicon
    "هكرهك", # This word is in the EGY lexicon
]


### just lexicon

In [437]:
# Baseline: train on the dialect lexicon only.
anger_clf = generate_classifier(lx=lx, MSA=MSA, emotion="anger")
(anger_clf.predict(test), anger_clf.predict_proba(test))

Input data:  (array(['EGY', 'GLF'], dtype=object), array([ 9, 15], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY'], dtype='<U3'),
 array([[0.375     , 0.625     ],
        [0.375     , 0.625     ],
        [0.375     , 0.625     ],
        [0.58646617, 0.41353383]]))

### add OTHER class

In [438]:
# Dialect lexicon plus an "OTHER" class built from the non-anger words.
anger_clf = generate_classifier(
    lx=lx,
    MSA=MSA,
    emotion="anger",
    add_other_data=True,
)
(anger_clf.predict(test), anger_clf.predict_proba(test))

Input data:  (array(['EGY', 'GLF', 'OTHER'], dtype=object), array([ 9, 15, 37], dtype=int64))


(array(['OTHER', 'OTHER', 'OTHER', 'OTHER'], dtype='<U5'),
 array([[0.14754098, 0.24590164, 0.60655738],
        [0.14754098, 0.24590164, 0.60655738],
        [0.14754098, 0.24590164, 0.60655738],
        [0.30904279, 0.2372039 , 0.45375331]]))

### add MSA data to lexicon

In [439]:
# Dialect lexicon with the MSA anger words appended to every dialect.
anger_clf = generate_classifier(
    lx=lx,
    MSA=MSA,
    emotion="anger",
    add_MSA_data=True,
)
(anger_clf.predict(test), anger_clf.predict_proba(test))

Input data:  (array(['EGY', 'GLF'], dtype=object), array([1101, 1107], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY'], dtype='<U3'),
 array([[0.49949357, 0.50050643],
        [0.4986413 , 0.5013587 ],
        [0.4986413 , 0.5013587 ],
        [0.66621636, 0.33378364]]))

### add OTHER class and MSA data to lexicon

In [440]:
# Both extensions at once: MSA words per dialect and the "OTHER" class.
anger_clf = generate_classifier(
    lx=lx,
    MSA=MSA,
    emotion="anger",
    add_MSA_data=True,
    add_other_data=True,
)
(anger_clf.predict(test), anger_clf.predict_proba(test))

Input data:  (array(['EGY', 'GLF', 'OTHER'], dtype=object), array([1101, 1107,   37], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY'], dtype='<U5'),
 array([[0.49854036, 0.49958355, 0.00187609],
        [0.49042316, 0.49309577, 0.01648107],
        [0.49042316, 0.49309577, 0.01648107],
        [0.648323  , 0.32483981, 0.02683719]]))