In [1]:
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from utilities import *

pd.set_option('display.max_rows', 500)


### Import data

In [2]:
# Corpus name -> loader function; every loader is called once, in this order,
# and its result is stored under the corpus name.
_corpus_loaders = {
    "smadc": get_SMADC_folder_data,
    "annotated": get_annotated_data_folder_data,
    "dart": get_dart_folder_data,
    "aoc": get_arabic_dialects_dataset_folder_data,
}
data = {corpus: load() for corpus, load in _corpus_loaders.items()}

### Import lexicons

In [3]:
# Peek at the emotion labels: the lexicon dict is loaded with its keys as the
# index, so after transposing they become the column names shown below.
_lexicon_by_key = pd.DataFrame.from_dict(get_arabic_lexicon_data(), orient="index")
_lexicon_by_key.T.columns.values

array(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
      dtype=object)

In [4]:
# Add dialects
# Each file in dialect_lexicon/ is named "<DIALECT>_<emotion>.<ext>" and holds
# one word per line. Rows are collected in a plain list and the frame is built
# once: the previous per-file column assignment of a ragged [list, str, str]
# triggered NumPy's ragged-sequence deprecation warnings (and is an error on
# NumPy versions where that deprecation has expired), and concatenating inside
# the loop re-copied the frame on every file.
lexicon_rows = []
for path in sorted(glob("dialect_lexicon/*")):  # sorted -> same row order on every run
    dialect, emotion = Path(path).stem.split("_")
    with open(path, "r", encoding="utf8") as file:
        lexicon_rows.extend(
            (line.replace("\n", ""), dialect, emotion) for line in file
        )

# Finish lx (a fresh frame already has a clean 0..n-1 index)
lx = pd.DataFrame(lexicon_rows, columns=["Word", "Dialect", "Emotion"])

# MSA
emotions = {"anger", "joy", "sadness", "disgust", "surprise", "fear"}
MSA = (
    pd.DataFrame.from_dict(get_arabic_lexicon_data(), orient="index")
    .T
    # A set has no stable iteration order; sort it so the melted rows come out
    # in the same order on every kernel start.
    .melt(value_vars=sorted(emotions), var_name="Emotion", value_name="Word")
    .dropna()
)

lx

  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape


Unnamed: 0,Word,Dialect,Emotion
0,هكرهك,EGY,anger
1,تخرس,EGY,anger
2,اخرس,EGY,anger
3,جلنف,EGY,anger
4,بزيئ,EGY,anger
5,اندال,EGY,anger
6,ندل,EGY,anger
7,عرص,EGY,anger
8,وسخ,EGY,anger
9,يمسخكم,EGY,disgust


# Training utils

In [5]:
def generate_classifier(
        lx: pd.DataFrame, 
        MSA: pd.DataFrame, 
        emotion: str, 
        add_MSA_data: bool=False, 
        add_other_data: bool=False
    ) -> Pipeline:
    """Fit a bag-of-words Naive Bayes dialect classifier for one emotion.

    Training samples are the ``Word`` entries of ``lx`` whose ``Emotion``
    equals ``emotion``; the label of each sample is its ``Dialect``.

    Args:
        lx: Dialect lexicon with ``Word``, ``Dialect`` and ``Emotion`` columns.
        MSA: MSA lexicon with ``Word`` and ``Emotion`` columns.
        emotion: Emotion to train for; must be one of the six known emotions.
        add_MSA_data: If True, the MSA words for ``emotion`` are appended once
            per dialect label found in the selected lexicon rows.
        add_other_data: If True, lexicon words of every other emotion are
            appended under the extra label ``"OTHER"``.

    Returns:
        The fitted ``CountVectorizer -> TfidfTransformer -> MultinomialNB``
        pipeline.

    Raises:
        AssertionError: If ``emotion`` is not a known emotion.
    """
    emotions = {"anger", "joy", "sadness", "disgust", "surprise", "fear"}
    assert emotion in emotions, f"{emotion} is not {emotions}"

    # Select by the requested emotion (this used to be hard-coded to "anger",
    # so every other emotion silently produced the anger classifier).
    is_target = lx["Emotion"] == emotion
    x, y = lx.loc[is_target, ["Word", "Dialect"]].values.T

    if add_MSA_data:
        # Add MSA to all dialects (Messes up results!)
        x_msa = MSA.loc[MSA["Emotion"] == emotion, "Word"].values
        dialects = np.unique(y)  # taken before y is extended
        x = np.concatenate((x, np.tile(x_msa, len(dialects))))
        y = np.concatenate((y, np.repeat(dialects, len(x_msa))))

    if add_other_data:
        # Add other data: words of all remaining emotions form one "OTHER" class
        other = lx.loc[~is_target, "Word"].values
        x, y = np.concatenate((x, other)), np.concatenate((y, len(other) * ["OTHER"]))
    
    print("Input data: ", np.unique(y, return_counts=True))

    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])

    clf.fit(x, y)
    return clf

# Results

In [6]:
# Probe inputs passed to predict / predict_proba in the result cells below:
# four single words followed by two longer sentences.
# NOTE(review): in the saved outputs every entry except "هكرهك" receives exactly
# the class-prior probabilities, which suggests none of their tokens (including
# "نذل") are in the fitted vocabulary — confirm against the lexicon files.
test = [
    "غضب",
    "شخص",
    "نذل", # This word is in the GLF lexicon
    "هكرهك", # This word is in the EGY lexicon
    "هو ليه ما حدش بيقولنا إننا بنتكلم لغة جميلة و فيها تعبيرات و أمثال و مفردات في منتهى الذكاء و خفة الدم ... ليه كل اللي بنسمعه إنها  عامية و سوقية و لغة شوارع .. المصري من أحلى لغات الدنيا بالفعل و على ودن المصري هي الأجمل على الإطلاق",
    "مبيشتغلش ماتشين ورا بعض لازم فاصل وللودي عنده زي الدوري",
]


### just lexicon

In [7]:
# Baseline: train on the dialect lexicon rows only.
anger_clf = generate_classifier(lx, MSA, emotion="anger")
predicted_labels = anger_clf.predict(test)
class_probabilities = anger_clf.predict_proba(test)
predicted_labels, class_probabilities

Input data:  (array(['EGY', 'GLF'], dtype=object), array([ 9, 15], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY', 'GLF', 'GLF'], dtype='<U3'),
 array([[0.375     , 0.625     ],
        [0.375     , 0.625     ],
        [0.375     , 0.625     ],
        [0.58646617, 0.41353383],
        [0.375     , 0.625     ],
        [0.375     , 0.625     ]]))

### add OTHER class

In [8]:
# Same as the baseline, plus the words of all other emotions as an "OTHER" class.
anger_clf = generate_classifier(lx, MSA, emotion="anger", add_other_data=True)
predicted_labels = anger_clf.predict(test)
class_probabilities = anger_clf.predict_proba(test)
predicted_labels, class_probabilities

Input data:  (array(['EGY', 'GLF', 'OTHER'], dtype=object), array([ 9, 15, 85], dtype=int64))


(array(['OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER'], dtype='<U5'),
 array([[0.08256881, 0.13761468, 0.77981651],
        [0.08256881, 0.13761468, 0.77981651],
        [0.08256881, 0.13761468, 0.77981651],
        [0.21480536, 0.17027254, 0.61492209],
        [0.08256881, 0.13761468, 0.77981651],
        [0.08256881, 0.13761468, 0.77981651]]))

### add MSA data to lexicon

In [9]:
# Same as the baseline, plus the MSA words appended under every dialect label.
anger_clf = generate_classifier(lx, MSA, emotion="anger", add_MSA_data=True)
predicted_labels = anger_clf.predict(test)
class_probabilities = anger_clf.predict_proba(test)
predicted_labels, class_probabilities

Input data:  (array(['EGY', 'GLF'], dtype=object), array([1101, 1107], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY', 'GLF', 'GLF'], dtype='<U3'),
 array([[0.49949357, 0.50050643],
        [0.4986413 , 0.5013587 ],
        [0.4986413 , 0.5013587 ],
        [0.66621636, 0.33378364],
        [0.4986413 , 0.5013587 ],
        [0.4986413 , 0.5013587 ]]))

### add OTHER class and MSA data to lexicon

In [10]:
# Both augmentations at once: MSA words per dialect plus the "OTHER" class.
anger_clf = generate_classifier(
    lx, MSA, emotion="anger", add_MSA_data=True, add_other_data=True
)
predicted_labels = anger_clf.predict(test)
class_probabilities = anger_clf.predict_proba(test)
predicted_labels, class_probabilities

Input data:  (array(['EGY', 'GLF', 'OTHER'], dtype=object), array([1101, 1107,   85], dtype=int64))


(array(['GLF', 'GLF', 'GLF', 'EGY', 'GLF', 'GLF'], dtype='<U5'),
 array([[0.49750546, 0.49858818, 0.00390636],
        [0.480157  , 0.48277366, 0.03706934],
        [0.480157  , 0.48277366, 0.03706934],
        [0.6299375 , 0.31565421, 0.05440829],
        [0.480157  , 0.48277366, 0.03706934],
        [0.480157  , 0.48277366, 0.03706934]]))

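## Conclusion: inputs with no token in the lexicon vocabulary get exactly the class-prior probabilities, so they are labelled with the majority class (GLF, or OTHER when only the OTHER class is added); only an exact lexicon hit such as "هكرهك" shifts the probabilities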