# Import Libs

In [37]:
import sys
import os
sys.path.append(os.path.dirname(os.path.realpath("")))

In [38]:
from termcolor import colored
from utilities import *

# Lexicons

In [41]:
egy_joy = ["تزغرط", "هيرضى", "تمزحو", "ميحرمناش", "متشكر", "هتستمتع", "هفرح", "هتضحك", "مريحني"]
egy_anger = ["هكرهك", "تخرس", "اخرس", "جلنف", "بزيئ", "اندال", "ندل", "عرص", "وسخ"]
egy_disgust = ["يمسخكم", "تقرف", "مؤرف", "متقرفوناش", "تقرفينا", "هيقرفه", "صرصار"]
egy_fear = ["هخافميخافش", "مبنخافش"]
egy_sadness = ["حزنان", "مبزعلش", "مخنوق", "اتبكي", "يبكو", "هعيط", "نكدية", "مصحتش", "مضيقاني", "متزعلش", "بتضايقنى"]
egy_surprise = ["ماتوقعوش", "هتفاجئ", "هفاجئك", "اتفاجئنا"]

glf_joy = ["ينحب", "ماقصرت", "سالخير", "مشكور", "لاهنتوا", "عقبالك", "يهنيكم", "نكتة"]
glf_anger = ["مصخره", "كريه", "متخلف", "مبزره", "كرهي", "لطخ", "حقير", "يقهر", "عمى", "يلعن", "غبي", "لعنبو", "يلعنم", "شخصنة"]
glf_disgust = ["يغث", "لطخ", "بذيء", "أشمط", "جرذ", "جرثوم", "وصخ", "حثاله", "ازق"]
glf_fear = ["عجيب", "مفجوع", "مروع", "متخوفة", "مخوف", "خوفتني", "خايفين", "منغص"]
glf_sadness = ["ضايقني", "شكى", "يبكيني", "ضايق", "قهر", "تبكيك", "يخذلك", "خنق", "خذلان", "قسى", "يحزني", "أحزن", "أبكى", "غثيث", "حز", "فاقدن", "أفتك"]
glf_surprise = ["هول", "مهول", "ترويع", "أعجب", "مفاجئه", "هبل", "فجأه"]

egy_lexicon_dict = {"JOY" : egy_joy, "ANGER" : egy_anger, "DISGUST" : egy_disgust, "FEAR" : egy_fear, "SADNESS" : egy_sadness, "SURPRISE" : egy_surprise}
glf_lexicon_dict = {"JOY" : glf_joy, "ANGER" : glf_anger, "DISGUST" : glf_disgust, "FEAR" : glf_fear, "SADNESS" : glf_sadness, "SURPRISE" : glf_surprise}

# Utility

In [42]:
def get_lexicons(text, lexicon_list):
    text_lexicon = []
    for lexicon in lexicon_list:
        if lexicon in text:
            text_lexicon.append(lexicon)
    return " ".join(text_lexicon)

def contains_lexicon(text, lexicons):
    for lexicon in lexicons:
        if lexicon in text:
            return True
    return False

def get_emotion(text, lexicon_dict):
    emotion_counter = {"JOY" : 0, "ANGER" : 0, "DISGUST" : 0, "FEAR" : 0, "SADNESS" : 0, "SURPRISE" : 0}
    
    for emotion in list(emotion_counter.keys()):
        for lexicon in lexicon_dict[emotion]:
            if lexicon in text:
                emotion_counter[emotion] += 1

    emotion_counter_sorted = {k: v for k, v in sorted(emotion_counter.items(), key=lambda item: item[1], reverse=True)}
    emotion_counter_sorted_values = list(emotion_counter_sorted.values())
    if emotion_counter_sorted_values[0] == emotion_counter_sorted_values[1]:
        return "UNKNOWN"
    return list(emotion_counter_sorted.keys())[0]

def display_lexicons(df, glf=False):
    if glf:
        for text in df[df["Contains_Lexicon"]].sample(20)["Text"]:
            formattedText = []
            for word in text.split():
                if any(lexicon in word for lexicon in glf_joy):
                    formattedText.append(colored(word,'white','on_green'))
                elif any(lexicon in word for lexicon in glf_anger):
                    formattedText.append(colored(word,'white','on_red'))
                else: 
                    formattedText.append(word)
            print(" ".join(formattedText))
            print("-----------------")
    else:
        for text in df[df["Contains_Lexicon"]].sample(20)["Text"]:
            formattedText = []
            for word in text.split():
                if any(lexicon in word for lexicon in egy_joy):
                    formattedText.append(colored(word,'white','on_green'))
                elif any(lexicon in word for lexicon in egy_anger):
                    formattedText.append(colored(word,'white','on_red'))
                else: 
                    formattedText.append(word)
            print(" ".join(formattedText))
            print("-----------------")

def process(df, lexicon_dict):
    df["Contains_Lexicon"] = df["Text"].apply(contains_lexicon, args=([val for lst in lexicon_dict.values() for val in lst],))
    df["Lexicon"] = df["Text"].apply(get_lexicons, args=([val for lst in lexicon_dict.values() for val in lst],))
    df["Emotion"] = df["Text"].apply(get_emotion, args=(lexicon_dict,))
    return df

## SMADC Dataset

In [43]:
df = get_SMADC_folder_data(".")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1408456 entries, 0 to 256631
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Text    1408456 non-null  object
 1   Region  1408456 non-null  object
dtypes: object(2)
memory usage: 32.2+ MB


In [44]:
df_egy = df[df["Region"] == "EGY"]
df_glf = df[df["Region"] == "GLF"]

### EGY Test

In [45]:
df_egy = process(df_egy, egy_lexicon_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Contains_Lexicon"] = df["Text"].apply(contains_lexicon, args=([val for lst in lexicon_dict.values() for val in lst],))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Lexicon"] = df["Text"].apply(get_lexicons, args=([val for lst in lexicon_dict.values() for val in lst],))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [46]:
df_egy_unknown = df_egy[df_egy["Emotion"] == "UNKNOWN"]
df_egy = df_egy[df_egy["Emotion"] != "UNKNOWN"]

In [47]:
df_egy.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Emotion,Unnamed: 1_level_1
ANGER,17410
DISGUST,591
SADNESS,536
JOY,308
SURPRISE,36
FEAR,10


In [48]:
df_egy_unknown.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Emotion,Unnamed: 1_level_1
UNKNOWN,603176


In [49]:
df_egy.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Lexicon,Unnamed: 1_level_1
وسخ,9608
عرص,6630
عرص وسخ,620
تقرف,507
اخرس,240
متزعلش,174
ندل,170
مخنوق,162
متشكر,138
يبكو,88


In [None]:
display_lexicons(df_egy)

In [51]:
df_egy.to_csv("EGY_EMOTIONS.csv")
df_egy_unknown.to_csv("EGY_UNKOWN.csv")

### GLF Test

In [52]:
df_glf = process(df_glf, glf_lexicon_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Contains_Lexicon"] = df["Text"].apply(contains_lexicon, args=([val for lst in lexicon_dict.values() for val in lst],))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Lexicon"] = df["Text"].apply(get_lexicons, args=([val for lst in lexicon_dict.values() for val in lst],))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [53]:
df_glf_unknown = df_glf[df_glf["Emotion"] == "UNKNOWN"]
df_glf = df_glf[df_glf["Emotion"] != "UNKNOWN"]

In [54]:
df_glf_unknown.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Emotion,Unnamed: 1_level_1
UNKNOWN,175001


In [55]:
df_glf.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Emotion,Unnamed: 1_level_1
SADNESS,1701
JOY,1318
ANGER,924
SURPRISE,830
FEAR,286
DISGUST,158


In [56]:
df_glf.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

Unnamed: 0_level_0,Text
Lexicon,Unnamed: 1_level_1
حز,1085
مشكور,866
هول,509
ماقصرت,318
غبي,278
...,...
شكى حز,1
خذلان قسى,1
خذلان حز,1
حقير غبي,1


In [None]:
display_lexicons(df_glf, glf=True)

In [None]:
df_glf.to_csv("GLF_EMOTIONS.csv")
df_glf_unknown.to_csv("GLF_UNKOWN.csv")

## Annotated Dataset

In [None]:
df = get_annotated_data_folder_data("..")
df.info()

In [None]:
df.dropna(inplace=True)

In [None]:
df_egy = df[df["Region"] == "EGY"]
df_glf = df[df["Region"] == "GLF"]

### EGY Test

In [None]:
df_egy = process(df_egy, egy_joy, egy_anger)

In [None]:
df_egy.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
df_egy.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
display_lexicons(df_egy)

### GLF Test

In [None]:
df_glf = process(df_glf, glf_joy, glf_anger)

In [None]:
df_glf.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
df_glf.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
display_lexicons(df_glf, glf=True)

## DART Dataset

In [None]:
df = get_dart_folder_data("..")
df.info()

In [None]:
df.dropna(inplace=True)

In [None]:
df_egy = df[df["Region"] == "EGY"]
df_glf = df[df["Region"] == "GLF"]

### EGY Test

In [None]:
df_egy = process(df_egy, egy_joy, egy_anger)

In [None]:
df_egy.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
df_egy.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
display_lexicons(df_egy)

### GLF Test

In [None]:
df_glf = process(df_glf, glf_joy, glf_anger)

In [None]:
df_glf.groupby(["Emotion"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
df_glf.groupby(["Lexicon"]).agg({"Text" : "count"}).sort_values("Text", ascending=False)

In [None]:
display_lexicons(df_glf, glf=True)