In [86]:
import json
import string
from datetime import datetime, date
import re
import os
import sys
import requests
import nltk
import socket
from nltk.corpus import stopwords
import emoji
import pandas as pd
nltk.download('stopwords', quiet=True)


def find_all(a_str, sub):
    """Find all substring occurences (non overlapping)

    Args:
        a_str (string): some string
        sub (string): some substring we want to find within a_str

    Yields:
        int: start index of each occurence, in ascending order. Nothing is
            yielded when sub is empty or does not occur in a_str.
    """
    # An empty needle matches at every position without ever advancing the
    # search cursor, which would make this generator yield forever.
    if not sub:
        return

    step = len(sub)
    start = a_str.find(sub)
    while start != -1:
        yield start
        # Resume the search behind the whole match so hits never overlap.
        start = a_str.find(sub, start + step)

def dict_to_feature_tuples(dict, suffix=""):
    """ Take a dict at end of a feature function and converts it into the tuple format suitable for the dataframe

    Args:
        dict: dictionary containing features data (the name shadows the builtin; it is kept so keyword callers keep working)
        suffix: string we post pend to each feature category. It is appended literally, so it may contain "{" or "}"

    Returns:
        tuple_list: list of tuples for the dataframe e.g. [(feature name, value),...], in the dict's iteration order
    """
    # The suffix is concatenated after formatting the key; it must never be
    # part of a format template, otherwise braces in it raise or get mangled.
    return [(f"{k}{suffix}", v) for k, v in dict.items()]



def flatten_list(lst):
    """Collapse exactly one level of nesting

    Args:
        lst (list): list (or other iterable) whose elements are themselves iterable

    Returns:
        flattened list: new list holding the elements of every sub-iterable, in order
    """
    flat = []
    for inner in lst:
        flat.extend(inner)
    return flat

def get_clean_text(post_text,
                   nlp,
                   remove_URL=True,
                   remove_punctuation=0,
                   remove_newline=True,
                   merge_whitespaces=True,
                   do_lowercaseing=True,
                   remove_stopwords=False,
                   do_lemmatization=True,
                   remove_am=False):
    """Function to clean text (i.e. remove urls, punctuation, newlines etc) and do lemmatizaion if needed

    The steps run in a fixed order: emojis, "am" removal, URLs, punctuation,
    newlines, whitespace merging, lowercasing, stopword removal, lemmatization.

    Args:
        post_text (string): full body text of r/AITA posts. Anything that is not a string is converted with str().
        nlp (function): nlp function from the spacy object. Only called when do_lemmatization is True, so it may be None otherwise.
        remove_URL (bool, optional): Whether or not http(s) URLs should be removed. Defaults to True.
        remove_punctuation (int, optional): 0 keeps punctuation, 1 deletes every character of string.punctuation, 2 replaces each of them with a space. Defaults to 0.
        remove_newline (bool, optional): Whether or not newline characters should be removed. Defaults to True.
        merge_whitespaces (bool, optional): Whether or not mulitple consecutive whitespace should be merged to one. Defaults to True.
        do_lowercaseing (bool, optional): Whether or not text should be lowercased. Defaults to True.
        remove_stopwords (bool, optional): Whether or not stopwords from nltk should be removed (includes here, than, myself, which, it....). Defaults to False.
        do_lemmatization (bool, optional): Whether or not we should return the lemmatized post. Defaults to True.
        remove_am (bool, optional): Whether or not we should remove all "'m" and the standalone word "am" in the post. Defaults to False

    Returns:
        string/spacy doc: Cleaned string, or whatever nlp returns for the cleaned string when do_lemmatization is True
    """
    # Coerce once up front so every step below (not only URL removal) can
    # rely on working with a str.
    post_text = str(post_text)

    # remove emojis: get_emoji_regexp() is gone from emoji >= 2.0, so prefer
    # replace_emoji() and only fall back to the regexp on old releases.
    if hasattr(emoji, "replace_emoji"):
        post_text = emoji.replace_emoji(post_text, replace="")
    else:
        post_text = emoji.get_emoji_regexp().sub("", post_text)

    if remove_am:
        # Word boundaries keep words that merely contain "am" (family, same,
        # name, ...) intact; a plain str.replace would cut them apart.
        post_text = re.sub(r"\bam\b", " ", post_text.replace("'m", " "))

    if remove_URL:
        # Not anchored to the start of the text: URLs anywhere are removed.
        post_text = re.sub(r"https?://\S+", "", post_text)

    if remove_punctuation == 1:
        post_text = post_text.translate(
            str.maketrans("", "", string.punctuation))
    elif remove_punctuation == 2:
        post_text = post_text.translate(str.maketrans(
            string.punctuation, " " * len(string.punctuation)))

    # \n = newline & \r = carriage return
    if remove_newline:
        post_text = post_text.replace("\n", " ").replace("\r", "")

    if merge_whitespaces:
        post_text = " ".join(post_text.split())

    if do_lowercaseing:
        post_text = post_text.lower()

    # removes things like [i, me, my, myself, we, our, ours, ...
    if remove_stopwords:
        # Load the stopword list once instead of once per word.
        english_stopwords = set(stopwords.words("english"))
        post_text = " ".join(
            word for word in post_text.split() if word not in english_stopwords)

    if do_lemmatization:
        return nlp(post_text)  # spacy
    return post_text

def string_matching_arr_append_ah(matches):
    """When performing string matching we sometimes manually specify which words to match. Often these include the word "asshole".
        Often users on AITA do not write "asshole" but write "ah" (or a censored spelling) instead. Thus we extend the matching list
        with a copy of every "asshole" phrase for each alternative spelling.

    Args:
        matches (list): list of matching strings, these do not include any "ah" yet but only "asshole" (in any capitalisation)

    Returns:
        matches_extended: new list holding the original strings followed by the generated variants, grouped by
            alternative spelling. The input list is not modified.
    """
    ah_str_list = ["ah", "a-hole", "a hole", "ass hole", "a**hole", "a** hole"]

    # Case-insensitive on purpose: phrases such as "No Asshole here" must be
    # expanded as well, which a plain `"asshole" in match_str` test misses.
    asshole_pattern = re.compile("asshole", re.IGNORECASE)
    phrases_with_asshole = [
        match_str for match_str in matches if asshole_pattern.search(match_str)]

    matches_extended = list(matches)
    for ah_str in ah_str_list:
        for match_str in phrases_with_asshole:
            # A callable replacement inserts ah_str verbatim (no escape
            # processing of the replacement text).
            matches_extended.append(
                asshole_pattern.sub(lambda _m, repl=ah_str: repl, match_str))

    return matches_extended

def get_judgement_labels(df_cmt):
    """Returns judgement label counts (YTA, NTA, INFO, ESH, NAH)

    Every comment casts at most one vote. Comments containing one of the bot
    markers are skipped. When several judgement expressions are found in one
    comment, INFO is ignored and the label whose match lies furthest from the
    middle of the comment (i.e. closest to its start or end) wins.

    Args:
        df_cmt (pandas.DataFrame): comments of a post; the columns "comment_text" and "comment_score" are read

    Returns:
        tuple list (list): plain vote counts followed by score-weighted counts,
            e.g. [("YTA", 20), ("NTA", 10), ..., ("weighted_YTA", 35), ...]
    """
    JUDGMENT_ACRONYM = ["YTA", "NTA", "INFO", "ESH", "NAH"]
    JUDGMENT_DICT = { # everytime asshole appears an expression get added that replaces asshole with "ah"
        "NTA": ["NTA", "not the asshole", "not an asshole", "don't think you're an asshole", "YWNBTA"],
        "INFO": ["INFO", "Not enough info", "Not enough information", "More info", ],
        "ESH": ["ESH", "everyone sucks here", "everybody sucks here", "ETA", "everyone's the asshole"],
        "NAH": ["NAH", "No Assholes here", "No Asshole here", "no one sucks here"],
        "YTA": ["YTA", "You're the asshole", "You are the asshole", "You are a little bit the asshole", "YWBTA", "You are an asshole", "You're an asshole", "yt absolute a"],
    }
    BOT_STRINGS = ["automod", "i am a bot"]

    # The expanded, lowercased expressions do not depend on the comment, so
    # build them once instead of once per comment and label.
    label_phrases = {
        label: [phrase.lower() for phrase in string_matching_arr_append_ah(phrases)]
        for label, phrases in JUDGMENT_DICT.items()
    }

    label_counter = dict.fromkeys(
        JUDGMENT_ACRONYM + ["weighted_" + s for s in JUDGMENT_ACRONYM], 0)

    df_comments = df_cmt[["comment_text", "comment_score"]]

    # itertuples yields (index, comment_text, comment_score)
    for _, comment_text, score in df_comments.itertuples():
        comment_body = get_clean_text(str(comment_text), None, do_lemmatization=False)

        if any(bot_str in comment_body for bot_str in BOT_STRINGS):
            continue

        # Single-word expressions are matched against whole words of the
        # punctuation-free text, so "nta." or "yta!" still count.
        comment_words = get_clean_text(
            comment_body, None, remove_punctuation=2, do_lemmatization=False).split()

        # max(..., 1) avoids a division by zero for empty / one-word comments
        middle = max(len(comment_body) // 2, 1)
        middle_simple = max(len(comment_words) // 2, 1)

        # label -> relative distances of its matches from the comment's
        # middle, largest first
        labels_loc = {}
        for label, phrases in label_phrases.items():
            center_dist = []
            for phrase in phrases:
                if len(phrase.split()) > 1:
                    # multi-word expression: substring search, character positions
                    center_dist += [abs(middle - idx) / middle
                                    for idx in find_all(comment_body, phrase)]
                else:
                    # single word: exact word match, word positions
                    center_dist += [abs(middle_simple - idx) / middle_simple
                                    for idx, word in enumerate(comment_words)
                                    if word == phrase]

            center_dist.sort(reverse=True)
            labels_loc[label] = center_dist

        # Check if more than one vote was detected
        nr_votes = sum(len(distances) for distances in labels_loc.values())
        vote = ""
        if nr_votes > 1:
            # We remove info since this could often cause errors and we are not super interested in it
            # NOTE(review): a comment whose only matches are several INFO
            # expressions therefore ends up without any vote - confirm intended.
            labels_loc.pop("INFO", None)
            max_value = 0
            for label, distances in labels_loc.items():
                if distances and distances[0] > max_value:
                    max_value = distances[0]
                    vote = label
        else:
            # At most one match exists, so at most one label is non-empty
            for label, distances in labels_loc.items():
                if distances:
                    vote = label

        if vote != "":
            label_counter[vote.upper()] += 1
            label_counter["weighted_" + vote.upper()] += int(score)

    return dict_to_feature_tuples(label_counter)


In [91]:
# Sample comments used to try out the vote extraction.
# Each entry is a [comment text, comment score] pair; every score here is 1.
comments = [["Your wife got to enjoy her meal at her own pace while you dealt with the cranky kids; did the bedtime routine alone, and then also cleaned the kitchen afterwards. I’d be grateful. NTA.", 1],
            ["NTA. 1-1.5 hours is excessive, especially with small children", 1],
            ["INFO:How does it take someone one and half hours to eat their food? Nope, nevermind, how does it commonly take an hour?!Is she just talking the whole time? Is she reading her phone? Wth is she doing?NTA since your wife is rude here. If everyone else has finished eating it is polite to speed up. If you maintain your slow eating rate you are just being selfish.EDIT: I concede there are genuine reasons for people to be slow eaters and if they fall into those categories then it is instead reasonable to simple not expect others to wait for you",1],
            ["It's ok, your marriage probably won't survive, but at least you'll keep your job  🤷‍♂️YTA",1],
            ["You are the asshole. This can’t be real. If your job depends on you not kicking a deliberate provocateur out of your own wedding, regardless of who that person is, then you need a new job yesterday. As it is, good luck with the annulment, because I don’t know how or why you expect to stay married after demonstrating you don’t actually qualify as a vertebrate. Then again, I don’t know why your wife didn’t take matters into her own hands and kick these people out herself, so…",1],
            ["Son: You're not my dad! You're not the grandfather to my children. My real dad was a marine.Stepdad: Okay then. Go to your 'real dad' if you want help.Son: <Surprised pikachu face>...........ESH.You for not talking about your feelings sooner to your son.Your son for not realising how much his comments hurt and relying on your help while simultaneously trying to erase your relationship with him.",1],
            ["NAH Sometimes timing sucks and just can't be helped. You were ready to move on with a new husband. That's okay. Your daughter wasn't ready for a new father, or a stepfather. That's okay. Your husband did his best. That's good. Your daughters grown up, and is ready to be more mature about things. That's good. Your husband has been hurt in the past, and is reluctant to take more emotional risks. That's okay. While time doesn't heal all wounds, time may help here. One can't see the future, but one can have hope.",1],
            ["INFO: If your 22 year old fiancée disagrees with you, would you ground her too?",1]
            ]
# Marker output showing that the cell ran.
print("hi")

hi


In [92]:
import json

# Presumably the label expected for each sample comment, in order
# (the third comment contains both INFO and NTA):
# NTA, NTA, INFO or NTA, YTA, YTA, ESH, NAH, INFO

# Minimal alternative input, kept for quick experiments:
#comments = [["YTA", 1],["NTA", 2], ["NAH", 3],["ESH", 4],]
# Build the two-column frame that get_judgement_labels reads.
vote_df = pd.DataFrame(comments,columns=["comment_text", "comment_score"])

#print(vote_df)
extracted = get_judgement_labels(vote_df)

# Print one "name: value" line per returned (name, value) tuple.
for s,v in extracted:
    print(f"{s}: {v}")


YTA: 2
NTA: 3
INFO: 1
ESH: 1
NAH: 1
weighted_YTA: 2
weighted_NTA: 3
weighted_INFO: 1
weighted_ESH: 1
weighted_NAH: 1
