In [None]:
import re
import pandas as pd
import numpy as np


In [None]:
df_comment = pd.read_csv('comments.csv')

# Comments EDA

## Reviwing existing dataset

In [None]:
# Counting duplicates in each column
def count_duplicates_in_columns(df):
    duplicate_counts = {}
    for column in df.columns:
        duplicates_count = df[df.duplicated(subset=[column], keep=False)].shape[0]
        duplicate_counts[column] = duplicates_count
    return duplicate_counts



In [None]:
df_comment.head()

Unnamed: 0.1,Unnamed: 0,id,submission_id,message,comment_id,parent_id,created_utc,score
0,0,1,xt1ksm,Do people with two digits to their age really ...,irs5v1y,t3_xt1ksm,1665421334,59
1,1,2,xt1ksm,Lots of posts in the last 3-4 days about rando...,isdxgsq,t3_xt1ksm,1665813660,55
2,2,3,xt1ksm,Sometimes I think people are making up stories...,iryatl4,t3_xt1ksm,1665529128,40
3,3,4,xt1ksm,Saw it on FB but it's hilarious how threads wi...,is3i5i9,t3_xt1ksm,1665622760,33
4,4,5,xt1ksm,The OP: My MIL can be a bit petty sometimes\n\...,ituw9ym,t3_xt1ksm,1666793880,32


In [None]:
columns_to_drop  = ['created_utc', 'Unnamed: 0', 'id']
df_comment = df_comment.drop(columns=columns_to_drop, errors='ignore')

## Keeping only commens for cleaned posts

In [None]:
df_posts_cleaned = pd.read_csv("df_posts_cleaned.csv")

In [None]:
post_ids = df_posts_cleaned['submission_id'].unique()

In [None]:
len(post_ids)

30135

In [None]:
df_comment = df_comment[df_comment['submission_id'].isin(post_ids)]

In [None]:
df_comment.shape

(8981904, 5)

In [None]:
submission_id_counts = df_comment['submission_id'].value_counts()

In [None]:
submission_id_counts.describe()

count    30118.000000
mean       298.223786
std        277.817601
min          1.000000
25%         74.000000
50%        184.000000
75%        494.000000
max       1708.000000
Name: count, dtype: float64

In [None]:
# this posts will later be removed from posts df
submission_ids_below_3 = submission_id_counts[submission_id_counts < 3].index.tolist()
len(submission_ids_below_3)


28

In [None]:
df_comment.reset_index(drop=True, inplace= True)

## Cleaning

In [None]:
# removing empty messages:
df_comment.dropna(subset=['message'], inplace=True)

In [None]:
df_comment.isnull().sum()

submission_id    0
message          0
comment_id       0
parent_id        0
score            0
dtype: int64

In [None]:
df_comment['message'][7]

"YTA\n\nHe's nice enough to let you live at his house—for free—and you're asking him to *redecorate* for you? Of all the nerve. \n\nAnd your wife kicked up a fuss when Anthony *walked down the hall past her door*? It was *she* who didn't shut the door, for Pete's sake. \n\nIf I were you I'd start packing, because Anthony is about to throw you guys out."

In [None]:
def clean_text(text):
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = text.strip()
    return text

df_comment['cleaned_message'] = df_comment['message'].apply(clean_text)

df_comment[['message', 'cleaned_message']]

Unnamed: 0,message,cleaned_message
0,Welcome to /r/AmITheAsshole. Please view our [...,welcome to ramitheasshole please view our voti...
1,**You are guests in their home because you are...,you are guests in their home because you are c...
2,"YTA, and your wife seems to be systematically ...",yta and your wife seems to be systematically d...
3,"YTA. Go stay in a hotel. If I was your friend,...",yta go stay in a hotel if i was your friend i ...
4,YTA - first your wife had issues with your Mom...,yta first your wife had issues with your mom ...
...,...,...
8981899,If she's always been a tomboy and has never ch...,if shes always been a tomboy and has never cho...
8981900,"Yup. I'm extremely close with my parents, but ...",yup im extremely close with my parents but it ...
8981901,He is a good man! I suppose I’ll keep him arou...,he is a good man i suppose ill keep him around...
8981902,"I started my period really young, when I was 8...",i started my period really young when i was 8 ...


In [None]:
df_comment = df_comment.drop(columns='message', errors='ignore')
df_comment.to_csv('df_comment_cleaned.csv')

## Labeling

In [None]:
df_comment['cleaned_message'][7]

'yta hes nice enough to let you live at his housefor freeand youre asking him to redecorate for you of all the nerve and your wife kicked up a fuss when anthony walked down the hall past her door it was she who didnt shut the door for petes sake if i were you id start packing because anthony is about to throw you guys out'

In [None]:
# creating keywords that were taken after carefull comment analysis
keywords = {
    "NTA": [
        r"\bnta\b", r"\byou’re not the asshole\b", r"\byou’re not the ah\b", r"\bdid nothing wrong\b",
        r"\byou’re in the right\b", r"\bhandled this correctly\b", r"\bdid the right thing\b",
        r"\btotally justified\b", r"\byou're good here\b", r"\bnot your fault\b", r"\byou’re cool\b",
        r"\byou’re in the clear\b", r"\byou're all good\b", r"\bdid what you had to do\b", r"\bno way you're the asshole\b"
    ],
    "YTA": [
        r"\byta\b", r"\byou’re the asshole\b", r"\byou’re the ah\b", r"\byou’re in the wrong\b",
        r"\bshouldn’t have done that\b", r"\bthat was a jerk move\b", r"\bacted poorly\b",
        r"\bowe an apology\b", r"\bnot cool\b", r"\bthat was out of line\b", r"\bmessed up\b",
        r"\bbeing a jerk\b", r"\bthat's on you\b", r"\bblew it\b", r"\bthat’s a bad look\b",
        r"\byou're wrong here\b"
    ],
    "ESH": [
        r"\besh\b", r"\beveryone sucks here\b", r"\bboth sides are wrong\b", r"\bno one is in the right here\b",
        r"\bboth handled this poorly\b", r"\bboth of you are at fault\b", r"\ball parties involved are wrong\b",
        r"\byou all messed up\b", r"\bnobody wins here\b", r"\beveryone's at fault\b", r"\ball of you need to chill\b",
        r"\byou're all being ridiculous\b", r"\beveryone's being a jerk\b", r"\byou both need to grow up\b",
        r"\bboth parties are being dumb\b"
    ],
    "NAH": [
        r"\bnah\b", r"\bno assholes here\b", r"\bno ah here\b", r"\bno one is at fault\b",
        r"\beveryone acted reasonably\b", r"\bdon’t think anyone did anything wrong\b", r"\bno bad guys here\b",
        r"\bit's just a tough situation\b", r"\bnobody’s to blame\b", r"\beveryone did their best\b",
        r"\bjust a tough call\b", r"\bno one’s in the wrong\b", r"\bit's all good\b", r"\byou’re both fine\b",
        r"\bjust a misunderstanding\b"
    ]
}

In [None]:
def classify_comment(comment):
    # Check each category for the presence of keywords
    for category, phrases in keywords.items():
        for phrase in phrases:
            if re.search(phrase, comment):
                return category
    # Return Nan that will be droped
    return None

In [None]:
# Apply the classification function to the 'message' column
df_comment['classification'] = df_comment['cleaned_message'].apply(classify_comment)


In [None]:
df_comment['classification'].value_counts()

classification
NTA    2681243
YTA    1068886
ESH     146694
NAH      91808
Name: count, dtype: int64

# Exploring the difference if multiple keywords are found

In [None]:
df_comment.dropna(subset = ['classification'], inplace = True)

In [None]:
# Compile regex patterns for each category
compiled_keywords = {
    category: [re.compile(pattern) for pattern in patterns]
    for category, patterns in keywords.items()
}

def classify_comment_all(comment, compiled_keywords = compiled_keywords):
    comment_lower = comment.lower()
    matched_categories = []
    for category, patterns in compiled_keywords.items():
        for pattern in patterns:
            if pattern.search(comment_lower):
                matched_categories.append(category)
    return matched_categories if matched_categories else None

In [None]:
# Apply the classification function to the 'message' column
df_comment['classification_all'] = df_comment['cleaned_message'].apply(classify_comment_all)


In [None]:
df_comment

Unnamed: 0,submission_id,comment_id,parent_id,score,cleaned_message,classification,classification_all
1,yiplwk,iujucua,t3_yiplwk,23510,you are guests in their home because you are c...,YTA,[YTA]
2,yiplwk,iujuhrk,t3_yiplwk,8757,yta and your wife seems to be systematically d...,YTA,[YTA]
3,yiplwk,iuju98c,t3_yiplwk,1465,yta go stay in a hotel if i was your friend i ...,YTA,[YTA]
4,yiplwk,iujujpn,t3_yiplwk,10726,yta first your wife had issues with your mom ...,YTA,[YTA]
5,yiplwk,iujtsph,t3_yiplwk,1479,yta and this is a deeply unreasonable request ...,YTA,[YTA]
...,...,...,...,...,...,...,...
8981807,17uu03h,k98sp96,t1_k96ej8d,2,fellow aussie youll learn a couple things as y...,NTA,[NTA]
8981808,17uu03h,k996f4w,t1_k96ej8d,2,you shouldnt thats a very reasonable question ...,NTA,[NTA]
8981865,17uu03h,k99dr0i,t1_k9911te,2,yeah but we all know that its annoying on redd...,NAH,[NAH]
8981871,17uu03h,k99hr6a,t1_k99dr0i,3,i mean is it really that surprising things tha...,YTA,[YTA]


In [None]:
df_comment['cleaned_message'].loc[1143]

'grief is horrible your sil is going through the stages of grief in her own way and you are trying to hold on to your miracle baby for dear life youre kind of an a h for speaking to her like that but i know you were doing it for your own baby so kind of nta so ill just settle with esh talk to your sister in law again and ask her to try to contain her feeling around you or just stay away until the baby is born all the best'

In [None]:
def prioritise_classes(labels):
    category_priority = ["ESH", "NAH", "NTA", "YTA"]

    if len(labels) == 1:
        return labels[0]
    else:
        labels.sort(key=lambda x: category_priority.index(x))
        return labels[0]  # Return the highest priority category


In [None]:
df_comment['label'] = df_comment['classification_all'].apply(prioritise_classes)


In [None]:
df_comment[df_comment['classification_all'].apply(len) > 1]

Unnamed: 0,submission_id,comment_id,parent_id,score,cleaned_message,classification,classification_all,label
51,yiplwk,iujusoo,t3_yiplwk,93,yta youre a guest in his home smh i cant even ...,YTA,"[YTA, YTA]",YTA
342,yiplwk,iuk2ii0,t3_yiplwk,30,yta and so is your wife youre not even guests ...,YTA,"[NAH, YTA]",NAH
677,yiv572,iukygn7,t3_yiv572,22,wow yta i was prepared to say nta from the tit...,NTA,"[NTA, YTA]",NTA
761,yiv572,iul0elt,t3_yiv572,13,yta not cool at all there will only ever be on...,YTA,"[YTA, YTA]",YTA
1143,yimgaf,iujnlx6,t3_yimgaf,-28,grief is horrible your sil is going through th...,NTA,"[ESH, NTA]",ESH
...,...,...,...,...,...,...,...,...
8981441,17uma05,k99o2a9,t3_17uma05,1,esh it is victim blaming yes she messed up but...,YTA,"[ESH, YTA]",ESH
8981664,17ventx,k9a7n3s,t3_17ventx,3,nta i dont care if you could have chose your w...,NTA,"[NTA, NTA]",NTA
8981670,17ventx,k9ab98q,t3_17ventx,1,nta the best time to say all that was in thera...,NTA,"[NTA, YTA]",NTA
8981673,17ventx,k9akh3j,t3_17ventx,1,nta definitely nta you did nothing wrong your ...,NTA,"[NTA, NTA]",NTA


# End of experimetn

In [None]:
df_labeled_comments = df_comment[['submission_id', 'label']]

In [None]:
df_comment.to_csv('df_comment_labeled.csv')

In [None]:
df_labeled_comments.to_csv('df_labeled_comments.csv')

In [None]:
label_counts = df_labeled_comments['label'].value_counts()
label_counts

label
NTA    2666030
YTA    1058187
ESH     160924
NAH     103490
Name: count, dtype: int64