In [1]:
import pandas as pd
import string
import nltk
import re

from collections import Counter

In [2]:
df = pd.read_csv("../data/df.csv")

In [3]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,subreddit,class
1039,39,Sometimes you are anxious because the relation...,I‚Äôve been lurking in this subreddit for awhile...,zpw1zt,257,34,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1306,306,I daydream about getting hurt and then comforted.,How is this connected to anxious attachment st...,10kdtme,76,24,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
399,399,Feeling secure for the first time {FA},"I know a lot of people have followed my story,...",ujnfqw,33,14,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
1533,533,Dating an AA and tables have turned. Now i kno...,I‚Äôm starting to gain some sympathy for the guy...,yi238v,52,11,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1340,340,DAs just don‚Äôt like us that much,I talked with my therapist a few days ago and ...,11gd4da,69,28,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
810,810,"Healed avoidants, do you become attracted to m...",One of the big things to sabotage my love life...,tint2w,18,15,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
959,959,progress !! + uncovering my fears {fa},today marks my 3-day streak of not deactivatin...,upzg3q,16,12,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
30,30,{DA} Vilification of avoidants and lack of tak...,This gets so exhausting. From popular resource...,125lfa9,165,81,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
735,735,How does anyone deal with hypersensitivity to ...,How do y‚Äôall deal with the tendency to see emb...,sxwegz,21,4,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
1789,789,Apparently I‚Äôm just incapable of not smotherin...,"Feeling frustrated, you guys. I long for conne...",whevsn,40,51,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1


In [4]:
# Data Cleaning
# Selecting both the title and post text as the text to be analysed.

df["text"] = df["Title"] + ' ' + df["Post Text"].fillna('')

In [5]:
# the number of rows for each class is similiar, i.e. no sign of class imbalance

df["class"].value_counts()

class
0    1000
1     998
Name: count, dtype: int64

In [6]:
# Remove punctuations and standardise to lowercase

def remove_punct(text):
    # store character only if it is not a punctuation
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df["text_clean"] = df["text"].apply(lambda x: remove_punct(x.lower()))
df[["text", "text_clean"]].sample(5)

Unnamed: 0,text,text_clean
1168,Dear Avoidant - I forgive you. And I forgive m...,dear avoidant i forgive you and i forgive me ...
5,‚ù§Ô∏è,‚ù§Ô∏è
994,Anyone struggle to be a kind and warm person i...,anyone struggle to be a kind and warm person i...
47,‚ù§Ô∏è‚Äçü©π {DA} {FA},‚ù§Ô∏è‚Äçü©π da fa
679,{fa} Do any other FAs also experience ‚Äòenmeshm...,fa do any other fas also experience ‚Äòenmeshmen...


In [7]:
# Remove urls in the text

def remove_url(text):
    text_nourl = re.sub(r'\S*http\S*', '', text)
    return text_nourl

df["text_clean"] = df["text_clean"].apply(lambda x: remove_url(x))

df[["text", "text_clean"]].sample(10)

Unnamed: 0,text,text_clean
1739,Finding balance with AA and making a decision ...,finding balance with aa and making a decision ...
353,"{FA}{DA} running into an ex, epiphanies, closu...",fada running into an ex epiphanies closure as ...
1109,I overcame my biggest fear Im not looking for ...,i overcame my biggest fear im not looking for ...
169,I successfully battled through some of the sti...,i successfully battled through some of the sti...
1177,When the DA comes back: a warning We broke up ...,when the da comes back a warning we broke up t...
373,{fa} For those that have made significant stri...,fa for those that have made significant stride...
587,New to Understanding My Avoidant Attachment {f...,new to understanding my avoidant attachment fa...
530,Dating is hard | {SA} {DA} Tired and I have a ...,dating is hard sa da tired and i have a big d...
1024,"Hell, is where we reside ü´†",hell is where we reside ü´†
793,{da} Fear of being 'bad' in bed? I mostly iden...,da fear of being bad in bed i mostly identify ...


In [8]:
# Remove words that contain digit
def remove_digit(text):
    text_nodigit = re.sub(r'\w*\d\w*', '', text)
    return text_nodigit

df["text_clean"] = df["text_clean"].apply(lambda x: remove_digit(x))

df[["text", "text_clean"]].sample(10)

Unnamed: 0,text,text_clean
506,{fa}+{fa}: Deactivation versus legitimate conc...,fafa deactivation versus legitimate concerns a...
1282,Signs of a good therapist,signs of a good therapist
866,"Core Wounds, Self-Sabotage, and Vulnerability ...",core wounds selfsabotage and vulnerability st...
1994,I think we can all relate to this :D,i think we can all relate to this d
323,Knowing what you want {da} Do any other DAs or...,knowing what you want da do any other das or f...
868,Book recommendation: Adult Children of Emotion...,book recommendation adult children of emotiona...
711,"{DA} so i finally did It. Now I am lost. So, \...",da so i finally did it now i am lost so \r\n\r...
106,{fa} facepalm I was cuddling with the person I...,fa facepalm i was cuddling with the person ive...
811,Healthy Relationship Timeline {FA} So in true ...,healthy relationship timeline fa so in true fa...
1861,Anyone else feel oddly soothed during this cra...,anyone else feel oddly soothed during this cra...


In [9]:
# Tokenize
def tokenize(text):  
    # /W matches any character that is neither alphanumeric nor underscoreb
    # Add a + just in case there are 2 or more spaces between certain words
    tokens = re.split('\W+', text)
    return tokens

df["text_clean"] = df["text_clean"].apply(lambda x: tokenize(x)) 
df[["text", "text_clean"]].sample(5)

Unnamed: 0,text,text_clean
1841,Pay close enough attention and you‚Äôll realize ...,"[pay, close, enough, attention, and, you, ll, ..."
1808,Lack of sex/intimacy triggering negative core ...,"[lack, of, sexintimacy, triggering, negative, ..."
1690,I hate him. I hate him so much for making me f...,"[i, hate, him, i, hate, him, so, much, for, ma..."
1472,Unjoined all of the relationship subreddits ju...,"[unjoined, all, of, the, relationship, subredd..."
1550,‚Äúyou deserve better‚Äù one of the most pathetic ...,"[, you, deserve, better, one, of, the, most, p..."


In [10]:
# List of default stopwords
stopword = nltk.corpus.stopwords.words('english')

# Remove stop words
def remove_stopwords(tokenized_list):
    #Store in text only if word is not found in stopword i.e. it is not a stopword
    text = [word for word in tokenized_list if word not in stopword]
    return text

df["text_clean"] = df["text_clean"].apply(lambda x: remove_stopwords(x))
df[["text", "text_clean"]].sample(5)

Unnamed: 0,text,text_clean
1659,Coworker at work has a crush on another cowork...,"[coworker, work, crush, another, coworker, act..."
1567,my avoidant ex is making an effort to get back...,"[avoidant, ex, making, effort, get, back, toge..."
125,Guys‚Ä¶. {FA},"[guys, fa, ]"
1870,How to deal with jealousy properly? I‚Äôve had h...,"[deal, jealousy, properly, huge, anxiety, flar..."
1215,I think one of the saddest part in life is tha...,"[think, one, saddest, part, life, absolutely, ..."


In [11]:
# Lemmatizer
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    #return list of all lemmatized words for their corresponding words in tokenized_text
    text = [wn.lemmatize(word) for word in tokenized_text]
    return ' '.join(text)

df["text_clean"] = df["text_clean"].apply(lambda x: lemmatizing(x))
df[["text", "text_clean"]].sample(5)


Unnamed: 0,text,text_clean
1878,he reassured me i wouldnt be rejected and/or a...,reassured wouldnt rejected andor abandoned rej...
521,Dismissive but also secure? Is that a thing? {...,dismissive also secure thing dasa im writing w...
1910,Does anyone else start to feel resentment towa...,anyone else start feel resentment towards part...
815,Update: Today is better u/UpcycledThrowayAccnt...,update today better uupcycledthrowayaccnt aske...
598,"What DA ""fear of intimacy"" and ""deactivating s...",da fear intimacy deactivating strategy actuall...


In [12]:
# Separate the text into two lists based on class
class_0_text = df[df['class'] == 0]["text_clean"].tolist()
class_1_text = df[df['class'] == 1]["text_clean"].tolist()

# Function to find and count duplicate words between both classes
def find_duplicate_words_between_classes(df, column_name, class_column_name):
    class_0_text = df[df[class_column_name] == 0][column_name].tolist()
    class_1_text = df[df[class_column_name] == 1][column_name].tolist()
    
    all_words_class_0 = ' '.join(class_0_text).split()
    all_words_class_1 = ' '.join(class_1_text).split()
    
    common_words = set(all_words_class_0).intersection(all_words_class_1)
    
    word_counts_class_0 = Counter(all_words_class_0)
    word_counts_class_1 = Counter(all_words_class_1)
    
    top_common_words_class_0 = {word: word_counts_class_0[word] for word in common_words if word in word_counts_class_0}
    top_common_words_class_1 = {word: word_counts_class_1[word] for word in common_words if word in word_counts_class_1}
    
    return pd.DataFrame({'Duplicate Words': list(common_words), 
                         'Count (Class 0)': [top_common_words_class_0.get(word, 0) for word in common_words],
                         'Count (Class 1)': [top_common_words_class_1.get(word, 0) for word in common_words]})

# Get the DataFrame containing duplicate words and counts for both classes
duplicate_words_df = find_duplicate_words_between_classes(df, "text_clean", "class")

# Calculate the difference in counts between Class 0 and Class 1 for each word
duplicate_words_df['Count Difference'] = (duplicate_words_df['Count (Class 0)'] - duplicate_words_df['Count (Class 1)']).abs()

# Sort the DataFrame by the largest difference in counts (descending order)
duplicate_words_df = duplicate_words_df.sort_values(by='Count Difference', ascending=True)

# Display the duplicate words and counts for both classes
print(duplicate_words_df)
print(duplicate_words_df.shape)

     Duplicate Words  Count (Class 0)  Count (Class 1)  Count Difference
1210        lukewarm                1                1                 0
3876     translation                1                1                 0
1860            avid                1                1                 0
4815          taxing                2                2                 0
3877        insanity                1                1                 0
...              ...              ...              ...               ...
3140         anxious              218              685               467
3336            like             1791             1322               469
3257            feel             1879             1310               569
3220              im             1495              855               640
5119              fa              743               81               662

[5302 rows x 4 columns]
(5302, 4)


In [13]:
# using the list of duplicate words whose frequency in class 0/1 differed by less than 10 as additional stopwords to further clean the corpus

additional_stopwords = duplicate_words_df[duplicate_words_df['Count Difference'].between(0, 10)]
stopword += additional_stopwords["Duplicate Words"].tolist()


In [14]:
df["text_working"] = df["text_clean"].apply(lambda x: remove_stopwords(x))

In [15]:
df[["text_clean", "text_working"]].sample(5)

Unnamed: 0,text_clean,text_working
181,dating middle age youre imagining thing,"[n, g, , , g, , u, r, , g, n, n, g, , h, ..."
1532,gotos selfsoothing im going pretty dark time t...,"[g, , h, n, g, , , g, n, g, , r, , r, , ..."
344,ive enmeshed someone hello folk ive written fr...,"[v, , n, h, , n, , h, , , v, , r, n, , ..."
1940,feel loved together constantly worry partner c...,"[ , v, , g, h, r, , n, n, , r, r, , r, n, ..."
1613,right thing today honest wanting something ser...,"[r, g, h, , h, n, g, , , h, n, , n, n, g, ..."


In [16]:
# Remove stop words
def remove_stopwords(tokenized_list):
    #Store in text only if word is not found in stopword i.e. it is not a stopword
    text = [word for word in tokenized_list if word not in stopword]
    return text

df["text_working"] = df["text_clean"].apply(lambda x: remove_stopwords(x))

In [17]:
df[["text_clean", "text_working"]].sample(5)

Unnamed: 0,text_clean,text_working
1286,got text feel sad ashamed know posting usually...,"[g, , , , , h, , n, , n, g, , u, u, , ..."
1167,put together list youre looking yet make sure ...,"[u, , g, h, r, , , u, r, , n, g, , , , ..."
1818,envy secure attachers envy fact deal partner w...,"[n, v, , u, r, , h, r, , n, v, , , , r, ..."
1384,putting disturb best thing ever done ap worry ...,"[u, n, g, , u, r, , , h, n, g, , v, r, , ..."
942,fa stop serious tense learn find balance life ...,"[ , , r, u, , n, , r, n, , n, , n, , , ..."
