In [1]:
import pandas as pd
import string
import nltk
import re

from collections import Counter

In [2]:
df = pd.read_csv("../data/df.csv")

In [3]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,subreddit,class
1378,378,If you love jumping to the worst scenario this...,I just typed most of this in a reply to anothe...,n02yto,68,13,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1882,882,Any APs who also have ADHD?,I feel like this combo is making it so much wo...,rs5yoo,37,30,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1861,861,Anyone else feel oddly soothed during this cra...,Usually when the person I'm actively intereste...,fmhsb2,37,8,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1276,276,the avoidant butt kissing here is annoying,"Controversial post, but I don't mind. Luckily ...",uhikgx,81,44,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1855,855,Just started dating again. Scared and excited,"He (33M) is secure, I am leaning secure. First...",qfdyyh,39,6,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1863,863,I dated someone for years off and on. I ended ...,**TL;DR:** I am a 28F AP who dated a 34M DA fo...,163p8je,37,43,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
912,912,What do you as a {da} do to better yourself?,"Hey peeps! So I'm a dismissive avoidant, and I...",t9nvc7,15,11,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
239,239,{DA}{FA}{AP}{SA} How many anxious-avoidant dan...,I am realizing that not feeling defeated in th...,zi16bj,43,2,https://www.reddit.com/r/AvoidantAttachment/co...,AvoidantAttachment,0
1582,582,I'm a coach for folks with an anxious attachme...,,ldmp1s,50,93,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1
1960,960,On letting go,Some food for thought from Lighter by Yung Pue...,ymzz51,33,2,https://www.reddit.com/r/AnxiousAttachment/com...,AnxiousAttachment,1


In [4]:
# Data Cleaning
# Selecting both the title and post text as the text to be analysed.

df["text"] = df["Title"] + ' ' + df["Post Text"].fillna('')

In [5]:
# the number of rows for each class is similiar, i.e. no sign of class imbalance

df["class"].value_counts()

class
0    1000
1     998
Name: count, dtype: int64

In [6]:
# Remove punctuations and standardise to lowercase

def remove_punct(text):
    # store character only if it is not a punctuation
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df["text_clean"] = df["text"].apply(lambda x: remove_punct(x.lower()))
df[["text", "text_clean"]].sample(5)

Unnamed: 0,text,text_clean
554,A strange situation in 'behaving secure' journ...,a strange situation in behaving secure journey...
984,"Self Sabotage, Protest Behavior, and Deactivat...",self sabotage protest behavior and deactivatio...
367,I'm so tired of being the bigger person {FA} I...,im so tired of being the bigger person fa ive ...
1904,This is why I have AA.,this is why i have aa
276,"{Da} If you broke up w/ someone, how did you k...",da if you broke up w someone how did you know ...


In [7]:
# Remove urls in the text

def remove_url(text):
    text_nourl = re.sub(r'\S*http\S*', '', text)
    return text_nourl

df["text_clean"] = df["text_clean"].apply(lambda x: remove_url(x))

df[["text", "text_clean"]].sample(10)

Unnamed: 0,text,text_clean
1494,Healing and Sex: Once finally free of the addi...,healing and sex once finally free of the addic...
1043,ah darn it lol,ah darn it lol
1180,Anyone here have experience becoming more secu...,anyone here have experience becoming more secu...
793,{da} Fear of being 'bad' in bed? I mostly iden...,da fear of being bad in bed i mostly identify ...
1560,They say deep down the truth is AA's are actua...,they say deep down the truth is aas are actual...
145,"How do you get started on ""fixing"" avoidant at...",how do you get started on fixing avoidant atta...
1542,A reminder from my journal … \r\nI deserve and...,a reminder from my journal … \r\ni deserve and...
129,I hate who I become in relationships {fa} I tu...,i hate who i become in relationships fa i turn...
517,I don't have a therapist. What do I do from he...,i dont have a therapist what do i do from here...
1267,To those who are hurt by an avoidant ex's beha...,to those who are hurt by an avoidant exs behav...


In [8]:
# Remove words that contain digit
def remove_digit(text):
    text_nodigit = re.sub(r'\w*\d\w*', '', text)
    return text_nodigit

df["text_clean"] = df["text_clean"].apply(lambda x: remove_digit(x))

df[["text", "text_clean"]].sample(10)

Unnamed: 0,text,text_clean
195,I don’t understand why I don’t express excitem...,i don’t understand why i don’t express excitem...
1385,I bet a lot of anxious people are originally s...,i bet a lot of anxious people are originally s...
490,"When your avoidant tendencies are triggered, h...",when your avoidant tendencies are triggered ho...
1826,Taylor Swift anxiously attached? During therap...,taylor swift anxiously attached during therapy...
635,I’m the problem I always thought I had a mild-...,i’m the problem i always thought i had a mildm...
1048,If someone tells you who they are: BELIEVE THE...,if someone tells you who they are believe them...
1404,Set a boundary and got rejected. Trying to fee...,set a boundary and got rejected trying to feel...
445,Is there any information on enmeshed/codepende...,is there any information on enmeshedcodependen...
1768,A break off letter to my situationship of 9 mo...,a break off letter to my situationship of mon...
27,{fa} I think that many of us insecurely attach...,fa i think that many of us insecurely attached...


In [9]:
# Tokenize
def tokenize(text):  
    # /W matches any character that is neither alphanumeric nor underscoreb
    # Add a + just in case there are 2 or more spaces between certain words
    tokens = re.split('\W+', text)
    return tokens

df["text_token"] = df["text_clean"].apply(lambda x: tokenize(x)) 
df[["text", "text_token"]].sample(5)

Unnamed: 0,text,text_token
1012,Cool cool cool cool,"[cool, cool, cool, cool, ]"
1502,A Lasting Relationship Isn't Guaranteed (A Hea...,"[a, lasting, relationship, isnt, guaranteed, a..."
258,Deactivating because life is hard {FA} I had t...,"[deactivating, because, life, is, hard, fa, i,..."
798,deactivation after moving in together {fa} My ...,"[deactivation, after, moving, in, together, fa..."
484,{da} Advice on constant disappointment in inte...,"[da, advice, on, constant, disappointment, in,..."


In [10]:
# List of default stopwords
stopword = nltk.corpus.stopwords.words('english')

# Remove stop words
def remove_stopwords(tokenized_list):
    #Store in text only if word is not found in stopword i.e. it is not a stopword
    text = [word for word in tokenized_list if word not in stopword]
    return text

df["text_stop"] = df["text_token"].apply(lambda x: remove_stopwords(x))
df[["text", "text_stop"]].sample(5)

Unnamed: 0,text,text_stop
654,{da} can a balance even exist on how much time...,"[da, balance, even, exist, much, time, togethe..."
1353,Avoidant/anxious appreciation post Hey y’all! ...,"[avoidantanxious, appreciation, post, hey, vit..."
1552,Anyone else see their avoidant ex on dating ap...,"[anyone, else, see, avoidant, ex, dating, apps..."
633,HOW to separate fear of intimacy from actually...,"[separate, fear, intimacy, actually, wanting, ..."
1601,Realized something in therapy today I (AA/ FA)...,"[realized, something, therapy, today, aa, fa, ..."


In [11]:
# Lemmatizer
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    #return list of all lemmatized words for their corresponding words in tokenized_text
    text = [wn.lemmatize(word) for word in tokenized_text]
    return ' '.join(text)

df["text_lemmatise"] = df["text_stop"].apply(lambda x: lemmatizing(x))
df[["text", "text_lemmatise"]].sample(5)

Unnamed: 0,text,text_lemmatise
1421,"Anxious in relationships, but cold with my mom...",anxious relationship cold mom hi ask resonates...
1274,A breakthrough in Anxious Attachment with ther...,breakthrough anxious attachment therapy wonder...
51,Update: This sub remains restricted. It is sti...,update sub remains restricted still avoidant a...
968,How much longer do I have to try? I'm really t...,much longer try im really tired da hi im writi...
263,Undecided about dating {fa} I am constantly ch...,undecided dating fa constantly changing mind w...


In [12]:
# Separate the text into two lists based on class
class_0_text = df[df['class'] == 0]["text_clean"].tolist()
class_1_text = df[df['class'] == 1]["text_clean"].tolist()

# Function to find and count duplicate words between both classes
def find_duplicate_words_between_classes(df, column_name, class_column_name):
    class_0_text = df[df[class_column_name] == 0][column_name].tolist()
    class_1_text = df[df[class_column_name] == 1][column_name].tolist()
    
    all_words_class_0 = ' '.join(class_0_text).split()
    all_words_class_1 = ' '.join(class_1_text).split()
    
    common_words = set(all_words_class_0).intersection(all_words_class_1)
    
    word_counts_class_0 = Counter(all_words_class_0)
    word_counts_class_1 = Counter(all_words_class_1)
    
    top_common_words_class_0 = {word: word_counts_class_0[word] for word in common_words if word in word_counts_class_0}
    top_common_words_class_1 = {word: word_counts_class_1[word] for word in common_words if word in word_counts_class_1}
    
    return pd.DataFrame({'Duplicate Words': list(common_words), 
                         'Count (Class 0)': [top_common_words_class_0.get(word, 0) for word in common_words],
                         'Count (Class 1)': [top_common_words_class_1.get(word, 0) for word in common_words]})

# Get the DataFrame containing duplicate words and counts for both classes
duplicate_words_df = find_duplicate_words_between_classes(df, "text_lemmatise", "class")

# Calculate the difference in counts between Class 0 and Class 1 for each word
duplicate_words_df['Count Difference'] = (duplicate_words_df['Count (Class 0)'] - duplicate_words_df['Count (Class 1)']).abs()

# Sort the DataFrame by the largest difference in counts (descending order)
duplicate_words_df = duplicate_words_df.sort_values(by='Count Difference', ascending=True)

# Display the duplicate words and counts for both classes
print(duplicate_words_df)
print(duplicate_words_df.shape)

     Duplicate Words  Count (Class 0)  Count (Class 1)  Count Difference
2192            coin                1                1                 0
940              sum                1                1                 0
1901              er                1                1                 0
3688        mortgage                1                1                 0
4526        creative                8                8                 0
...              ...              ...              ...               ...
4131         anxious              218              685               467
1484            like             1791             1322               469
2800            feel             1879             1310               569
2716              im             1495              855               640
3983              fa              743               81               662

[5302 rows x 4 columns]
(5302, 4)


In [13]:
# using the list of duplicate words whose frequency in class 0/1 differed by less than 10 as additional stopwords to further clean the corpus

additional_stopwords = duplicate_words_df[duplicate_words_df['Count Difference'].between(0, 10)]
stopword += additional_stopwords["Duplicate Words"].tolist()


In [17]:
df["text_final"] = df["text_token"].apply(lambda x: remove_stopwords(x))
df["text_final"] = df["text_final"].apply(lambda x: lemmatizing(x))

In [18]:
df[["text", "text_final"]].sample(5)

Unnamed: 0,text,text_final
377,Are people actually prepared for us to change?...,people actually change fa avoidant think im gu...
462,Speaking of loving yourself {FA},speaking fa
226,Weekly Video Discussion: Avoidant Attachment: ...,weekly discussion avoidant attachment keep rel...
1185,breakup with a DA - before vs after,breakup da v
1682,This has helped me with my anxious attachment ...,helped anxious attachment first want send lot ...
