In order to perform a frequent itemset analysis, the data needs to be pre-processed.

In [104]:
import pandas as pd

# ---- Load the four main tables (TSV) ----
incels_is_comments   = pd.read_csv("data/incels.is_AllComments.anon",           sep="\t", encoding="utf-8", on_bad_lines="skip")
reddit_incel_posts   = pd.read_csv("data/reddit-incel-posts.anon.txt",          sep="\t", encoding="utf-8", on_bad_lines="skip", dtype = str)
reddit_incelexit = pd.read_csv("data/reddit-IncelExit-posts.anon.txt",          sep="\t", encoding="utf-8", engine="python", quoting=3, on_bad_lines="skip", dtype=str)
saidit_incel_posts   = pd.read_csv("data/saidit-incel-posts.anon.txt",          sep="\t", encoding="utf-8", on_bad_lines="skip")
braincels_incel_posts   = pd.read_csv("data/reddit-braincels-posts.anon.txt",          sep="\t", encoding="utf-8", on_bad_lines="skip", dtype=str)

#braincels_incel_posts = braincels_incel_posts.iloc[1:].reset_index(drop=True)

# Optional: keep them in a dict for easy access
dfs = {
    "reddit_incel_posts": reddit_incel_posts,
    "reddit_incelexit": reddit_incelexit,
    "braincels_incel_posts": braincels_incel_posts
}

# streamline column names 
colnames = ["link", "comment_id", "user_id", "parent", "timestamp", "title", "text"]
for name, df in dfs.items():
    df.columns = colnames

# Quick peek so you can see they loaded
for name, df in dfs.items():
    print(f"{name}: {df.shape}")
    display(df.head(10))

print("Unique users in each dataset:")
for name, df in dfs.items():
    print(f"{name}: {len(df['user_id'].unique())}")

#total unique users across all datasets
all_users = pd.concat([df['user_id'] for df in dfs.values()]).unique()
print(f"Total unique users across all datasets: {len(all_users)}")

#total length of all datasets combined 
total_length = sum(df.shape[0] for df in dfs.values())
print(f"Total length of all datasets combined: {total_length}")

reddit_incel_posts: (988060, 7)


Unnamed: 0,link,comment_id,user_id,parent,timestamp,title,text
0,/r/Incels/comments/4k4c9k,4k4c9k,924979784.370224,,1463687543,The cure for your disease,
1,/r/Incels/comments/5VNscmKPVjD9HfBtIpGvtO0X-fX...,5VNscmKPVjD9HfBtIpGvtO0X-fXQRg-ccgQCcLJTaBQ,924980486.069293,,1463454980,Why you're incel,
2,/r/Incels/comments/t5_2y2u1,t5_2y2u1,924980047.255993,,1463453936,Hello,
3,/r/Incels/comments/t5_2y2u1,t5_2y2u1,924980789.8589,,1455499334,the internet's first asocial network,
4,/r/Incels/comments/2716it,2716it,924981346.025006,,1401634505,Incels - People who are alone - People who fee...,
5,/r/Incels/comments/t5_2y2u1,t5_2y2u1,924982222.493196,,1395136982,Why you’re (probably) a feminist or their usef...,
6,/r/Incels/comments/t5_2y2u1,t5_2y2u1,924982222.493196,,1395043047,A 48 year-old virgin's blog,
7,/r/Incels/comments/20m6pu,20m6pu,924982222.493196,,1395042944,A blogger TarnishedSophia has questions for th...,
8,/r/Incels/comments/1ue0yu,1ue0yu,924982552.262455,,1388842629,Have you ever been so far gone that you can't ...,
9,/r/Incels/comments/d3folfo,d3folfo,924983147.481448,d3fo82o,1463957409,,"I hate this psuedoscience of that girls want, ..."


reddit_incelexit: (111083, 7)


Unnamed: 0,link,comment_id,user_id,parent,timestamp,title,text
0,/r/IncelExit/comments/djb2a2/welcome_to_the_va...,f4590y6,924979382.138533,djb2a2,1571369186,,I will... nicely shove mental health and well-...
1,/r/IncelExit/comments/djb2a2/welcome_to_the_va...,f49ec70,924979824.167305,djb2a2,1571463603,,Black pill points and counter points are fine....
2,/r/IncelExit/comments/djb2a2/welcome_to_the_va...,f47q9i0,924980486.069293,djb2a2,1571426361,924980486.069293,I'm already having a hard time understanding w...
3,/r/IncelExit/comments/djb2a2/welcome_to_the_va...,f4586f1,924979382.138533,djb2a2,1571368533,,TIME TO SHOVE MENTAL HEALTH AND WELL-BEING INT...
4,/r/IncelExit/comments/djax7x/hello_brocels_is_...,f438fm6,924979824.167305,djax7x,1571339512,,Wrong Subreddit friend. Thats a permaban.
5,/r/IncelExit/comments/djb2a2/welcome_to_the_va...,f458otg,924979824.167305,djb2a2,1571368920,,Just remember the rules. This is not IncelTear...
6,/r/IncelExit/comments/djb93n/not_an_incel_recr...,f44v1mc,924980763.514056,djb93n,1571359416,,"Good luck with this, and thanks for making the..."
7,/r/IncelExit/comments/djb93n/not_an_incel_recr...,f47kpwj,924979824.167305,djb93n,1571423698,,"If you aren't looking for answers, you won't f..."
8,/r/IncelExit/comments/djbk5o/what_kind_of_advi...,f43m2yg,924981586.192561,djbk5o,1571343047,,I was never what one would consider an outgoin...
9,/r/IncelExit/comments/djb93n/not_an_incel_recr...,f47j736,924980486.069293,djb93n,1571423095,924980486.069293,[deleted]


braincels_incel_posts: (2216969, 7)


Unnamed: 0,link,comment_id,user_id,parent,timestamp,title,text
0,url,id,924978959.98047,parent,created_at,title,text
1,https://www.reddit.com/r/Braincels/comments/77...,77ru0b,924979030.686585,,1508566505,Getting Settled in,General discussion and Q&amp;A thread since we...
2,/r/Braincels/comments/doqectl/comment/doqgmkg,doqgmkg,931484113.951227,doqectl,1508701504,,Thanks. It just seemed that it would be anothe...
3,/r/Braincels/comments/doqe6l5/comment/doqectl,doqectl,924979030.686585,doqe6l5,1508699066,,"To your last question, of course your safe her..."
4,/r/Braincels/comments/77ru0b/comment/doq6k7h,doq6k7h,931484113.951227,77ru0b,1508690481,,Is FHO also used here or is this like FA / IWH
5,/r/Braincels/comments/doplfy9/comment/dopnsx0,dopnsx0,924979030.686585,doplfy9,1508652013,,I think the main thing about r/IncelDiscussion...
6,/r/Braincels/comments/dopk1cs/comment/dopm17l,dopm17l,924979139.192799,dopk1cs,1508648408,,Great! I pmed the guy who wrote the treatise a...
7,/r/Braincels/comments/77ru0b/comment/doplfy9,doplfy9,925078641.54442,77ru0b,1508647327,,This has been tried.\n\n/r/IncelDiscussions\n\...
8,/r/Braincels/comments/dopj9dk/comment/dopk1cs,dopk1cs,924979030.686585,dopj9dk,1508644922,,That's a very good idea!\n\nThis sub's all abo...
9,/r/Braincels/comments/77ru0b/comment/dopj9dk,dopj9dk,924979139.192799,77ru0b,1508643701,,Would it be possible to repost very intellectu...


Unique users in each dataset:
reddit_incel_posts: 29877
reddit_incelexit: 4005
braincels_incel_posts: 69305
Total unique users across all datasets: 101201
Total length of all datasets combined: 3316112


In [103]:
reddit_incelexit

(                  source                                               text
 0       reddit_incelexit  I will... nicely shove mental health and well-...
 1       reddit_incelexit  Black pill points and counter points are fine....
 2       reddit_incelexit  I'm already having a hard time understanding w...
 3       reddit_incelexit  TIME TO SHOVE MENTAL HEALTH AND WELL-BEING INT...
 4       reddit_incelexit          Wrong Subreddit friend. Thats a permaban.
 ...                  ...                                                ...
 111078  reddit_incelexit  Your post/comment was removed for violating ru...
 111079  reddit_incelexit  >I got no stake in society.  If you have clean...
 111080  reddit_incelexit  Remember empathy, as you get practice connecti...
 111081  reddit_incelexit                                          [removed]
 111082  reddit_incelexit                                          [removed]
 
 [111083 rows x 2 columns],)

In [96]:
column_name = ['reddit_incel_posts', 'reddit_incelexit', 'braincels_incel_posts']

for name in column_name:
    df = dfs[name]
    df = df.copy() 
    df['source'] = name
    dfs[name] = df 

reddit_incel_posts = dfs['reddit_incel_posts'][["source", "text"]]
reddit_incelexit = dfs['reddit_incelexit'][["source", "text"]],
braincels_incel_posts = dfs['braincels_incel_posts'][["source", "text"]]

combined_df = pd.concat([
    dfs['reddit_incel_posts'][["source", "text"]],
    dfs['reddit_incelexit'][["source", "text"]],
    dfs['braincels_incel_posts'][["source", "text"]]
], ignore_index=True)

before = combined_df.shape[0]

combined_df.dropna(inplace=True)

after = combined_df.shape[0]

print(f"Dropped {before - after} rows with missing text.")

Dropped 150819 rows with missing text.


In [97]:
# remove rows with typical reddit things

remove = ['[deleted]', '[removed]']

#lowercase all text 
combined_df['text'] = combined_df['text'].astype(str).str.lower()

idx_to_drop = []

for index, row in combined_df.iterrows():
    for i in remove:
        if i in row["text"]:
            idx_to_drop.append(index)


In [98]:
#remove indexes 
print(len(idx_to_drop))

combined_df.drop(idx_to_drop, inplace=True)

print(f"Dropped {len(idx_to_drop)} rows with removed/deleted text.")


232871
Dropped 232871 rows with removed/deleted text.


In [99]:
combined_df['text'] = (
    combined_df['text']
    .str.replace(r'(tl;dr|edit:?|&gt;)', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [100]:
# Create translation table
import string
translator = str.maketrans('', '', string.punctuation)

combined_df['text'] = combined_df['text'].str.translate(translator)

In [102]:
print(reddit_incelexit)

(                  source                                               text
0       reddit_incelexit  I will... nicely shove mental health and well-...
1       reddit_incelexit  Black pill points and counter points are fine....
2       reddit_incelexit  I'm already having a hard time understanding w...
3       reddit_incelexit  TIME TO SHOVE MENTAL HEALTH AND WELL-BEING INT...
4       reddit_incelexit          Wrong Subreddit friend. Thats a permaban.
...                  ...                                                ...
111078  reddit_incelexit  Your post/comment was removed for violating ru...
111079  reddit_incelexit  >I got no stake in society.  If you have clean...
111080  reddit_incelexit  Remember empathy, as you get practice connecti...
111081  reddit_incelexit                                          [removed]
111082  reddit_incelexit                                          [removed]

[111083 rows x 2 columns],)
