In [46]:
import pandas as pd
import json
import duckdb

In [63]:
# Build (or rebuild) the `submissions` table in the local DuckDB file 'my.db'
# from the three subreddit JSON dumps listed below.
# Connect to the database
with duckdb.connect('my.db') as con:
    # Create the table
    # CREATE OR REPLACE discards any existing `submissions` table, so re-running
    # this cell starts from an empty table rather than appending to old rows.
    con.execute("""
        CREATE OR REPLACE TABLE submissions (
            subreddit TEXT,
            title TEXT,
            selftext TEXT,
            link_flair_text TEXT,
            ups INT,
            upvote_ratio FLOAT,
            permalink TEXT,
            linked_url TEXT,
            created_utc TIMESTAMPTZ
        )
    """)
    
    # Read and insert data from JSON files
    json_files = [
        'reddit/extracted/MachineLearning_submissions.json',
        'reddit/extracted/learnmachinelearning_submissions.json',
        'reddit/extracted/singularity_submissions.json'
    ]
    
    for file in json_files:
        # The subreddit name is the part of the file name before the first underscore.
        subreddit = file.split('/')[-1].split('_')[0]
        # Both `subreddit` and `file` are interpolated straight into the SQL text;
        # that is acceptable only because they come from the hard-coded list above.
        # `url` is stored as `linked_url`; `created_utc` is cast to BIGINT and passed
        # to TO_TIMESTAMP (presumably epoch seconds -- confirm against the dumps).
        con.execute(f"""
            INSERT INTO submissions
            SELECT '{subreddit}', title, selftext, link_flair_text, ups, upvote_ratio, permalink, url AS linked_url, TO_TIMESTAMP(created_utc::BIGINT) FROM read_json_auto('{file}')
        """)
    
    # Update the permalink field
    # Prefixes every stored permalink with the site origin.
    # NOTE(review): the UPDATE has no WHERE clause, so rows with a NULL or empty
    # permalink are rewritten too -- confirm what CONCAT yields for those rows.
    con.execute("""
        UPDATE submissions SET permalink = CONCAT('https://reddit.com', permalink)
    """)

    # Disabled clean-up: the statements below would turn empty strings in the
    # text columns into NULL. Kept commented out, so they never run.
    # con.sql("""
    #     UPDATE submissions
    #     SET permalink = NULL
    #     WHERE permalink = '';
            
    #     UPDATE submissions
    #     SET link_flair_text = NULL
    #     WHERE link_flair_text = '';
            
    #     UPDATE submissions
    #     SET selftext = NULL
    #     WHERE selftext = '';
            
    #     UPDATE submissions
    #     SET title = NULL
    #     WHERE title = '';
            
    #     UPDATE submissions
    #     SET linked_url = NULL
    #     WHERE linked_url = '';
    #     """)

In [64]:
# Re-open the database and materialise the whole `submissions` table as a
# pandas DataFrame; the connection is closed when the `with` block exits.
with duckdb.connect('my.db') as con:
    df = con.sql("SELECT * FROM submissions;").df()

In [65]:
from copy import deepcopy

In [66]:
# Reproducible 80% random sample of the submissions (seed 42), held as an
# independent deep copy so later edits to `data` cannot touch `df`.
data = df.sample(frac=0.8, random_state=42).copy(deep=True)

In [67]:
# Display the sampled posts (the notebook renders a truncated view).
data

Unnamed: 0,subreddit,title,selftext,link_flair_text,ups,upvote_ratio,permalink,linked_url,created_utc
121429,MachineLearning,[R] Outcome-Driven Reinforcement Learning via ...,,Research,,0.74,https://reddit.com/r/MachineLearning/comments/...,https://arxiv.org/abs/2104.10190v1,2021-04-22 14:48:30-04:00
266572,singularity,Post your Guess When the Singularity will Occur.,06/21/2034 7:01am CST,,37.0,,https://reddit.com/r/singularity/comments/1q26...,http://www.reddit.com/r/singularity/comments/1...,2013-11-06 18:15:48-05:00
24443,MachineLearning,[D] How To Think About Indexing Inferred Values?,[removed],Discussion,1.0,,https://reddit.com/r/MachineLearning/comments/...,https://www.reddit.com/r/MachineLearning/comme...,2016-11-12 21:21:12-05:00
313023,singularity,Why do Mods remove so many posts?,title,Discussion,1.0,1.00,https://reddit.com/r/singularity/comments/175l...,https://www.reddit.com/r/singularity/comments/...,2023-10-11 14:58:48-04:00
145494,MachineLearning,Multiple PDF Files Similarity,[removed],,,1.00,https://reddit.com/r/MachineLearning/comments/...,https://www.reddit.com/r/MachineLearning/comme...,2022-05-01 13:40:14-04:00
...,...,...,...,...,...,...,...,...,...
162156,MachineLearning,Looking for feedback on a new deep-learning tool,[removed],,,1.00,https://reddit.com/r/MachineLearning/comments/...,,2023-01-11 10:17:59-05:00
172314,MachineLearning,Guys I think my decision tree may be overfitti...,,,1.0,1.00,https://reddit.com/r/MachineLearning/comments/...,https://i.redd.it/ajplxsi9drxa1.jpg,2023-05-03 21:25:06-04:00
286838,singularity,Artificial Nightmares: Stone Golem Ruins || Cl...,,AI,,0.77,https://reddit.com/r/singularity/comments/u1qp...,https://www.youtube.com/watch?v=kzjbE5sDfIs,2022-04-12 00:48:00-04:00
253440,learnmachinelearning,Gradient Descent Algorithm &amp; Standardization,"Hey all,\n\nI’m implementing gradient descent ...",Help,,1.00,https://reddit.com/r/learnmachinelearning/comm...,https://www.reddit.com/r/learnmachinelearning/...,2023-01-27 02:26:14-05:00


In [19]:
import pandas as pd 
import random 
from enum import Enum
from typing import Callable


class Labels(Enum):
    """Label values a rule can assign; the rules below pass the member's ``.value`` string."""

    SPAM = 'spam'
    NOT_SPAM = 'not_spam'


class Rule:
    """A named labelling rule: a vectorised predicate plus the label it assigns.

    Attributes:
        name: Human-readable identifier; also returned by ``repr``.
        condition: Callable that receives the posts DataFrame and returns a
            boolean-like Series with one entry per post.
        label: Value emitted for every post where the condition holds.
    """

    def __init__(self, name: str, condition: Callable[[pd.DataFrame], pd.Series], label: str):
        self.name = name
        self.condition = condition
        self.label = label

    def apply(self, post: pd.DataFrame) -> pd.Series:
        """Evaluate the rule on every post.

        Args:
            post: DataFrame of posts handed unchanged to ``condition``.

        Returns:
            An object-dtype Series sharing the index of the condition's result:
            ``self.label`` where the condition is true, ``None`` everywhere else.
            A missing condition value (NaN / ``None`` / ``pd.NA``) counts as
            "rule did not fire" and therefore also yields ``None``.
        """
        raw = self.condition(post)
        # `eq(True)` maps NaN/None to False for object results; the `fillna`
        # covers nullable-boolean results where the comparison yields <NA>.
        hits = raw.eq(True).fillna(False).astype(bool)
        # Build the output explicitly rather than chaining `replace` calls:
        # `replace(False, None)` behaves differently across pandas versions and
        # left NaN entries in place, which downstream code took for a label.
        return pd.Series(
            [self.label if hit else None for hit in hits.tolist()],
            index=raw.index,
            name=raw.name,
            dtype=object,
        )

    def __repr__(self):
        return self.name


def last_non_null(row):
    """Return the last non-missing value in ``row``, or ``None`` if there is none.

    Args:
        row: A sliceable sequence of scalar labels (a pandas Series row or a list).

    Returns:
        The right-most entry that is not missing; ``None`` when every entry is
        missing or the sequence is empty.
    """
    for value in row[::-1]:
        # `pd.notna` rejects None, NaN, NaT and pd.NA alike. A bare
        # `is not None` check would return NaN as if it were a label.
        if pd.notna(value):
            return value
    return None

    
class AnnotationSystem:
    """Applies an ordered collection of rules and reduces them to one label per post."""

    def __init__(self, rules: list[Rule]):
        self.rules = rules

    def annotate_posts(self, posts: pd.DataFrame) -> pd.DataFrame:
        """Label every post with the output of the last rule that fired.

        Each rule contributes one column (named after the rule, in rule order);
        every row is then collapsed with ``last_non_null``, so a later rule
        overrides an earlier one. The row-wise reduction produces one value per
        post.
        """
        per_rule = []
        for rule in self.rules:
            outcome = rule.apply(posts)
            outcome.name = rule.name
            per_rule.append(outcome)

        # One row per rule, transposed so that posts are rows and rules are columns.
        labels_df = pd.DataFrame(per_rule).T
        labels_df.columns = [outcome.name for outcome in per_rule]

        return labels_df.apply(last_non_null, axis=1)

    
# Example rules 
def title_has_exclamation_marks(data: pd.DataFrame):
    """Flag posts whose title contains three consecutive exclamation marks.

    Args:
        data: Posts with a ``title`` column.

    Returns:
        Boolean Series aligned with ``data``; posts with a missing title are
        reported as ``False`` rather than NaN.
    """
    # Literal match (no regex needed); na=False keeps the mask strictly boolean.
    return data['title'].str.contains('!!!', regex=False, na=False)

def from_subreddit(data: pd.DataFrame, subreddit: str):
    """Return a boolean mask marking the posts whose ``subreddit`` column equals ``subreddit``."""
    subreddit_column = data['subreddit']
    is_match = subreddit_column == subreddit
    return is_match

def post_with_tag_and_upvotes(data: pd.DataFrame, subreddit: str, tag: str, upvotes: int, regex: bool = False):
    """Flag posts from one subreddit whose title carries a tag and that exceed an upvote count.

    Args:
        data: Posts with ``subreddit``, ``title`` and ``ups`` columns.
        subreddit: Subreddit name the post must come from.
        tag: Text to look for in the title, e.g. ``'[R]'``.
        upvotes: Threshold; a post qualifies only when ``ups`` is strictly greater.
        regex: Treat ``tag`` as a regular expression. Defaults to ``False``,
            because a tag such as ``'[R]'`` is a regex character class and would
            otherwise match every title containing the letter "R".

    Returns:
        Boolean Series aligned with ``data``. Posts with a missing title never match.
    """
    in_subreddit = data['subreddit'] == subreddit
    has_tag = data['title'].str.contains(tag, regex=regex, na=False)
    enough_upvotes = data['ups'] > upvotes
    return in_subreddit & has_tag & enough_upvotes

# Creating rules 
# Order matters: AnnotationSystem keeps the label of the LAST rule that fires
# for a post, so rules further down this list override earlier ones.
rules = [ 
    Rule("Title has 3+ exclamation marks", title_has_exclamation_marks, Labels.SPAM.value), 
    Rule("From r/singularity", lambda post: from_subreddit(post, 'singularity'), Labels.SPAM.value), 
    Rule("From r/learnmachinelearning", lambda post: from_subreddit(post, 'learnmachinelearning'), Labels.SPAM.value), 
    # Disabled rule. NOTE(review): it passes the enum member (Labels.NOT_SPAM)
    # instead of its `.value` string like the active rules do -- fix before re-enabling.
    #Rule("From r/MachineLearning with [R]/[Research] tag and >10 upvotes", lambda post: post_with_tag_and_upvotes(post, 'MachineLearning', '[R]', 10), Labels.NOT_SPAM),
    Rule("From r/MachineLearning", lambda post: from_subreddit(post, 'MachineLearning'), Labels.NOT_SPAM.value)
    ] 
    
from itertools import permutations
def test_robustness(posts: pd.DataFrame, rules: list[Rule]): 
    """Annotate ``posts`` once for every possible ordering of ``rules``.

    Prints each ordering before running it and returns the annotation results
    in the order ``itertools.permutations`` yields them. The number of runs is
    the factorial of ``len(rules)``.
    """
    def annotate_with(permuted_rule):
        # Announce the ordering, then label the posts with the rules in that order.
        print(f"Test {permuted_rule}")
        return AnnotationSystem(permuted_rule).annotate_posts(posts)

    return [annotate_with(ordering) for ordering in permutations(rules)]

In [20]:
# Test the robustness of the rules: annotate `data` once per ordering of `rules`.
# `comparisons` holds one annotation result per permutation.
comparisons = test_robustness(data, rules)

Test (Title has 3+ exclamation marks, From r/singularity, From r/learnmachinelearning, From r/MachineLearning)
Test (Title has 3+ exclamation marks, From r/singularity, From r/MachineLearning, From r/learnmachinelearning)
Test (Title has 3+ exclamation marks, From r/learnmachinelearning, From r/singularity, From r/MachineLearning)
Test (Title has 3+ exclamation marks, From r/learnmachinelearning, From r/MachineLearning, From r/singularity)
Test (Title has 3+ exclamation marks, From r/MachineLearning, From r/singularity, From r/learnmachinelearning)
Test (Title has 3+ exclamation marks, From r/MachineLearning, From r/learnmachinelearning, From r/singularity)
Test (From r/singularity, Title has 3+ exclamation marks, From r/learnmachinelearning, From r/MachineLearning)
Test (From r/singularity, Title has 3+ exclamation marks, From r/MachineLearning, From r/learnmachinelearning)
Test (From r/singularity, From r/learnmachinelearning, Title has 3+ exclamation marks, From r/MachineLearning)
T

In [23]:
# Display the posts that were annotated.
data

Unnamed: 0,permalink,link_flair_text,ups,selftext,title,subreddit
121429,https://reddit.com/r/singularity/comments/cq0k...,video,,,AI*DAOs*Singularity*BCIs with visionary AI ser...,singularity
266572,https://reddit.com/r/MachineLearning/comments/...,,1.0,[removed],Adapter Tuning Multiple Choice Q/A Models (Ada...,MachineLearning
24443,https://reddit.com/r/MachineLearning/comments/...,Rule 6 - Beginner tutorial or project,,[removed],[D] Clearest version of Stanford's 229 - Anand...,MachineLearning
313023,https://reddit.com/r/MachineLearning/comments/...,,,[removed],Discriminator Loss on Real Images in GAN,MachineLearning
145494,https://reddit.com/r/MachineLearning/comments/...,,,[removed],What would you do if you cannot find any large...,MachineLearning
...,...,...,...,...,...,...
162156,https://reddit.com/r/singularity/comments/kcxr...,reddit,,There are different methods to mind transfer. ...,At what point does it stop being you?,singularity
172314,https://reddit.com/r/MachineLearning/comments/...,,12.0,,Making Tree Ensembles Interpretable,MachineLearning
286838,https://reddit.com/r/singularity/comments/182k...,COMPUTING,1.0,[removed],4peat,singularity
253440,https://reddit.com/r/MachineLearning/comments/...,Project,,,[P] A 3D Rasterizer written in Tensorflow,MachineLearning


In [24]:
# One row per post, one column per rule ordering (columns numbered 0..n-1).
# Concatenating column-wise avoids first building a wide frame with one row
# per permutation and then transposing it.
collected = pd.concat(comparisons, axis=1)

In [25]:
# Display the per-permutation labels (rows: posts, columns: rule orderings).
collected

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
121429,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam,...,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam
266572,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam
24443,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam
313023,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam
145494,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162156,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam,...,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam
172314,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam
286838,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam,...,spam,spam,spam,spam,spam,spam,spam,spam,spam,spam
253440,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,...,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam,not_spam


In [26]:
def calculate_proportion(row):
    """Return the fraction of entries in ``row`` that agree with its first entry.

    Args:
        row: Non-empty pandas Series of labels.

    Returns:
        A float in ``[0, 1]``. Missing values count as agreeing with each other,
        so a row whose first entry is missing is scored by its share of missing
        entries instead of always yielding 0.

    Raises:
        IndexError: If ``row`` is empty.
    """
    first_value = row.iloc[0]
    if pd.isna(first_value):
        # `row == None` / `row == NaN` is False everywhere, so compare by missingness.
        matches = row.isna()
    else:
        matches = row == first_value
    return matches.sum() / len(row)

In [27]:
# Per post: share of rule orderings whose label matches the first ordering's label.
proportions = collected.apply(calculate_proportion, axis=1)

In [33]:
# Majority label per post: row-wise mode over the permutation columns with
# missing values ignored; `.iloc[:, 0]` keeps the first mode column.
# NOTE(review): on a tie, which label lands in column 0 depends on how pandas
# orders the modes -- confirm that tie-break is acceptable.
labels = collected.mode(axis=1, dropna=True).iloc[:,0]

In [34]:
# Display the majority label per post.
labels

121429        spam
266572    not_spam
24443     not_spam
313023    not_spam
145494    not_spam
            ...   
162156        spam
172314    not_spam
286838        spam
253440    not_spam
219844        spam
Name: 0, Length: 256457, dtype: object

In [36]:
# Show the posts side by side with their majority label (aligned on the index).
# The label column keeps the Series name `0`; the result is only displayed, not stored.
pd.concat([data, labels], axis=1)

Unnamed: 0,permalink,link_flair_text,ups,selftext,title,subreddit,0
121429,https://reddit.com/r/singularity/comments/cq0k...,video,,,AI*DAOs*Singularity*BCIs with visionary AI ser...,singularity,spam
266572,https://reddit.com/r/MachineLearning/comments/...,,1.0,[removed],Adapter Tuning Multiple Choice Q/A Models (Ada...,MachineLearning,not_spam
24443,https://reddit.com/r/MachineLearning/comments/...,Rule 6 - Beginner tutorial or project,,[removed],[D] Clearest version of Stanford's 229 - Anand...,MachineLearning,not_spam
313023,https://reddit.com/r/MachineLearning/comments/...,,,[removed],Discriminator Loss on Real Images in GAN,MachineLearning,not_spam
145494,https://reddit.com/r/MachineLearning/comments/...,,,[removed],What would you do if you cannot find any large...,MachineLearning,not_spam
...,...,...,...,...,...,...,...
162156,https://reddit.com/r/singularity/comments/kcxr...,reddit,,There are different methods to mind transfer. ...,At what point does it stop being you?,singularity,spam
172314,https://reddit.com/r/MachineLearning/comments/...,,12.0,,Making Tree Ensembles Interpretable,MachineLearning,not_spam
286838,https://reddit.com/r/singularity/comments/182k...,COMPUTING,1.0,[removed],4peat,singularity,spam
253440,https://reddit.com/r/MachineLearning/comments/...,Project,,,[P] A 3D Rasterizer written in Tensorflow,MachineLearning,not_spam


In [120]:
# Inspect a single post (index label 289241) as a plain dict.
data.loc[289241].to_dict()

{'permalink': 'https://reddit.com/r/MachineLearning/comments/x3l7zp/d_senior_research_scientist_at_googleai_negar/',
 'link_flair_text': 'Discussion',
 'ups': nan,
 'selftext': '[deleted]',
 'title': '[D] Senior research scientist at GoogleAI, Negar Rostamzadeh: “ Can\'t believe Stable Diffusion is out there for public use and that\'s considered as "ok"!!!”',
 'subreddit': 'MachineLearning'}

In [68]:
# Annotation system using the rules in their declared order.
t = AnnotationSystem(rules)

In [69]:
# Annotate the sampled posts with that single rule ordering.
a = t.annotate_posts(data)

In [70]:
# Display the annotation result.
a

Unnamed: 0,Title has 3+ exclamation marks,From r/singularity,From r/learnmachinelearning,From r/MachineLearning with [R]/[Research] tag and >10 upvotes
121429,,Labels.SPAM,,
266572,,,,
24443,,,,
313023,,,Labels.NOT_SPAM,
145494,,,,
...,...,...,...,...
162156,,Labels.SPAM,,
172314,,,,
286838,,Labels.SPAM,,
253440,,,,


In [71]:
# Manually overwrite the cell in the first row, third column (in place).
# NOTE(review): this needs `a` to be two-dimensional, and it stores the enum
# member rather than its `.value` string -- confirm it still matches what
# `annotate_posts` returns today; presumably a leftover experiment.
a.iloc[0, 2] = Labels.NOT_SPAM

In [72]:
# Display `a` again after the manual overwrite.
a

Unnamed: 0,Title has 3+ exclamation marks,From r/singularity,From r/learnmachinelearning,From r/MachineLearning with [R]/[Research] tag and >10 upvotes
121429,,Labels.SPAM,Labels.NOT_SPAM,
266572,,,,
24443,,,,
313023,,,Labels.NOT_SPAM,
145494,,,,
...,...,...,...,...
162156,,Labels.SPAM,,
172314,,,,
286838,,Labels.SPAM,,
253440,,,,


In [73]:
def last_non_null(row):
    """Return the last non-missing value in ``row``, or ``None`` if there is none.

    Args:
        row: A sliceable sequence of scalar labels (a pandas Series row or a list).

    Returns:
        The right-most entry that is not missing; ``None`` when every entry is
        missing or the sequence is empty.
    """
    for value in row[::-1]:
        # `pd.notna` rejects None, NaN, NaT and pd.NA alike. A bare
        # `is not None` check would return NaN as if it were a label.
        if pd.notna(value):
            return value
    return None

In [58]:
# Display `a`.
a

Unnamed: 0,Title has 3+ exclamation marks,From r/singularity,From r/learnmachinelearning,From r/MachineLearning with [R]/[Research] tag and >10 upvotes,label
121429,,Labels.SPAM,Labels.NOT_SPAM,,unset
266572,,,,,unset
24443,,,,,unset
313023,,,Labels.NOT_SPAM,,unset
145494,,,,,unset
...,...,...,...,...,...
162156,,Labels.SPAM,,,unset
172314,,,,,unset
286838,,Labels.SPAM,,,unset
253440,,,,,unset


In [74]:
# Collapse each row of `a` to its last non-null entry.
a.apply(last_non_null, axis=1)

121429    Labels.NOT_SPAM
266572               None
24443                None
313023    Labels.NOT_SPAM
145494               None
               ...       
162156        Labels.SPAM
172314               None
286838        Labels.SPAM
253440               None
219844               None
Length: 256457, dtype: object