In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
LOCATION = 'data'
USE_FILENAMES = ["1727069599", "1727016938", "1727020308", "1727024177", "1727160042"]
FOLDERS = ["comment", "submission", "subreddit"]

# get all folder names
print(FOLDERS)

# load all data into respective dataframes
dfs = {}
for folder in FOLDERS:
    dfs[folder] = {}
    for filename in USE_FILENAMES:
        dfs[folder][filename] = pd.read_csv(f"{LOCATION}/{folder}/{filename}.csv")

['comment', 'submission', 'subreddit']


In [4]:
# count the number of rows in each dataframe
for folder in FOLDERS:
    for filename in USE_FILENAMES:
        print(f"{folder}/{filename}: {len(dfs[folder][filename])}")

comment/1727069599: 429916
comment/1727016938: 57102
comment/1727020308: 46451
comment/1727024177: 15695
comment/1727160042: 191511
submission/1727069599: 941
submission/1727016938: 100
submission/1727020308: 100
submission/1727024177: 100
submission/1727160042: 880
subreddit/1727069599: 1
subreddit/1727016938: 1
subreddit/1727020308: 1
subreddit/1727024177: 1
subreddit/1727160042: 1


In [5]:
# for each folder, combine all dataframes into one
for folder in FOLDERS:
    dfs[folder] = pd.concat(dfs[folder].values())

# count the number of rows in each dataframe
for folder in FOLDERS:
    print(f"{folder}: {len(dfs[folder])}")

comment: 740675
submission: 2121
subreddit: 5


In [6]:
comment_bodies = dfs["comment"]["body"]

In [7]:
# convert to a list of strings
comment_bodies = comment_bodies.astype(str)

In [8]:
# count the number of words
words = [comment.split(" ") for comment in comment_bodies]
word_count = sum([len(comment) for comment in words])

In [9]:
print(f"Word count: {word_count}")

Word count: 23666731


In [10]:
# use nltk to remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# remove stopwords
words = [[word for word in comment if word.lower() not in stop_words] for comment in words]
word_count = sum([len(comment) for comment in words])
print(f"Word count (no stopwords): {word_count}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adamlass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word count (no stopwords): 13141559


In [11]:
# make a df with top 100 words and their respective counts
word_count_dict = {}
for comment in words:
    for word in comment:
        if word in word_count_dict:
            word_count_dict[word] += 1
        else:
            word_count_dict[word] = 1
items = word_count_dict.items()
switched_items = [(count, word) for word, count in items]
word_df = pd.DataFrame(switched_items, columns=["weight", "word"])
word_df = word_df.sort_values("weight", ascending=False)
word_df = word_df.head(1000)

# save to csv
word_df.to_csv("word_count.csv", index=False)

In [12]:
# get submissions
submissions = dfs["submission"]

# count how many submissions have a url
urls = submissions["url"]
urls = urls.astype(str)
urls = urls.str.contains("http")
urls = urls.sum()
print(f"Submissions with URLs: {urls}")

Submissions with URLs: 2118


In [14]:
print(len(submissions))
print("Percentage with URLs: ", urls / len(submissions))

2121
Percentage with URLs:  0.9985855728429985
