# Reddit - The Ultimate Dataset for Everything
## *scroll down for details*

## Module installation

In [None]:
!pip install praw psaw wordcloud fontTools nltk

In [None]:
import datetime
from collections import Counter
import json
import praw
from psaw import PushshiftAPI
from datetime import date
import os
import string
import datetime
from tqdm.auto import tqdm

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import fontTools

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize  
stop_words = set(stopwords.words('english'))

## PRAW configuration

In [None]:
# An example config file has been attached. 
# For client_id, client_secret and user_agent please refer to the related post:
# https://www.reddit.com/r/redditdev/comments/hasnnc/where_do_i_find_the_reddit_client_id_and_secret/
def load_praw_config(path = "../praw_config.json"):
    """Load the PRAW credentials from a JSON file.

    Args:
        path: Location of the JSON configuration file.

    Returns:
        The decoded JSON document. The code below this function reads the
        keys ``client_id``, ``client_secret`` and ``user_agent`` from it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)

# Read the credentials once and build the Reddit client used by the
# functions further down in this notebook.
configuration = load_praw_config()

reddit = praw.Reddit(
    client_id=configuration["client_id"],
    client_secret=configuration["client_secret"],
    user_agent=configuration["user_agent"],
)

## Dataset preparation

In [None]:
def process_x(submission, from_pushshift = False):
    """Split a submission into its attribute dict and its score.

    Args:
        submission: An object exposing ``title`` and ``score`` attributes,
            or, when ``from_pushshift`` is true, a mapping with an ``"id"``
            key.
        from_pushshift: When true, ``submission["id"]`` is resolved through
            the module-level ``reddit`` client and the resulting object is
            processed instead of the mapping.

    Returns:
        A tuple ``(attributes, score)`` where ``attributes`` is a dict of the
        object's instance attributes without ``"score"`` and ``score`` is the
        removed value.

    Raises:
        KeyError: If the object has no ``score`` instance attribute.
    """
    if from_pushshift:
        submission = reddit.submission(submission["id"])

    # Deliberate bare attribute access, kept from the original code.
    # NOTE(review): presumably this forces PRAW to fetch the lazily loaded
    # submission before its attributes are read -- confirm.
    submission.title

    # vars() hands back the object's own __dict__; copy it so that removing
    # "score" does not strip the attribute from the submission itself.
    attributes = dict(vars(submission))
    score = attributes.pop("score")

    return attributes, score

def dataset_iterator(subreddit_name='politics', past_dataset_size = 100, before_time=None):
    """Yield ``(attributes, score)`` pairs for submissions of a subreddit.

    Submissions are searched through Pushshift and each hit is passed to
    ``process_x`` with ``from_pushshift=True``.

    Args:
        subreddit_name: Subreddit to search in.
        past_dataset_size: Passed to the Pushshift search as ``limit``.
        before_time: ``datetime.datetime`` passed (as a POSIX timestamp) to
            the search as ``before``. Defaults to the current time.

    Yields:
        The ``(attributes, score)`` tuple returned by ``process_x`` for each
        search result.
    """
    api = PushshiftAPI()

    if not before_time:
        # Must be a datetime: a plain ``date`` has no ``timestamp()`` method,
        # which made the previous ``date.today()`` default fail.
        before_time = datetime.datetime.now()

    before_time_timestamp = int(before_time.timestamp())

    search_results = api.search_submissions(
        subreddit=subreddit_name,
        limit=past_dataset_size,
        before=before_time_timestamp,
    )

    for result in search_results:
        # ``d_`` is the raw dict form of the Pushshift result.
        yield process_x(result.d_, from_pushshift = True)

## Wordcloud generation and text processing

In [None]:
def clean_text(text):
    """Normalise raw post text into a space-separated string of words.

    Steps, in order: hyphenated line breaks are joined, every occurrence of
    the substrings "deleted" and "removed" is dropped, punctuation (except
    "-"), digits and "∗" are stripped, the text is tokenised, lower-cased and
    filtered against the module-level ``stop_words`` set.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Sequential replacements; the order matches the original pipeline.
    for fragment in ("-\r\n", "-\n", "deleted", "removed"):
        text = text.replace(fragment, "")

    # Characters to strip: all punctuation but the hyphen, plus digits and "∗".
    unwanted_chars = string.punctuation.replace("-", "") + string.digits + "∗"
    text = text.translate(str.maketrans('', '', unwanted_chars))

    kept_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words:
            kept_words.append(lowered)

    return " ".join(kept_words)

def create_wordcloud(text_counts, save_to_file = False, filename = "wordcloud.pdf"):
    """Draw a word cloud from word frequencies and optionally save it as PDF.

    The cloud is always rendered into a new matplotlib figure. When
    ``save_to_file`` is true it is additionally exported to SVG, converted to
    PDF with svglib/reportlab, and the intermediate SVG is deleted.

    Args:
        text_counts: Mapping of word to frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        save_to_file: Whether to write the PDF file.
        filename: Path of the PDF to write. The temporary SVG is written next
            to it as ``filename + ".svg"``.
    """
    wordcloud = WordCloud(
        font_path = "arial",
        width = 1000,
        height = 1000,
        random_state=1,
        colormap = "viridis",
        mode = "RGBA",
        background_color=None,
        collocations=False,
    )

    wordcloud.generate_from_frequencies(text_counts)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")

    if not save_to_file:
        return

    # Imported lazily so the extra packages are only needed when exporting.
    from svglib.svglib import svg2rlg
    from reportlab.graphics import renderPDF

    svg_path = filename + ".svg"
    try:
        # The context manager closes the handle even if writing fails.
        with open(svg_path, "w", encoding="utf-8") as svg_file:
            svg_file.write(wordcloud.to_svg())

        drawing = svg2rlg(svg_path)
        renderPDF.drawToFile(drawing, filename)
    finally:
        # Never leave the intermediate SVG behind, even when conversion fails.
        if os.path.exists(svg_path):
            os.remove(svg_path)

def get_subbreddit_name(phrase):
    """Return the display name of the first subreddit found for ``phrase``.

    Args:
        phrase: Search phrase handed to the subreddit search.

    Returns:
        The ``display_name`` of the first search result, or ``None`` when the
        search returns nothing.
    """
    matches = list(praw.models.Subreddits(reddit, _data = None).search(phrase))
    if not matches:
        return None
    return matches[0].display_name

# Search for subreddit and generate wordclouds!

## 1. Specify the search phrases

In [None]:
# Each phrase is resolved to a subreddit by the processing loop.
SEARCH_TOPICS = [
    "music",
    "rock",
    "The Beatles",
]

## 2. Specify the numbers of posts to be processed 
### (Each number will produce a different wordcloud!)

In [None]:
# One pair of wordclouds (titles and posts) is generated per entry.
LAST_N_POST_COUNT = [
    200,
]

### (Optionally) Specify the date BEFORE which the posts should be fetched (default: current time)

In [None]:
# Leave as None to fall back to the current time ...
BEFORE_DATE = None
# ... or give an explicit 'dd/mm/yy HH:MM:SS' string:
BEFORE_DATE = '01/09/20 00:00:00'

# From here on BEFORE_DATE is always a datetime object.
BEFORE_DATE = (
    datetime.datetime.strptime(BEFORE_DATE, '%d/%m/%y %H:%M:%S')
    if BEFORE_DATE
    else datetime.datetime.now()
)

## 3. Wait for posts to be downloaded and wordcloud to generate...
### (this depends on how fast Reddit works today)

In [None]:
# For every search topic: resolve it to a subreddit, download the requested
# number of posts and build one wordcloud from the titles and one from the
# post bodies.
for topic in tqdm(SEARCH_TOPICS):
    print("Processing search topic: {}".format(topic))
    subreddit_name = get_subbreddit_name(topic)

    if subreddit_name is None:
        # Nothing matched the phrase; without this guard the search would run
        # with subreddit=None and the files would be named "None_...".
        print("No subreddit found for search topic: {}".format(topic))
        continue

    for past_dataset_size in LAST_N_POST_COUNT:
        print("Processing last {} posts from subreddit r/{} (https://www.reddit.com/r/{}/)...".format(past_dataset_size, subreddit_name, subreddit_name))

        # Fresh counters per dataset size: every size downloads its own posts,
        # so carrying words over would count the earlier posts again.
        posts_word_counts = Counter()
        titles_word_counts = Counter()

        for x, _ in tqdm(dataset_iterator(subreddit_name = subreddit_name, past_dataset_size = past_dataset_size, before_time = BEFORE_DATE)):
            # Only words longer than two characters are counted.
            posts_word_counts.update(word for word in clean_text(x["selftext"]).split() if len(word) > 2)
            titles_word_counts.update(word for word in clean_text(x["title"]).split() if len(word) > 2)

        date_tag = BEFORE_DATE.strftime('%d-%m-%y_%H-%M-%S')
        title_wordcloud_file_path = "{}_last_{}_titles_wordcloud_{}.pdf".format(subreddit_name, past_dataset_size, date_tag)
        post_wordcloud_file_path = "{}_last_{}_posts_wordcloud_{}.pdf".format(subreddit_name, past_dataset_size, date_tag)

        written_files = []
        for word_counts, file_path in (
            (titles_word_counts, title_wordcloud_file_path),
            (posts_word_counts, post_wordcloud_file_path),
        ):
            if not word_counts:
                # WordCloud raises on an empty frequency table (e.g. when all
                # posts are links without body text), so skip this file.
                print("No words to plot for {}, skipping.".format(file_path))
                continue
            create_wordcloud(word_counts, save_to_file = True, filename = file_path)
            written_files.append(file_path)

        # Report only after the files have actually been written.
        if written_files:
            print("\rDone. See files {}".format(" and ".join(written_files)))

## 4. Enjoy the wordclouds!
### (saved in the same folder as this notebook)

## 5. Now return to 1. and check if you can find topics that do NOT appear on Reddit *(spoiler: it's hard!)*