In [None]:
import os
import logging

from reddit_reader.reader import RedditReader

In [None]:
# Notebook-wide logging setup: INFO and above, with a timestamped,
# pipe-separated record layout (day-month-year timestamps).
LOG_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
LOG_DATE_FORMAT = "%d-%m-%Y %H:%M:%S"

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

### Database setup

In [None]:
# Create the reader used by every cell below.
# NOTE(review): presumably `storage_path` is where scraped data is kept and
# `config_path` points at the reader's YAML settings — confirm against
# RedditReader. The author left an optional `enable_json=False` argument
# disabled here.
STORAGE_PATH = "reddit_data"
CONFIG_PATH = "reddit_reader/default_config.yaml"

reader = RedditReader(storage_path=STORAGE_PATH, config_path=CONFIG_PATH)

In [None]:
# Connect to Reddit with credentials read from environment variables.
#
# os.getenv returns None for a variable that is not set, which would otherwise
# be handed to connect() without any hint. Log the names (never the values) of
# unset variables so a misconfigured environment is visible up front.
# NOTE(review): most shells export USER as the OS login name, so the Reddit
# username may silently be wrong unless USER is overridden — consider a
# dedicated variable name.
credential_env_vars = ("USER", "PASSWORD", "APP_ID", "APP_SECRET")
unset_env_vars = [name for name in credential_env_vars if os.getenv(name) is None]
if unset_env_vars:
    logging.getLogger(__name__).warning(
        "Environment variables not set: %s", ", ".join(unset_env_vars)
    )

reader.connect(
    username=os.getenv("USER"),
    password=os.getenv("PASSWORD"),
    app_id=os.getenv("APP_ID"),
    app_secret=os.getenv("APP_SECRET"),
)

In [None]:
# Names of the subreddits to scrape, in the order they are downloaded.
subreddits = ["beleggen", "beleggenvoorbeginners", "geldzaken"]

In [None]:
# Scrape each listed subreddit with identical settings.
# NOTE(review): presumably `limit` caps the submissions fetched per subreddit
# and `more_comments` caps comment expansion — confirm against
# RedditReader.download.
download_options = {"limit": 250, "more_comments": 150}

for subreddit in subreddits:
    reader.download(subreddit, **download_options)

In [None]:
# Sanity check: total number of rows in the submissions table.
# NOTE(review): this reaches into the private `_database` attribute; prefer a
# public query method if RedditReader offers one.
submission_count_sql = "SELECT COUNT(*) FROM submissions"
reader._database.query(submission_count_sql)

In [None]:
# Number of rows in the submissions table for each subreddit.
posts_per_subreddit_sql = """
    SELECT
        subreddit,
        COUNT(*) AS posts
    FROM submissions
    GROUP BY subreddit;
    """
reader._database.query(posts_per_subreddit_sql)

In [None]:
# Sanity check: total number of rows in the comments table.
comment_count_sql = "SELECT COUNT(*) FROM comments"
reader._database.query(comment_count_sql)

In [None]:
# Number of rows in the comments table for each subreddit.
comments_per_subreddit_sql = """
    SELECT
        subreddit,
        COUNT(*) AS comments
    FROM comments
    GROUP BY subreddit;
    """
reader._database.query(comments_per_subreddit_sql)

In [None]:
# Close the reader once scraping and the count queries above are done.
# NOTE(review): presumably this releases the database connection — confirm.
reader.close()

## Notes

In [None]:
# Column names judged non-informative, collected under the name `drop`.
# The list is not referenced by any other cell shown in this notebook.
# NOTE(review): presumably these are attributes of the scraped submissions —
# confirm. Several remarks below were written for r/beleggen only; re-check
# them for the other entries in `subreddits` before relying on this list.
drop = [
    # Subreddit related fields.
    "comment_limit",
    "comment_sort",
    "_reddit",
    "subreddit_id",                     # Prefer subreddit.
    "subreddit_name_prefixed",          # Prefer subreddit.
    "subreddit_type",                   # Always public for r/beleggen.
    "whitelist_status",                 # Whitelist status.
    "wls",                              # Whitelist status code.
    "parent_whitelist_status",          # Parent whitelist status.
    "pwls",                             # Parent whitelist status code.

    # System fields.
    "approved_at_utc",
    "suggested_sort",
    "is_robot_indexable",
    "is_created_from_ads_ui",
    "allow_live_comments",
    "url_overridden_by_dest",
    "_additional_fetch_params",
    "_comments_by_id",
    "_fetched",

    # Post related.
    "selftext_html",                    # Prefer selftext.
    "permalink",                        # Prefer url.
    "domain",                           # Always self.beleggen.
    "spoiler",                          # Unlikely given subreddit.
    "preview",                          # Seems ad related only.
    "post_hint",                        # Not informative?
    "is_original_content",              # Not informative?
    "is_crosspostable",                 # Not informative?
    "is_meta",                          # Not informative?
    "is_self",                          # Not informative?


    # Layout / flair related.
    "thumbnail",                        # Always "self".
    "thumbnail_width",
    "thumbnail_height",

    "link_flair_type",
    "link_flair_richtext",              # Prefer link_flair_text.
    "link_flair_css_class",
    "link_flair_text_color",
    "link_flair_background_color",
    "link_flair_template_id",

    "author_cakeday",                   # No longer used?
    "author_is_blocked",                # Get from author, not relevant.
    "author_flair_type",
    "author_flair_text",                # Does not seem to be used.
    "author_flair_richtext",            # Prefer author_flair_text.
    "author_flair_css_class",
    "author_flair_text_color",
    "author_flair_background_color",
    "author_flair_template_id",

    "author_premium",                   # Not informative?
    "author_patreon_flair",             # Not filled.

    # Media related.
    "media_only",
    "media_embed",                      # HTML code for embedding.
    "secure_media_embed",               # HTML code for embedding with HTTPS.
    "is_reddit_media_domain",           # Not informative?

    # Account interaction.
    "archived",
    "clicked",
    "distinguished",
    "gilded",
    "hidden",
    "hide_score",
    "pinned",
    "quarantine",
    "saved",
    "stickied",
    "visited",

    "can_mod_post",
    "can_gild",

    # Not interested in cross posts.
    "num_crossposts",
    "crosspost_parent",
    "crosspost_parent_list",

    # No variance in 1000 records.
    "approved_by",
    "banned_by",
    "banned_at_utc",
    "removed_by",
    "removed_by_category",
    "removal_reason",

    "mod_reason_title",
    "mod_reason_by",
    "mod_note",
    "mod_reports",

    "treatment_tags",

    "num_reports",
    "user_reports",
    "report_reasons",

    "gildings",
    "likes",
    "view_count",

    "awarders",
    "all_awardings",
    "top_awarded_type",
    "total_awards_received",

    "category",
    "content_categories",
    "discussion_type",
    "contest_mode",
]


# The author only has these variables:
# - fullname
# - name

# TBD (meaning or usefulness still to be determined):
# - "mod_reason_title": meaning unknown (currently listed in `drop`).
# - "media" / "secure_media": embedded video / pictures.
