In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt

from reddit_reader.database import Database

from helpers import detect_language

In [None]:
# Open the project's Database wrapper on the local database file.
# The path is relative, so it resolves against the notebook's working directory.
db = Database("reddit_data/reddit.db")

In [None]:
# Show which tables the database exposes; the cells below query `submissions` and `comments`.
db.list_tables()

## Submissions

In [None]:
# Read the entire submissions table into a DataFrame.
submissions = pd.DataFrame(db.query("SELECT * FROM submissions"))
# Trailing expression: display (rows, columns) as a quick sanity check.
submissions.shape

In [None]:
# Eyeball five random rows; no random_state is passed, so the selection changes on every run.
submissions.sample(5)

In [None]:
# Inspect the column dtypes as loaded, before the explicit conversions in the following cell.
submissions.dtypes

In [None]:
# Convert data types
# Cast the count columns to integers in one pass, then derive two extra columns:
# the detected language of the body text and a timestamp parsed from the epoch-seconds value.
submissions = (
    submissions
    .astype(dict.fromkeys(["num_comments", "ups", "downs", "score"], int))
    .assign(
        language=submissions["selftext"].map(detect_language),
        created_dt=pd.to_datetime(submissions["created"].astype(float), unit="s"),
    )
)

In [None]:
# Detected languages
# Count submissions per language code and sort ascending before drawing the horizontal bars.
language_counts = submissions["language"].value_counts()
language_counts = language_counts.sort_values(ascending=True)
language_counts.plot.barh(figsize=(6, 3), title="Detected Language")
# Trailing None keeps the cell from echoing the Axes repr.
None

In [None]:
# Optional spot check (left disabled): uncomment to inspect a few submissions detected as "EN".
# submissions[submissions["language"] == "EN"].sample(3)

In [None]:
# Counts per day
# Step 1: number of distinct submission ids per calendar date.
daily_posts = (
    submissions
    .assign(post_date=submissions["created_dt"].dt.date)
    .groupby("post_date", as_index=False)
    .agg(post_count=("id", "nunique"))
)
# Step 2: scatter of date against count. The y-axis is fixed to 0-8, so days with
# more than eight posts fall outside the visible range.
daily_posts.plot.scatter(
    x="post_date",
    y="post_count",
    title="Posts per Day",
    figsize=(18, 3),
    alpha=0.5,
    ylim=(0, 8),
)

In [None]:
# One histogram per engagement column, side by side, with the median marked by a red line.
columns = ("num_comments", "ups", "score", "downs")

fig, axes = plt.subplots(nrows=1, ncols=len(columns), figsize=(4 * len(columns), 3))
for panel, metric in zip(axes, columns):
    values = submissions[metric]
    values.plot.hist(bins=50, edgecolor="white", ax=panel)
    panel.axvline(values.median(), color="red")
    panel.set_title(metric)

In [None]:
# Number of submissions per flair label, sorted ascending before plotting.
flair_counts = submissions["link_flair_text"].value_counts().sort_values(ascending=True)
flair_counts.plot.barh(figsize=(10, 4), title="Submission Category")
# Trailing None keeps the cell from echoing the Axes repr.
None

In [None]:
# Author "None" means user account was deleted.
# Take the 15 most frequent submission authors, then re-sort ascending for the bar chart.
top_submission_authors = submissions["author"].value_counts().head(15)
top_submission_authors = top_submission_authors.sort_values(ascending=True)
top_submission_authors.plot.barh(figsize=(10, 4), title="Authors")
None

### NLP

In [None]:
# Load Dutch language model.
# NOTE(review): `nl_core_news_md` presumably has to be installed separately in the
# kernel's environment before this call can succeed — confirm the setup instructions.
nlp = spacy.load("nl_core_news_md")

In [None]:
# Run the loaded spaCy pipeline over every submission body and keep the resulting Doc objects.
# This is more than tokenization: the entity cells below read `doc.ents` from these docs.
# NOTE(review): n_process=-1 is forwarded to nlp.pipe; per spaCy's docs this should use
# all available cores — confirm for the installed version.
docs = pd.Series(nlp.pipe(submissions["selftext"], n_process=-1))

In [None]:
# Get entities with ORG label.
# Each Doc contributes a list of lower-cased ORG entity texts. A trailing "'s"
# (straight or typographic apostrophe) is removed so "ING's" and "ING" count together.
# str.strip("'s") is deliberately NOT used here: it treats its argument as a set of
# characters and would also eat leading/trailing "s" letters ("shell" -> "hell").
organizations = (
    docs
    .map(
        lambda doc: [
            name[:-2] if name.endswith(("'s", "’s")) else name
            for name in (
                ent.text.lower() for ent in doc.ents
                if ent.label_ == "ORG"
            )
        ]
    )
    # One organization per row
    .explode()
)

In [None]:
# The 20 most frequently mentioned organizations, re-sorted ascending for the bar chart.
top_organizations = organizations.value_counts().head(20)
top_organizations.sort_values(ascending=True).plot.barh(
    title="Common Organizations",
    figsize=(8, 5),
)
None

In [None]:
# Get entities with PERSON label.
# Each Doc contributes a list of lower-cased PERSON entity texts. A trailing "'s"
# (straight or typographic apostrophe) is removed so "Rutte's" and "Rutte" count together.
# str.strip("'s") is deliberately NOT used here: it treats its argument as a set of
# characters and would also eat leading/trailing "s" letters ("sas" -> "a").
persons = (
    docs
    .map(
        lambda doc: [
            name[:-2] if name.endswith(("'s", "’s")) else name
            for name in (
                ent.text.lower() for ent in doc.ents
                if ent.label_ == "PERSON"
            )
        ]
    )
    # One person per row
    .explode()
)

In [None]:
# The 20 most frequently mentioned persons, re-sorted ascending for the bar chart.
top_persons = persons.value_counts().head(20)
top_persons.sort_values(ascending=True).plot.barh(
    title="Common Persons",
    figsize=(8, 5),
)
None

## Comments

In [None]:
# Read the entire comments table into a DataFrame.
comments = pd.DataFrame(db.query("SELECT * FROM comments"))
# Trailing expression: display (rows, columns) as a quick sanity check.
comments.shape

In [None]:
# Inspect the column dtypes as loaded, before the explicit conversions in the following cell.
comments.dtypes

In [None]:
# Convert data types
# Cast the vote counts to integers and derive language/timestamp columns from the
# comment rows themselves. Deriving them from `submissions` would index-align
# submission values onto unrelated comment rows.
# NOTE(review): assumes the comments table carries a "created" epoch-seconds column
# like the submissions table does — confirm against `comments.dtypes`.
comments = comments.assign(
    ups=comments["ups"].astype(int),
    downs=comments["downs"].astype(int),
    language=comments["body"].map(detect_language),
    created_dt=pd.to_datetime(comments["created"].astype(float), unit="s"),
)

In [None]:
# Author "None" means user account was deleted.
# Take the 15 most frequent comment authors, then re-sort ascending for the bar chart.
top_comment_authors = comments["author"].value_counts().head(15)
top_comment_authors = top_comment_authors.sort_values(ascending=True)
top_comment_authors.plot.barh(figsize=(10, 4), title="Authors")
None

In [None]:
# Deleted versus valid comments.
# A comment counts as deleted when its body is exactly one of the two placeholder strings.
is_placeholder = comments["body"].isin(["[deleted]", "[removed]"])
(
    is_placeholder
    .map({True: "deleted", False: "valid"})
    .value_counts()
    .plot.barh(figsize=(6, 3), title="Deleted Comments")
)

In [None]:
# Up and down votes
# One histogram per vote column, side by side, with the median marked by a red line.
columns = ("ups", "downs")

fig, axes = plt.subplots(nrows=1, ncols=len(columns), figsize=(4 * len(columns), 3))
for panel, metric in zip(axes, columns):
    values = comments[metric]
    values.plot.hist(bins=50, edgecolor="white", ax=panel)
    panel.axvline(values.median(), color="red")
    panel.set_title(metric)

In [None]:
# Comment length
# Word count = number of whitespace-separated tokens in the body. Values above the
# 98th percentile are clipped to that percentile so the long tail does not stretch the axis.
word_counts = comments["body"].map(lambda body: len(body.split()))
post_length = word_counts.clip(upper=word_counts.quantile(.98))

fig, ax = plt.subplots(figsize=(8, 3))
post_length.plot.hist(bins=100, edgecolor="white", ax=ax)
# Red line marks the median of the clipped counts.
ax.axvline(post_length.median(), color="red")
ax.set_title("Comment Word Count")
None

## BERTopic

In [None]:
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
def cleanse_text(text):
    """Return `text` with encoded-character entities and URLs removed and whitespace collapsed.

    The substitutions run in a fixed order:
    1. entities of the form ``&#x<digits>B;`` are dropped,
    2. ``http://`` / ``https://`` URLs (up to the next whitespace) are dropped,
    3. every run of whitespace becomes a single space.

    Leading and trailing whitespace is collapsed but not stripped.
    """
    substitutions = (
        (r"&#x[0-9]+B;", ""),
        (r"https?://[^\s]+", ""),
        (r"[\s\n\r]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Custom Dutch stop words, grouped by kind and concatenated into one flat list.
stop_words = (
    # Articles, conjunctions, prepositions and other function words
    [
        "de", "een", "en", "of", "het", "van", "is", "te", "met",
        "wat", "dat", "dit", "om", "op", "in", "voor", "er", "naar",
    ]
    # Personal and possessive pronouns
    + ["ik", "jij", "je", "jullie", "zij", "hij", "haar", "mijn", "hun", "hen"]
    # Greetings
    + ["hallo", "hi", "groet", "groeten", "groetjes", "welkom"]
    # Thanks
    + ["dank", "bedankt", "alvast"]
)

In [None]:
# Alternative kept for reference (disabled).
# NOTE(review): ClassTfidfTransformer looks like a c-TF-IDF weighting model rather than a
# vectorizer, so if re-enabled it probably belongs in BERTopic's `ctfidf_model` argument
# instead of `vectorizer_model` — confirm against the BERTopic docs.
#vectorizer =  ClassTfidfTransformer(reduce_frequent_words=True)
# Count vectorizer configured with the custom stop-word list; handed to BERTopic below.
vectorizer = CountVectorizer(stop_words=stop_words)
# NOTE(review): no random seed is configured here, so fitted topics may differ between
# runs — confirm whether reproducibility matters for this analysis.
topic_model = BERTopic(
    language="Dutch",
    nr_topics=50,
    vectorizer_model=vectorizer,
)

In [None]:
# Keep only submissions that have body text and were detected as Dutch, then cleanse the body.
has_body = submissions["selftext"] != ""
is_dutch = submissions["language"] == "NL"
texts = submissions.loc[has_body & is_dutch, "selftext"].map(cleanse_text)

In [None]:
# Naive sentence split on "." (literal period), one fragment per row.
# Rows keep the index of the submission they came from, so index values repeat.
sent = texts.str.split(".").explode()

In [None]:
# Drop empty AND whitespace-only fragments. Splitting on "." leaves pieces such as " "
# (e.g. between ". ."), which a plain `!= ""` comparison would keep as documents.
sent = sent[sent.str.strip() != ""]

In [None]:
# Fit the topic model on the sentence fragments.
# `sent` is an exploded Series whose index repeats per submission; BERTopic documents its
# input as a list of strings, so hand it a plain list. `topics` / `probs` stay positionally
# aligned with `sent`.
topics, probs = topic_model.fit_transform(sent.tolist())

In [None]:
# Display the fitted model's per-topic overview table.
topic_model.get_topic_info()