In [1]:
import pandas as pd

# Read the two raw post dumps (JSON Lines, one record per line) and tag every
# row with a binary "depression_related" label.
print("Loading raw datasets...")

# Paths of the raw datasets are given relative to the notebook location.

# Depression posts -> label 1
# (read_json already returns a DataFrame, so no extra wrapping is needed)
data_depression = pd.read_json("../backups/subr_author_posts.jsonl", lines=True)
dep_size = len(data_depression.index)
data_depression["depression_related"] = [1] * dep_size

# Non-depression (control) posts -> label 0
data_control = pd.read_json("../backups/ref_author_posts.jsonl", lines=True)
non_dep_size = len(data_control.index)
data_control["depression_related"] = [0] * non_dep_size

print("Raw datasets successfully loaded")

Loading raw datasets...
Raw datasets successfully loaded


In [2]:
# In case you want to analyze the author posts
def _load_selected_authors(path, post_counts):
    """Read a selected-authors spreadsheet and attach each author's post count.

    Parameters
    ----------
    path : str
        Excel file with one row per author; must contain a ``username`` column.
    post_counts : pandas.Series
        Number of posts indexed by author name (e.g. the result of
        ``posts["author"].value_counts()``).

    Returns
    -------
    pandas.DataFrame
        The authors table without any ``Unnamed: 0`` column, plus an integer
        ``posts_count`` column (0 for authors absent from ``post_counts``).
    """
    # "Unnamed: 0" is presumably the index column written by an earlier
    # to_excel() call -- TODO confirm. errors="ignore" keeps this a no-op when
    # the column is absent, matching the original boolean-mask selection.
    authors = pd.read_excel(path).drop(columns="Unnamed: 0", errors="ignore")
    # Vectorised lookup: usernames missing from the counts map to NaN -> 0.
    authors["posts_count"] = authors["username"].map(post_counts).fillna(0).astype(int)
    return authors


# Posts per author in each cohort, then the author tables completed with them.
dep_authors = data_depression["author"].value_counts(sort=True)
authors_depression = _load_selected_authors("../data/subr_authors_selected.xlsx", dep_authors)

ctrl_authors = data_control["author"].value_counts(sort=True)
authors_reference = _load_selected_authors("../data/ref_authors_selected.xlsx", ctrl_authors)

In [5]:
import date_utils as d

# Pair each depression author with one reference author whose account creation
# date lies within +/- days_diff days and whose comment karma, link karma and
# post count each lie within +/- similarity (relative) of the depression
# author's values.
#
# The creation-date window is widened step by step (days_diffs). Pairs found
# with a narrower window are kept, so every exported file contains all pairs
# found up to and including that window. An account is used in at most one pair.
paired_columns = ["acc_id", "username", "created", "updated", "comment_karma", "link_karma", "posts_count"]
paired_authors = pd.DataFrame(columns=paired_columns)
days_diffs = [30, 60, 90, 120, 150, 180]
similarity = 0.10


def _tolerance_bounds(value, tolerance):
    """Return ``(low, high)`` spanning ``value`` +/- ``tolerance * |value|``.

    Using the absolute value keeps ``low <= high`` for negative inputs (karma
    can be negative); ``value -/+ value * tolerance`` would invert the range
    there and the author could never be matched.
    """
    margin = abs(value) * tolerance
    return value - margin, value + margin


paired_rows = []    # alternating rows: depression author, matched reference author
paired_ids = set()  # acc_id of every account already used in a pair

for days_diff in days_diffs:
    for _, author in authors_depression.iterrows():
        acc_id = author["acc_id"]
        if acc_id in paired_ids:
            # Already paired in this or a narrower window.
            continue

        # Create the ranges
        created = author["created"]
        created_low = d.substract_days_from_epoch(created, days_diff)
        created_high = d.add_days_to_epoch(created, days_diff)
        comment_low, comment_high = _tolerance_bounds(author["comment_karma"], similarity)
        link_low, link_high = _tolerance_bounds(author["link_karma"], similarity)
        posts_low, posts_high = _tolerance_bounds(author["posts_count"], similarity)

        # Query (Series.between is inclusive on both ends, like the original <= checks)
        candidates = authors_reference[
            authors_reference["created"].between(created_low, created_high)
            & authors_reference["comment_karma"].between(comment_low, comment_high)
            & authors_reference["link_karma"].between(link_low, link_high)
            & authors_reference["posts_count"].between(posts_low, posts_high)
        ]

        # Take the first candidate that is not already part of a pair.
        for _, found in candidates.iterrows():
            if found["acc_id"] in paired_ids:
                continue
            paired_rows.extend([author, found])
            paired_ids.update([acc_id, found["acc_id"]])
            break

    if paired_rows:
        # Build the frame once per window instead of growing it row by row
        # (DataFrame.append was removed in pandas 2.0).
        paired_authors = pd.DataFrame(paired_rows).reset_index(drop=True)
        extra_columns = [col for col in paired_authors.columns if col not in paired_columns]
        paired_authors = paired_authors.reindex(columns=paired_columns + extra_columns)
        paired_authors.to_excel("../data/cleaned_authors_{}.xlsx".format(days_diff))