In [11]:
import pandas as pd

# Load the two raw post collections and tag every row with its class label.
print("Loading raw datasets...")

# Set the *paths* of the raw datasets below.

# Depression posts: every row gets depression_related = 1.
# read_json already returns a DataFrame, so it is used directly.
data_depression = pd.read_json("../backups/subr_author_posts.jsonl", lines=True)
data_depression["depression_related"] = 1  # scalar is broadcast to every row
dep_size = len(data_depression.index)

# Control (non-depression) posts: every row gets depression_related = 0.
data_control = pd.read_json("../backups/ref_author_posts.jsonl", lines=True)
data_control["depression_related"] = 0  # scalar is broadcast to every row
non_dep_size = len(data_control.index)

print("Raw datasets succesfully loaded")

Loading raw datasets...
Raw datasets succesfully loaded


In [12]:
# Run this cell only if the authors have been further preprocessed with the
# 'authors_preprocessing' notebooks.

import tools

# Switches for the two optional filtering steps below.
clean_authors, clean_subreddits = True, True

authors_dep = pd.read_excel("../data/cleaned_authors_180.xlsx")
remove_subreddits = tools.list_excluded_subreddits("../data/dep_subreddits.txt", ["depression"])

if clean_authors:
    # Usernames at even positions of the sheet filter the depression frame and
    # those at odd positions filter the control frame.
    # NOTE(review): presumably the sheet alternates depression/control authors - confirm.
    usernames = authors_dep["username"].tolist()
    is_kept_dep_author = data_depression["author"].isin(usernames[0::2])
    data_depression = data_depression.loc[is_kept_dep_author]
    is_kept_ctrl_author = data_control["author"].isin(usernames[1::2])
    data_control = data_control.loc[is_kept_ctrl_author]

if clean_subreddits:
    # Drop every post whose subreddit appears in remove_subreddits.
    in_removed_dep = data_depression["subreddit"].isin(remove_subreddits)
    data_depression = data_depression.loc[~in_removed_dep]
    in_removed_ctrl = data_control["subreddit"].isin(remove_subreddits)
    data_control = data_control.loc[~in_removed_ctrl]

In [13]:
# Percentage of posts that we want to be in the test set.
# Datasets should be ordered in descending order of date (created_utc): the
# cut-off row is picked by position, and the ordering is not re-checked here.
percentage = 20

# Number of leading depression rows spanned by the cut; the last one is the cut-off row.
n_leading_rows = int(len(data_depression) * (percentage / 100))
if n_leading_rows < 1:
    # Without this guard head(0).tail(1) is empty and the .iloc[0] lookups below
    # fail with an unhelpful IndexError (a negative count would pick a wrong row).
    raise ValueError(
        "Cannot compute a cut-off row: {}% of {} depression posts is less than one row".format(
            percentage, len(data_depression)))

cut_off_row = data_depression.head(n_leading_rows).tail(1)
cut_off_date, cut_off_id = cut_off_row["created_utc"].iloc[0], cut_off_row["id"].iloc[0]

# True for rows dated at or before the cut-off. The cut-off row itself (and any
# row sharing its timestamp) is therefore True; only strictly newer rows are False.
# The saving cells write masked rows to the training files and the rest to testing.
control_mask = data_control["created_utc"] <= cut_off_date
depression_mask = data_depression["created_utc"] <= cut_off_date

if not control_mask.any():
    # The control cut-off ID below is the first masked control row; fail clearly
    # when there is none.
    raise ValueError("No control post is dated at or before the cut-off date")

print("Cut off date: {}".format(pd.to_datetime(cut_off_date, unit="s")))
# First control row (in frame order) that falls at or before the cut-off date.
print("Cut off ID for control: {}".format(data_control.loc[control_mask, "id"].iloc[0]))
print("Cut off ID for depression: {}".format(cut_off_id))

Cut off date: 2018-10-04 13:45:48
Cut off ID for control: 9l929z
Cut off ID for depression: 9lc0z9


In [14]:
# Set the *paths* where the cut datasets are saved.

# Save the splits with all the features (all columns). Each entry is
# (source frame, row mask, output path); masked rows go to the training file
# and the remaining rows to the testing file.
raw_split_targets = [
    (data_depression, depression_mask, "../datasets/training/raw_dep_authors_training.jsonl"),
    (data_depression, ~depression_mask, "../datasets/testing/raw_dep_authors_testing.jsonl"),
    (data_control, control_mask, "../datasets/training/raw_ctrl_authors_training.jsonl"),
    (data_control, ~control_mask, "../datasets/testing/raw_ctrl_authors_testing.jsonl"),
]

for source_frame, row_mask, target_path in raw_split_targets:
    # One JSON record per line, non-ASCII characters escaped.
    source_frame[row_mask].to_json(path_or_buf=target_path, orient="records",
                                   lines=True, force_ascii=True)

print("Raw training/testing datasets generated")

Raw training/testing datasets generated


In [15]:
# Set the *paths* where the cut datasets are saved.

# Save the splits with only the columns required for the text classification task.
text_columns = ["title", "selftext", "depression_related"]

# Each entry is (source frame, row mask, output path); masked rows go to the
# training file and the remaining rows to the testing file.
cut_split_targets = [
    (data_depression, depression_mask, "../datasets/training/raw_dep_authors_training_cut.jsonl"),
    (data_depression, ~depression_mask, "../datasets/testing/raw_dep_authors_testing_cut.jsonl"),
    (data_control, control_mask, "../datasets/training/raw_ctrl_authors_training_cut.jsonl"),
    (data_control, ~control_mask, "../datasets/testing/raw_ctrl_authors_testing_cut.jsonl"),
]

for source_frame, row_mask, target_path in cut_split_targets:
    # Restrict to the text columns and the selected rows, then write one JSON record per line.
    subset = source_frame.loc[row_mask, text_columns]
    subset.to_json(path_or_buf=target_path, orient="records", lines=True, force_ascii=True)

print("Processed training/testing datasets generated")


Processed training/testing datasets generated
