In [1]:
import pandas as pd

# Load dataset files
print("Loading raw datasets...")


def _load_labelled_posts(path, depression_related):
    """Read a JSON-lines file into a DataFrame and attach the class label.

    Args:
        path: Location of a JSON-lines file (one JSON record per line).
        depression_related: Value written to the ``depression_related`` column
            of every row (1 for the depression set, 0 for the control set).

    Returns:
        DataFrame holding the file's records plus the ``depression_related`` column.
    """
    # read_json already returns a DataFrame; no extra pd.DataFrame(...) wrapper needed.
    posts = pd.read_json(path, lines=True)
    # A scalar is broadcast to every row, so no list of len(posts) labels is built.
    posts["depression_related"] = depression_related
    return posts


# Depression posts
data_depression = _load_labelled_posts("../backups/r_depression_base.jsonl", 1)  # Dep. identifier: true
dep_size = len(data_depression)

# Non-depression (control) posts
data_control = _load_labelled_posts("../backups/reference_collection.jsonl", 0)  # Dep. identifier: false
non_dep_size = len(data_control)

print("Raw datasets succesfully loaded")

Loading raw datasets...
Raw datasets succesfully loaded


In [2]:
# Percentage of posts that we want to be in the test set
percentage = 20

# Number of leading depression rows inspected to locate the cut-off post.
# NOTE(review): the count is derived from len(data_control), not len(data_depression);
# it only equals `percentage` % of the depression posts when both frames have the
# same length -- confirm this is intended.
head_count = int(len(data_control) * (percentage / 100))

# The last of those leading rows is the cut-off post.
# NOTE(review): presumably both frames are ordered newest-first by created_utc -- verify.
cut_off_row = data_depression.head(head_count).tail(1)
cut_off_date = cut_off_row["created_utc"].iloc[0]
cut_off_id = cut_off_row["id"].iloc[0]

# True for posts created at or before the cut-off timestamp.
control_mask = data_control["created_utc"] <= cut_off_date
depression_mask = data_depression["created_utc"] <= cut_off_date

print("Cut off date: {}".format(pd.to_datetime(cut_off_date, unit="s")))
# First control post (in frame order) that falls at or before the cut-off.
print("Cut off ID for control: {}".format(data_control.loc[control_mask, "id"].iloc[0]))
print("Cut off ID for depression: {}".format(cut_off_id))

Cut off date: 2019-04-16 18:57:09
Cut off ID for control: bdxkhq
Cut off ID for depression: bdxph5


In [8]:
# Save the datasets with all the features (all columns)
# Each entry: (source frame, boolean row selector, output file). Rows selected by the
# mask go to the training file, rows selected by its complement go to the testing file.
raw_exports = [
    (data_depression, depression_mask, "../datasets/training/raw_dep_training.jsonl"),
    (data_depression, ~depression_mask, "../datasets/testing/raw_dep_testing.jsonl"),
    (data_control, control_mask, "../datasets/training/raw_ctrl_training.jsonl"),
    (data_control, ~control_mask, "../datasets/testing/raw_ctrl_testing.jsonl"),
]

for frame, selector, out_path in raw_exports:
    # One JSON record per line, non-ASCII characters escaped.
    frame[selector].to_json(path_or_buf=out_path, orient="records", lines=True, force_ascii=True)

print("Raw training/testing datasets generated")

Raw training/testing datasets generated


In [3]:
# Save the datasets with only the data required for our task of text classification
text_columns = ["title", "selftext", "depression_related"]

# Each entry: (source frame, boolean row selector, output file). Rows selected by the
# mask go to the training file, rows selected by its complement go to the testing file.
cut_exports = [
    (data_depression, depression_mask, "../datasets/training/raw_dep_training_cut.jsonl"),
    (data_depression, ~depression_mask, "../datasets/testing/raw_dep_testing_cut.jsonl"),
    (data_control, control_mask, "../datasets/training/raw_ctrl_training_cut.jsonl"),
    (data_control, ~control_mask, "../datasets/testing/raw_ctrl_testing_cut.jsonl"),
]

for frame, selector, out_path in cut_exports:
    # Select rows and the three kept columns in a single .loc call, then write
    # one JSON record per line with non-ASCII characters escaped.
    frame.loc[selector, text_columns].to_json(path_or_buf=out_path, orient="records", lines=True,
                                              force_ascii=True)

print("Processed training/testing datasets generated")


Processed training/testing datasets generated
