In [1]:
import pickle
import json
import csv
import pandas as pd
import numpy as np
import os

In [2]:
# dataset root holding the timeline tsv files (searched in its subfolders below)
REDDIT_TIMELINE_DIR = "/import/nlp/datasets/MoC/CLPsych_Reddit/"
# the hash table json sits in a "hashing" folder directly under the timeline root
REDDIT_HASH_DIR = REDDIT_TIMELINE_DIR + "hashing/"
# folder with one pickle file per reddit user ("<user>.p")
REDDIT_PICKLE_DIR = "/import/nlp/datasets/Reddit/users/"

In [3]:
# reddit account whose posts are processed by this notebook
USER = "spectralconfetti"

# read the pickled posts of that account
# NOTE(review): unpickling can execute arbitrary code - only load trusted files
user_pickle_path = os.path.join(REDDIT_PICKLE_DIR, USER + ".p")
with open(user_pickle_path, "rb") as pickle_file:
    reddit_data = pickle.load(pickle_file)

In [4]:
# read the json hash table from the hashing directory
hash_table_path = os.path.join(REDDIT_HASH_DIR, "reddit_new_hashed.json")
with open(hash_table_path, "r") as hash_file:
    reddit_hash = json.load(hash_file)

In [5]:
# collect the timeline id of every hash table entry whose key mentions the user
# NOTE(review): this is a substring test, so keys of other users whose name
# contains USER would match as well - confirm against the hash key format
user_timeline_ids = [
    timeline_id
    for hashkey, timeline_id in reddit_hash.items()
    if USER in hashkey
]

print(user_timeline_ids)

['50819d54b1', 'f55ec24987', 'e71b4656e6', '6d8bcf0531']


In [6]:
# resolve every timeline id to the tsv file that holds it
user_timeline_paths = []
for timeline_id in user_timeline_ids:

    # probe the dataset subfolders in order and keep the first hit
    for subfolder in ["test", "train"]:
        file_path = os.path.join(REDDIT_TIMELINE_DIR, subfolder, timeline_id + ".tsv")
        print(file_path)
        if os.path.exists(file_path):
            user_timeline_paths.append(file_path)
            print(f"Found file {file_path}")
            break
    else:
        # the inner loop finished without `break`: no subfolder has this file.
        # Report it instead of dropping the timeline silently, because every
        # later cell only sees the timelines collected in user_timeline_paths.
        print(f"WARNING: no tsv file found for timeline {timeline_id}, skipping it")

/import/nlp/datasets/MoC/CLPsych_Reddit/test/50819d54b1.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/train/50819d54b1.tsv
Found file /import/nlp/datasets/MoC/CLPsych_Reddit/train/50819d54b1.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/test/f55ec24987.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/train/f55ec24987.tsv
Found file /import/nlp/datasets/MoC/CLPsych_Reddit/train/f55ec24987.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/test/e71b4656e6.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/train/e71b4656e6.tsv
Found file /import/nlp/datasets/MoC/CLPsych_Reddit/train/e71b4656e6.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/test/6d8bcf0531.tsv
/import/nlp/datasets/MoC/CLPsych_Reddit/train/6d8bcf0531.tsv


In [7]:
# read each located timeline file (tab separated) into its own DataFrame
user_timeline_files = [
    pd.read_csv(timeline_path, sep='\t') for timeline_path in user_timeline_paths
]

In [8]:
# The post ids of the original posts are not the same as the post ids in the
# timeline files, so every timeline row is matched to an original post by its
# title and the original id is attached to the timeline as column "ogpostid".


def _find_original_id(title, og_titles):
    """Return the original post id for a timeline title, or None if unmatched.

    An exact title match is preferred. Only when there is none does the lookup
    fall back to the first original title (in insertion order) that contains
    the timeline title as a substring. Non-string titles (e.g. NaN produced by
    an empty tsv cell) never match.
    """
    if not isinstance(title, str):
        return None
    if title in og_titles:
        return og_titles[title]
    for og_title, post_id in og_titles.items():
        if title in og_title:
            return post_id
    return None


# map every original title to its post id
# NOTE: posts sharing an identical title collapse onto the last one seen
og_titles = {}
for post in reddit_data:
    og_titles[post["title"]] = post["id"]

for timeline_df in user_timeline_files:
    og_ids = []
    unmatched_titles = []

    # iterate through the timeline titles and look each one up
    for title in timeline_df["title"]:
        og_id = _find_original_id(title, og_titles)
        if og_id is None:
            unmatched_titles.append(title)
        og_ids.append(og_id)

    # Fail with a message that names the offending titles; without this check an
    # unmatched title only surfaces as a column length mismatch on assignment.
    if unmatched_titles:
        raise ValueError(
            f"{len(unmatched_titles)} timeline title(s) have no matching "
            f"original post: {unmatched_titles!r}"
        )

    # add og_ids to the timeline_df as new column
    timeline_df["ogpostid"] = og_ids


In [12]:
# build a lookup of all original posts, keyed by post id

# every post starts with the default label [0]; posts without a "selftext"
# field get an empty body
og_posts = {
    post["id"]: {
        "title": post["title"],
        "body": post.get("selftext", ""),
        "created_utc": post["created_utc"],
        "label": [0]
    }
    for post in reddit_data
}

In [16]:
# turn each timeline into a dictionary entry (its post ids plus an empty summary)
# and copy the timeline labels onto the matching original posts

timeline_dict = {}

for timeline_df in user_timeline_files:
    # original post ids of this timeline, in row order
    post_ids = timeline_df["ogpostid"].tolist()

    # the key names the span of the timeline: "<first id>-<last id>"
    first_id, last_id = post_ids[0], post_ids[-1]
    timeline_dict[f"{first_id}-{last_id}"] = {
        "timeline_of_interest": True,
        "posts": post_ids,
        "summary": "",
    }

    # replace the default label of every post that appears in this timeline
    for _, row in timeline_df.iterrows():
        og_posts[row["ogpostid"]]["label"] = [row["label"]]


In [None]:
# save the posts and the timelines of this user as json
with open(f"public/data/{USER}_posts.json", "w") as f:
    json.dump(og_posts, f, indent=4)

with open(f"public/data/{USER}_timelines.json", "w") as f:
    json.dump(timeline_dict, f, indent=4)

# open the user id json and register the user
user_ids_path = "src/assets/user_ids.json"
with open(user_ids_path, "r") as f:
    user_ids = json.load(f)

# only add the user once, so re-running this cell does not create duplicates
if USER not in user_ids["ids"]:
    user_ids["ids"].append(USER)

# save the json
with open(user_ids_path, "w") as f:
    json.dump(user_ids, f, indent=4)
