In [None]:
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime, timedelta

In [None]:
# Load data
# cluster_ids / coordinates are concatenated column-wise further down, and
# row_data supplies the comment text. Only the first 500_000 rows of the
# comment dump are read.
cluster_ids = pd.read_csv("../../data/rowClustered.csv")
coordinates = pd.read_csv("../../data/t-SNE_projected.csv")
row_data = pd.read_csv("../../data/big_data/the-reddit-climate-change-dataset-comments.csv", nrows=500_000)
# Decode the JSON explicitly as UTF-8: without `encoding`, open() falls back to
# the platform's locale encoding, which can corrupt non-ASCII cluster names.
with open("../../data/names.json", encoding="utf-8") as json_file:
    cluster_names = json.load(json_file)

In [None]:
# Generate timestamps
def generate_timestamps(year, count):
    """Return `count` random naive datetimes that fall inside `year`.

    Each value is the start of the year plus a random fraction (drawn with
    `random.random()`) of the span between January 1st 00:00:00 and
    December 31st 23:59:59 of that year.

    Args:
        year: Calendar year to sample from.
        count: Number of datetimes to produce.

    Returns:
        A list of `count` `datetime` objects, in the order they were drawn.
    """
    year_start = datetime(year, 1, 1)
    year_span = datetime(year, 12, 31, 23, 59, 59) - year_start

    sampled = []
    for _ in range(count):
        # One random draw per timestamp, scaled onto the year's span.
        sampled.append(year_start + year_span * random.random())
    return sampled

# Number of comments to place in each year (year -> comment count).
# Each entry is later passed to generate_timestamps(year, count).
distribution = {
    2011: 5_000,
    2012: 10_000,
    2013: 14_500,
    2014: 17_500,
    2015: 21_300,
    2016: 36_100,
    2017: 51_200,
    2018: 71_000,
    2019: 61_000,
    2020: 49_300,
    2021: 62_000,
    2022: 101_100,
}

In [None]:
# Generate timestamps for each year
# Seed the global RNG first so that Restart & Run All reproduces exactly the
# same synthetic timestamps (and therefore the same result.csv).
random.seed(42)
timestamps = []
for year, count in distribution.items():
    timestamps.extend(generate_timestamps(year, count))

# Shuffle timestamps so that row order is not correlated with the year
random.shuffle(timestamps)

# Convert timestamps to Unix time.
# The generated datetimes are naive. `datetime.timestamp()` would interpret
# them in the machine's local timezone, while the column is stored as
# `created_utc` and later decoded as UTC; measuring the offset from the Unix
# epoch directly treats them as UTC and makes the result machine-independent.
unix_epoch = datetime(1970, 1, 1)
timestamps_unix = [int((ts - unix_epoch).total_seconds()) for ts in timestamps]

# Assign timestamps to the dataframe.
# The per-year counts must add up to the number of loaded rows; fail with an
# explicit message instead of pandas' generic length-mismatch error.
if len(timestamps_unix) != len(row_data):
    raise ValueError(
        f"Generated {len(timestamps_unix)} timestamps for {len(row_data)} rows; "
        "the counts in `distribution` must sum to the number of rows in `row_data`."
    )
row_data["created_utc"] = timestamps_unix


In [None]:
# Process the data
# Join cluster ids and projected coordinates side by side, drop the leftover
# CSV index column, then attach the comment text, the readable cluster name
# (looked up by the stringified id) and the timestamp as a datetime.
result = (
    pd.concat([cluster_ids, coordinates], axis=1)
    .drop(columns=["Unnamed: 0"])
    .assign(
        body=row_data["body"],
        cluster_id=lambda frame: frame["cluster_id"].apply(
            lambda raw_id: cluster_names[str(raw_id)]
        ),
        created_utc=pd.to_datetime(row_data["created_utc"], unit="s"),
    )
)


In [40]:
# Write the combined table to disk, leaving out the DataFrame index.
result.to_csv(path_or_buf="../../data/result.csv", index=False)