In [3]:
import asyncio
import aiohttp
import json
import os
from tqdm.notebook import tqdm

In [4]:
# Upper bound on how many direct children are fetched for each item; the
# slice is applied at every recursion level, not only on the story itself.
NUM_KIDS = 100 # max direct replies fetched per item (story or comment)

# Input file holding the stories, and the checkpoint/output file that receives
# the same stories with their comment texts attached.
CHATGPT_PROGRESS_FILENAME = 'chatgpt_progress.json'
CHATGPT_COMMENTS_PROGRESS_FILENAME = 'chatgpt_comments_progress.json'

# NOTE(review): not referenced in the visible cells - presumably used by the
# notebook that produced the stories file; confirm before removing.
KEYWORDS_FILENAME = '../ai_keywords.txt'

# chatgpt_gh_filename = 'github_links_chatgpt.json'

# Define the base URL for the Hacker News API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# CHATGPT_RELEASE_ID = 33804874
# START_ID = 31300000 # may 8th 2022
# END_ID = 40300000 # may 9th 2024

# Number of reply levels followed below a story (1 = direct replies only).
DEPTH = 2 # comments depth

In [16]:
import random

async def get_top_story_ids(session):
    """Fetch ``{BASE_URL}/topstories.json`` and return its decoded JSON body.

    Args:
        session: object exposing ``get(url)`` as an async context manager
            (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        Whatever the endpoint's JSON decodes to (presumably a list of story
        ids - confirm against the Hacker News API docs).
    """
    endpoint = f'{BASE_URL}/topstories.json'
    async with session.get(endpoint) as resp:
        payload = await resp.json()
    return payload

async def get_item(session, item_id):
    """Fetch a single item from ``{BASE_URL}/item/<item_id>.json``.

    Args:
        session: object exposing ``get(url)`` as an async context manager
            (an ``aiohttp.ClientSession`` in this notebook).
        item_id: identifier interpolated into the item URL.

    Returns:
        The decoded JSON body of the response.
    """
    item_url = f'{BASE_URL}/item/{item_id}.json'
    async with session.get(item_url) as resp:
        item = await resp.json()
    return item

async def get_kids_text(session, item, depth=DEPTH):
    """Collect the ``text`` of an item's descendants, depth-first.

    At most ``NUM_KIDS`` direct children of ``item`` are fetched (concurrently);
    each child that has a ``text`` field contributes its text followed by the
    texts of its own descendants, down to ``depth`` levels. Children that are
    falsy or have no ``text`` are skipped together with their replies.

    Args:
        session: HTTP session forwarded to ``get_item``.
        item: mapping that may contain a ``kids`` list of child ids.
        depth: remaining number of levels to descend.

    Returns:
        A flat list of comment texts in pre-order.
    """
    # Recursion stops when there is nothing below this item or the depth budget is spent.
    if 'kids' not in item or depth <= 0:
        return []

    child_ids = item['kids'][:NUM_KIDS]
    children = await asyncio.gather(
        *(get_item(session, child_id) for child_id in child_ids)
    )

    collected = []
    for child in children:
        if not child or 'text' not in child:
            continue
        collected.append(child['text'])
        # Descend one level; sub-threads are walked one child at a time.
        collected += await get_kids_text(session, child, depth - 1)
    return collected

def load_progress(filename):
    """Read a progress file, or return an empty progress record.

    Args:
        filename: path of the JSON progress file.

    Returns:
        The parsed JSON content when the file exists; otherwise a fresh
        ``{'processed_story_max_id': -1, 'stories': []}`` dict.
    """
    if not os.path.exists(filename):
        # A new dict every call so callers can mutate it freely.
        return {'processed_story_max_id': -1, 'stories': []}

    with open(filename, 'r') as progress_file:
        saved_state = json.load(progress_file)
    return saved_state

def save_progress(data, filename):
    """Write ``data`` to ``filename`` as indented JSON without risking the old file.

    The JSON is first written to ``<filename>.tmp`` and then moved over the
    destination with ``os.replace``. If serialisation or the write fails (or
    the kernel is interrupted mid-write), the previous progress file is left
    untouched instead of being truncated.

    Args:
        data: JSON-serialisable object to store.
        filename: destination path of the progress file.

    Raises:
        TypeError / ValueError: if ``data`` cannot be serialised by ``json``.
        OSError: if the temporary file cannot be written or moved.
    """
    tmp_path = f"{filename}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(data, f, indent=4)
            # Push the bytes to disk before the rename makes them "the" file.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, filename)
    except BaseException:
        # Do not leave a half-written temp file behind; keep the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

async def retrieve_stories_comments(stories_source_filename, comments_dest_filename):
    """Attach comment texts to every source story, checkpointing after each one.

    Stories are read from ``stories_source_filename`` and, once their comments
    have been fetched with ``get_kids_text``, appended (with a new
    ``kids_text`` key) to the progress file ``comments_dest_filename``.

    Resuming: a story is skipped only if its id is already present in the
    destination file. A story whose fetch failed is therefore retried on the
    next run, instead of being skipped forever because a later story with a
    higher id succeeded.

    Args:
        stories_source_filename: JSON file with a ``stories`` list of dicts.
        comments_dest_filename: JSON progress/output file; created on first save.

    Returns:
        The list of stories (each with ``kids_text``) stored in the destination.
    """
    stories_dest = load_progress(comments_dest_filename)
    # Kept in the file for backward compatibility: highest story id processed so far.
    progress_story_max_id = stories_dest['processed_story_max_id']
    stories_comments_dest = stories_dest['stories']
    # Ids that really made it into the destination file.
    processed_ids = {done.get('id') for done in stories_comments_dest}

    stories_source = load_progress(stories_source_filename)
    source_stories = stories_source['stories']

    async with aiohttp.ClientSession() as session:
        pbar = tqdm(total=len(source_stories), desc="Fetching comments from stories", )
        try:
            pbar.update(len(stories_comments_dest))

            for story in source_stories:
                try:
                    story_id = story['id']
                    # Already fetched and saved in a previous run (or duplicated in the source).
                    if story_id in processed_ids:
                        continue

                    story['kids_text'] = await get_kids_text(session, story, depth=DEPTH)
                    stories_comments_dest.append(story)
                    processed_ids.add(story_id)

                    # Update progress and checkpoint to disk after every story.
                    pbar.update(1)
                    progress_story_max_id = max(progress_story_max_id, story_id)
                    save_progress({'processed_story_max_id': progress_story_max_id,
                                   'stories': stories_comments_dest}, comments_dest_filename)
                except Exception as e:
                    # Best effort: report, checkpoint what we have, move on to the
                    # next story. `.get` so a story without an id cannot raise here.
                    print(f"Error processing story {story.get('id')}: {e}")
                    save_progress({'processed_story_max_id': progress_story_max_id,
                                   'stories': stories_comments_dest}, comments_dest_filename)
        finally:
            # Release the progress bar even if something unexpected escapes the loop.
            pbar.close()
    return stories_comments_dest

In [17]:
# Driver cell: fetch comments for every story in the stories file and
# checkpoint them into the comments progress file.
# Uses top-level `await`, so it only runs inside IPython/Jupyter.
try:
    # Load the original JSON file
    # NOTE(review): `data` is not used again in the visible cells; opening the
    # file here does make a missing source file surface as an error below
    # rather than being treated as an empty story list by load_progress.
    with open(CHATGPT_PROGRESS_FILENAME, "r") as file:
        data = json.load(file)

    # Process the stories and fetch comments
    processed_data = await retrieve_stories_comments(CHATGPT_PROGRESS_FILENAME, CHATGPT_COMMENTS_PROGRESS_FILENAME)

    print('Done')
except Exception as e:
    # Any failure is only printed; `processed_data` is then left undefined
    # (or stale from an earlier run) for the cells that follow.
    print(f"Error: {e}")

Fetching comments from stories:   0%|          | 0/399 [00:00<?, ?it/s]

Done


# Counting stories and comments

In [18]:
def count_comments(data):
    """Return ``(number of stories, total number of comment texts)``.

    Args:
        data: sized collection of story dicts, each with a ``kids_text`` list.

    Returns:
        A 2-tuple: ``len(data)`` and the summed length of every ``kids_text``.
    """
    story_total = len(data)
    comment_total = 0
    for story in data:
        comment_total += len(story['kids_text'])
    return story_total, comment_total

# Summarise what was fetched. `processed_data` is assigned by the fetch cell
# above, so this fails with NameError on a fresh kernel if that cell errored.
num_stories, num_comments = count_comments(processed_data)
print(f"Total number of stories: {num_stories}")
print(f"Total number of comments: {num_comments}")

Total number of stories: 399
Total number of comments: 14837


# Convert from JSON to CSV

In [24]:
import json
import html
from datetime import datetime
import csv
import random
import re
from bs4 import BeautifulSoup

def read_json_file(filename):
    """Parse the JSON document stored at ``filename`` and return the result.

    Raises whatever ``open`` / ``json.load`` raise for a missing or invalid file.
    """
    with open(filename, 'r') as source:
        parsed = json.load(source)
    return parsed

def extract_github_urls(text):
    """Return the distinct GitHub repo / issue / pull-request URLs found in ``text``.

    A match is ``https://github.com/<owner>/<repo>`` optionally followed by
    ``/issues/<n>`` or ``/pull/<n>``. The repo part accepts letters, digits,
    ``-``, ``_`` and ``.``, so a sentence-ending period directly after the URL
    is captured as part of it.

    Returns:
        A list of unique URL strings, in no particular order.
    """
    github_url_pattern = r"https://github\.com/[a-zA-Z0-9-]+/[a-zA-Z0-9-_.]+(?:/(?:issues|pull)/\d+)?"
    unique_urls = {hit.group(0) for hit in re.finditer(github_url_pattern, text)}
    return list(unique_urls)

def is_github_repo_or_issue(url):
    """Classify ``url`` as a GitHub issue, pull request, repo, or non-GitHub link.

    Returns:
        ``"Not GitHub"`` unless the URL starts with ``https://github.com/``;
        otherwise ``"GitHub Issue"`` if it contains ``/issues/``,
        ``"GitHub PR"`` if it contains ``/pull/``, else ``"GitHub Repo"``.
    """
    if not url.startswith("https://github.com/"):
        return "Not GitHub"

    # Checked in order: the issue marker wins when both appear.
    for marker, label in (("/issues/", "GitHub Issue"), ("/pull/", "GitHub PR")):
        if marker in url:
            return label
    return "GitHub Repo"

def unix_to_datetime(unix_timestamp):
    """Convert a Unix timestamp (seconds) to a naive ``datetime`` in local time."""
    local_moment = datetime.fromtimestamp(unix_timestamp)
    return local_moment

def clean_html(html_text):
    """Strip markup from ``html_text`` and return its text content.

    The fragment is parsed with BeautifulSoup's ``html.parser``; the text
    pieces are stripped and joined with single spaces.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text(separator=' ', strip=True)

def process_stories(discussions):
    """Flatten stories and their comments into CSV-ready rows.

    For every story in ``discussions['stories']``:
      * if it has a ``kids_text`` key, one row is produced per comment, with
        ``post_id`` = ``"<story id>_<comment index>"`` and ``post_text`` = the
        comment with HTML stripped by ``clean_html``;
      * otherwise a single row with empty ``post_id`` / ``post_text`` is produced.

    GitHub URLs found in each story's title and url are collected on the side.

    Args:
        discussions: dict with a ``stories`` list of story dicts.

    Returns:
        ``(rows, github_urls)``: a list of row dicts with the keys
        ``discussion_id, title, url, date, post_id, post_text`` and a set of
        GitHub URL strings.
    """
    rows = []
    github_urls = set()

    for discussion in discussions['stories']:
        discussion_id = discussion.get('id')
        title = discussion.get('title', '')
        url = discussion.get('url', '')
        # Raw value of the story's `time` field (0 when absent); not converted.
        date = discussion.get('time', 0)

        github_urls.update(extract_github_urls(title))
        github_urls.update(extract_github_urls(url))

        # Columns shared by every row of this story.
        story_fields = {
            'discussion_id': discussion_id,
            'title': title,
            'url': url,
            'date': date,
        }

        if 'kids_text' in discussion:
            # NOTE(review): a story whose `kids_text` is an empty list yields no
            # row at all, unlike a story without the key - confirm this is intended.
            for i, kid_text in enumerate(discussion['kids_text']):
                rows.append({
                    **story_fields,
                    'post_id': f'{discussion_id}_{i}',
                    'post_text': clean_html(kid_text),
                })
        else:
            rows.append({**story_fields, 'post_id': '', 'post_text': ''})

    return rows, github_urls

def write_csv(filename, data, fieldnames):
    """Write ``data`` (an iterable of dicts) to ``filename`` as UTF-8 CSV.

    A header row built from ``fieldnames`` is written first, then one line per
    dict. The file is opened with ``newline=''`` so the csv module controls
    line endings.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        for record in data:
            csv_writer.writerow(record)

In [None]:
# Export cell: read the checkpointed stories-with-comments file, flatten it to
# one row per comment, and write the result to `chatgpt_comments.csv`.
chatgpt_data = read_json_file(CHATGPT_COMMENTS_PROGRESS_FILENAME)

# `comments_github_urls` is collected but not written anywhere in this cell.
comments_rows, comments_github_urls = process_stories(chatgpt_data)

# Column order of the CSV; must match the keys of the rows built by process_stories.
fieldnames = [
    'discussion_id',
    'title',
    'url',
    'date',
    'post_id',
    'post_text',]

write_csv('chatgpt_comments.csv', comments_rows, fieldnames)