In [1]:
import asyncio
import aiohttp
import json
import os
from tqdm.notebook import tqdm

In [2]:
NUM_KIDS = 100 # max number of child ids fetched under each item (applied at every level, not only per story)

# Stories dataset read as input, and comments dataset used as the resumable output/checkpoint
HN_STORIES_DATASET_FILENAME = '../hn_stories_dataset.json'
HN_COMMENTS_DATASET_FILENAME = '../hn_comments_dataset.json'

# NOTE(review): not referenced by any cell visible in this notebook section - confirm it is still needed
KEYWORDS_FILENAME = '../ai_keywords.txt'

# chatgpt_gh_filename = 'github_links_chatgpt.json'

# Define the base URL for the Hacker News API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# Disabled item-id window, kept for reference only
# CHATGPT_RELEASE_ID = 33804874
# START_ID = 31300000 # may 8th 2022
# END_ID = 40300000 # may 9th 2024

DEPTH = 2 # number of comment levels fetched below each story

In [3]:
async def get_top_story_ids(session):
    """Fetch the decoded JSON body of the ``topstories.json`` endpoint.

    Args:
        session: An aiohttp-style client session; only ``session.get(url)``
            used as an async context manager is required.

    Returns:
        Whatever JSON document the endpoint returns.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an HTTP error
        status (``aiohttp.ClientResponseError`` with a real aiohttp session).
    """
    async with session.get(f'{BASE_URL}/topstories.json') as response:
        # Fail loudly on 4xx/5xx instead of returning an error payload as data.
        response.raise_for_status()
        return await response.json()

async def get_item(session, item_id):
    """Fetch a single item by id and return its decoded JSON body.

    Args:
        session: An aiohttp-style client session; only ``session.get(url)``
            used as an async context manager is required.
        item_id: Identifier interpolated into ``{BASE_URL}/item/{item_id}.json``.

    Returns:
        The decoded JSON document for the item.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an HTTP error
        status (``aiohttp.ClientResponseError`` with a real aiohttp session).
    """
    async with session.get(f'{BASE_URL}/item/{item_id}.json') as response:
        # Without this check a JSON error body would be treated as an item
        # and end up stored as a comment with empty fields.
        response.raise_for_status()
        return await response.json()

async def get_kids_hierarchical(session, item, depth=DEPTH):
    """Fetch the comment tree below ``item`` down to ``depth`` levels.

    At each level, at most ``NUM_KIDS`` ids from the parent's ``'kids'`` list
    are fetched concurrently with ``get_item``; falsy results are skipped.

    Args:
        session: Client session forwarded to ``get_item``.
        item: Dict that may contain a ``'kids'`` list of child ids. ``None``
            or a dict without ``'kids'`` yields an empty list.
        depth: Number of levels to descend; ``<= 0`` yields an empty list.

    Returns:
        A list of dicts with keys ``'id'``, ``'text'``, ``'time'``,
        ``'author'``, ``'depth'`` (1 for direct children of ``item``,
        2 for their children, ...) and ``'children'`` (same structure).
    """
    async def _collect(parent, remaining, level):
        # One level of the tree; `level` is tracked explicitly so the stored
        # 'depth' stays correct even when the caller's `depth` differs from
        # the module-level DEPTH constant.
        if not parent or 'kids' not in parent or remaining <= 0:
            return []

        kids = await asyncio.gather(
            *(get_item(session, kid_id) for kid_id in parent['kids'][:NUM_KIDS])
        )

        hierarchy = []
        for kid in kids:
            if not kid:
                continue
            hierarchy.append({
                'id': kid.get('id'),
                'text': kid.get('text'),
                'time': kid.get('time'),
                'author': kid.get('by'),
                'depth': level,
                'children': await _collect(kid, remaining - 1, level + 1),
            })
        return hierarchy

    return await _collect(item, depth, 1)

def load_progress(filename):
    """Return the JSON document stored at ``filename``.

    When no file exists at that path, a fresh empty checkpoint
    (``processed_story_max_id`` of -1 and no stories) is returned instead.
    """
    if not os.path.exists(filename):
        return {'processed_story_max_id': -1, 'stories': []}

    with open(filename, 'r') as checkpoint_file:
        return json.load(checkpoint_file)

def save_progress(data, filename):
    """Write ``data`` to ``filename`` as indented JSON, atomically.

    The document is first written to ``<filename>.tmp`` and then moved over
    the destination with ``os.replace``. If serialisation fails or the write
    is interrupted, the previous contents of ``filename`` are left intact
    rather than truncated.

    Args:
        data: JSON-serialisable object.
        filename: Destination path.

    Raises:
        TypeError / ValueError: If ``data`` is not JSON-serialisable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_filename = f'{filename}.tmp'
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_filename, filename)
    except BaseException:
        # Also covers KeyboardInterrupt / task cancellation: never leave a
        # half-written temp file behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

async def retrieve_stories_comments(stories_source_filename, comments_dest_filename):
    """Fetch comment trees for every story in the source file, resumably.

    Stories already present in the destination checkpoint (matched by
    ``'id'``) are skipped. Every other story gets its comment tree stored
    under ``'kids_text'``, is appended to the checkpoint, and the checkpoint
    is saved after each story.

    Resuming is keyed on the set of ids already stored rather than on a
    "max processed id" watermark, so a story whose fetch failed is retried on
    the next run, and the source does not need to be sorted by id. The
    ``'processed_story_max_id'`` field is still written for compatibility
    with existing checkpoint files.

    Args:
        stories_source_filename: JSON file with a top-level ``'stories'`` list.
        comments_dest_filename: Checkpoint/output JSON file.

    Returns:
        The list of stories (with comments) accumulated in the checkpoint.
    """
    stories_dest = load_progress(comments_dest_filename)
    progress_story_max_id = stories_dest['processed_story_max_id']
    stories_comments_dest = stories_dest['stories']
    curr_num_stories = len(stories_comments_dest)
    processed_ids = {done.get('id') for done in stories_comments_dest}

    stories_source = load_progress(stories_source_filename)
    total_num_stories = len(stories_source['stories'])

    def checkpoint():
        # Persist everything collected so far.
        save_progress({'processed_story_max_id': progress_story_max_id,
                       'stories': stories_comments_dest}, comments_dest_filename)

    async with aiohttp.ClientSession() as session:
        # The context manager closes the bar even when the task is cancelled.
        with tqdm(total=total_num_stories, desc="Fetching comments from stories") as pbar:
            pbar.update(curr_num_stories)

            for story in stories_source['stories']:
                try:
                    story_id = story['id']
                    if story_id in processed_ids:
                        continue

                    story['kids_text'] = await get_kids_hierarchical(session, story, depth=DEPTH)
                    stories_comments_dest.append(story)
                    processed_ids.add(story_id)

                    pbar.update(1)
                    progress_story_max_id = max(progress_story_max_id, story_id)
                    checkpoint()
                except Exception as e:
                    # .get() so that a story without an id cannot make the
                    # error handler itself raise.
                    print(f"Error processing story {story.get('id')}: {e}")
                    checkpoint()
    return stories_comments_dest

In [4]:
try:
    # Load the original JSON file
    with open(HN_STORIES_DATASET_FILENAME, "r") as file:
        data = json.load(file)

    # Process the stories and fetch comments
    processed_data = await retrieve_stories_comments(HN_STORIES_DATASET_FILENAME, HN_COMMENTS_DATASET_FILENAME)

    print('Done')
except Exception as e:
    print(f"Error: {e}")

Fetching comments from stories:   0%|          | 0/807 [00:00<?, ?it/s]

CancelledError: 

# Counting stories and comments

In [5]:
def count_comments(data):
    """Count stories and all comments (including nested replies) in ``data``.

    Args:
        data: Iterable of story dicts. The comment tree is read from
            ``'kids_text'`` or, failing that, ``'comments_hierarchy'``; each
            comment may carry its replies in a ``'children'`` list.

    Returns:
        ``(number_of_stories, number_of_comments)``. Stories with no comment
        tree contribute zero comments instead of raising ``KeyError``.
    """
    def _tree_size(comments):
        # Iterative walk so deep trees cannot hit the recursion limit.
        total = 0
        pending = list(comments or [])
        while pending:
            node = pending.pop()
            total += 1
            pending.extend(node.get('children') or [])
        return total

    stories = list(data)
    num_comments = sum(
        _tree_size(story.get('kids_text') or story.get('comments_hierarchy'))
        for story in stories
    )
    return len(stories), num_comments

# Summarise the dataset returned by the fetching cell. `processed_data` only
# exists if that cell ran to completion in this kernel session; otherwise this
# line raises NameError.
num_stories, num_comments = count_comments(processed_data)
print(f"Total number of stories: {num_stories}")
print(f"Total number of comments: {num_comments}")

NameError: name 'processed_data' is not defined

# Converting the comments dataset from JSON to CSV

In [22]:
import json
import csv
import html
from datetime import datetime
import re
from bs4 import BeautifulSoup

def read_json_file(filename):
    """Open ``filename`` for reading and return its parsed JSON content."""
    with open(filename, 'r') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def extract_github_urls(text):
    """Return the distinct GitHub repo / issue / pull-request URLs in ``text``.

    Matches ``https://github.com/<owner>/<repo>`` optionally followed by
    ``/issues/<n>`` or ``/pull/<n>``.

    Args:
        text: String to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        Unique URLs in order of first appearance.
    """
    if not text:
        return []
    # The hyphen sits last in the repo character class on purpose: written as
    # `0-9-_.` it forms the range '9'..'_' which also matches : ; < = > ? @ [ \ ] ^
    # and drags query strings or markup into the captured URL.
    github_url_pattern = r"https://github\.com/[a-zA-Z0-9-]+/[a-zA-Z0-9_.-]+(?:/(?:issues|pull)/\d+)?"
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(re.findall(github_url_pattern, text)))

def is_github_repo_or_issue(url):
    """Label ``url`` as a GitHub issue, pull request, repository, or non-GitHub link.

    Anything not starting with ``https://github.com/`` is "Not GitHub". For
    GitHub URLs the issue marker is checked before the pull-request marker,
    and "GitHub Repo" is the fallback.
    """
    if not url.startswith("https://github.com/"):
        return "Not GitHub"

    path_markers = (
        ("/issues/", "GitHub Issue"),
        ("/pull/", "GitHub PR"),
    )
    for marker, label in path_markers:
        if marker in url:
            return label
    return "GitHub Repo"

def unix_to_datetime(unix_timestamp):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS``.

    The conversion uses ``datetime.fromtimestamp`` without a timezone
    argument, so the result is expressed in the machine's local time.
    """
    moment = datetime.fromtimestamp(unix_timestamp)
    return f"{moment:%Y-%m-%d %H:%M:%S}"

def clean_html(html_text):
    """Return the visible text of an HTML fragment, or ``''`` for ``None``.

    The fragment is parsed with BeautifulSoup's ``html.parser``; the text
    pieces are stripped and joined with single spaces.
    """
    return (
        ''
        if html_text is None
        else BeautifulSoup(html_text, 'html.parser').get_text(separator=' ', strip=True)
    )

def flatten_comments(comments, discussion_id, title, url, discussion_date, parent_id=None, depth=0):
    """Flatten a nested comment tree into a list of row dicts (pre-order).

    Args:
        comments: List of comment dicts with optional keys ``'id'``,
            ``'text'``, ``'time'``, ``'author'`` and ``'children'``.
        discussion_id, title, url, discussion_date: Story-level values copied
            onto every row.
        parent_id: Id of the parent comment; ``None`` for top-level comments.
        depth: Depth recorded on rows of this level; children get ``depth + 1``.

    Returns:
        One dict per comment with the keys ``discussion_id``, ``title``,
        ``url``, ``discussion_date``, ``comment_id``, ``parent_id``,
        ``depth``, ``comment_text``, ``comment_date``, ``comment_author``.
        Non-list input and non-dict entries are reported with a warning and
        skipped.
    """
    flattened = []
    if not isinstance(comments, list):
        print(f"Warning: comments is not a list. Type: {type(comments)}")
        return flattened

    for i, comment in enumerate(comments):
        if not isinstance(comment, dict):
            print(f"Warning: comment {i} is not a dict. Type: {type(comment)}")
            continue

        comment_id = comment.get('id', '')
        comment_text = comment.get('text')
        if comment_text is None:
            print(f"Warning: comment {comment_id} has None text")
            comment_text = ''
        else:
            comment_text = clean_html(comment_text)
        # The fetcher stores explicit None for missing fields, so a .get()
        # default is not enough: `or` also covers the None-valued case.
        comment_date = unix_to_datetime(comment.get('time') or 0)
        comment_author = comment.get('author') or ''

        flattened.append({
            'discussion_id': discussion_id,
            'title': title,
            'url': url,
            'discussion_date': discussion_date,
            'comment_id': comment_id,
            'parent_id': parent_id,
            'depth': depth,
            'comment_text': comment_text,
            'comment_date': comment_date,
            'comment_author': comment_author,
        })

        children = comment.get('children')
        if children is None:
            # A missing or null child list simply means "no replies".
            children = []
        if isinstance(children, list):
            flattened.extend(flatten_comments(children, discussion_id, title, url, discussion_date, comment_id, depth + 1))
        else:
            print(f"Warning: children for comment {comment_id} is not a list. Type: {type(children)}")

    return flattened

def process_stories(discussions, verbose=True):
    """Convert a stories document into flat CSV rows and collect GitHub URLs.

    Args:
        discussions: Dict with a ``'stories'`` list. Each story's comment
            tree is read from ``'comments_hierarchy'`` or, when that key is
            absent, from ``'kids_text'`` (the key written by
            ``retrieve_stories_comments``).
        verbose: When true (default), print per-discussion and per-comment
            progress lines plus a final summary; pass ``False`` to keep the
            notebook output small.

    Returns:
        ``(rows, github_urls)``: ``rows`` is a list of row dicts (one per
        comment, or one placeholder row for a story with no comment tree);
        ``github_urls`` is the set of GitHub URLs found in story titles and
        URLs.
    """
    rows = []
    github_urls = set()

    for i, discussion in enumerate(discussions['stories']):
        discussion_id = discussion.get('id')
        # `or` guards against keys that are present but null in the JSON.
        title = discussion.get('title') or ''
        url = discussion.get('url') or ''
        discussion_date = unix_to_datetime(discussion.get('time') or 0)

        if verbose:
            print(f"Processing discussion {i+1}: ID {discussion_id}, Title: {title[:30]}...")

        github_urls.update(extract_github_urls(title))
        github_urls.update(extract_github_urls(url))

        comments_hierarchy = discussion.get('comments_hierarchy')
        if comments_hierarchy is None:
            # Datasets produced by the fetcher in this notebook use 'kids_text'.
            comments_hierarchy = discussion.get('kids_text')

        if comments_hierarchy is not None:
            if verbose:
                print(f"  Found {len(comments_hierarchy)} top-level comments")
                for j, comment in enumerate(comments_hierarchy):
                    if not isinstance(comment, dict):
                        # flatten_comments reports malformed entries itself.
                        continue
                    comment_text = comment.get('text', 'N/A')
                    comment_text_preview = comment_text[:30] if comment_text is not None else 'None'
                    print(f"    Comment {j+1}: ID {comment.get('id', 'N/A')}, Author: {comment.get('author', 'N/A')}, Text: {comment_text_preview}...")
            rows.extend(flatten_comments(comments_hierarchy, discussion_id, title, url, discussion_date))
        else:
            if verbose:
                print("  No comments found for this discussion")
            # Placeholder row so the discussion still appears in the output.
            rows.append({
                'discussion_id': discussion_id,
                'title': title,
                'url': url,
                'discussion_date': discussion_date,
                'comment_id': '',
                'parent_id': '',
                'depth': 0,
                'comment_text': '',
                'comment_date': '',
                'comment_author': '',
            })

    if verbose:
        print(f"Processed {len(discussions['stories'])} discussions, found {len(rows)} total comments")
    return rows, github_urls

def write_csv(filename, data, fieldnames):
    """Write ``data`` (an iterable of dicts) to ``filename`` as UTF-8 CSV.

    A header row built from ``fieldnames`` is written first, followed by one
    line per record in iteration order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        for record in data:
            dict_writer.writerow(record)


In [23]:
# Load the comments dataset from disk (not the in-memory `processed_data`),
# so this cell works without re-running the fetch.
chatgpt_data = read_json_file(HN_COMMENTS_DATASET_FILENAME)

# One row per comment; `comments_github_urls` is collected but not used
# further in this cell.
comments_rows, comments_github_urls = process_stories(chatgpt_data)

# Column order of the CSV; the names match the keys of the row dicts built by
# flatten_comments / process_stories.
fieldnames = [
    'discussion_id',
    'title',
    'url',
    'discussion_date',
    'comment_id',
    'parent_id',
    'depth',
    'comment_text',
    'comment_date',
    'comment_author',
]

# Written relative to the notebook's working directory.
write_csv('chatgpt_comments.csv', comments_rows, fieldnames)

Processing discussion 1: ID 31342409, Title: AI-engineered enzyme eats enti...
  Found 9 top-level comments
    Comment 1: ID 31345137, Author: marginalia_nu, Text: I&#x27;ve always sort of thoug...
    Comment 2: ID 31347608, Author: mleonhard, Text: In 2009, a Taiwanese high-scho...
    Comment 3: ID 31357459, Author: foxyv, Text: Why not just bury the stuff? T...
    Comment 4: ID 31348726, Author: ComradePhil, Text: What are the long term effects...
    Comment 5: ID 31346954, Author: slackstation, Text: The title of this link sounds ...
    Comment 6: ID 31347377, Author: newlikeice, Text: Brainstorming. With all the ta...
    Comment 7: ID 31346415, Author: ajb, Text: Hmm, having paid quite a lot t...
    Comment 8: ID 31346919, Author: jrvarela56, Text: Would be cool if this eats the...
    Comment 9: ID 31346186, Author: rolph, Text: fast forward to - AI-engineere...
Processing discussion 2: ID 31355348, Title: BlindAI: Open-source, fast and...
  Found 7 top-level comments
    

  soup = BeautifulSoup(html_text, 'html.parser')


Processing discussion 172: ID 32953344, Title: Nvidia's new AI model quickly ...
  Found 3 top-level comments
    Comment 1: ID 32954580, Author: apozem, Text: One of the biggest issues in t...
    Comment 2: ID 32955991, Author: irq-1, Text: Project website: <a href="http...
    Comment 3: ID 32957892, Author: avmich, Text: Great. Now it would be nice to...
Processing discussion 173: ID 32956013, Title: Shasta: AI-powered audio recor...
  Found 13 top-level comments
    Comment 1: ID 32956636, Author: inasmuch, Text: I know everyone hates Adobe (I...
    Comment 2: ID 32956131, Author: nonoesp, Text: I discovered today that Adobe ...
    Comment 3: ID 32957495, Author: thomasqbrady, Text: Is it still called &quot;Sherl...
    Comment 4: ID 32958096, Author: relwin, Text: The denoising works very well....
    Comment 5: ID 32957037, Author: samdotdesign, Text: Hi! I’m Sam, lead designer of ...
    Comment 6: ID 32956720, Author: jannyfer, Text: This “Mic Check AI” - I actual...
    Com