In [6]:
import asyncio
import aiohttp
import json
import os
from tqdm.notebook import tqdm

In [15]:
NUM_KIDS = 100 # per story

HN_STORIES_DATASET_FILENAME = '../hn_stories_dataset.json'
HN_COMMENTS_DATASET_FILENAME = '../hn_comments_dataset.json'

KEYWORDS_FILENAME = '../ai_keywords.txt'

# chatgpt_gh_filename = 'github_links_chatgpt.json'

# Define the base URL for the Hacker News API
BASE_URL = 'https://hacker-news.firebaseio.com/v0'

# CHATGPT_RELEASE_ID = 33804874
# START_ID = 31300000 # may 8th 2022
# END_ID = 40300000 # may 9th 2024

DEPTH = 2 # comments depth

In [8]:
async def get_top_story_ids(session):
    async with session.get(f'{BASE_URL}/topstories.json') as response:
        return await response.json()

async def get_item(session, item_id):
    async with session.get(f'{BASE_URL}/item/{item_id}.json') as response:
        return await response.json()

async def get_kids_hierarchical(session, item, depth=DEPTH):
    if 'kids' not in item or depth <= 0:
        return []

    kids_hierarchy = []
    tasks = []
    for kid_id in item['kids'][:NUM_KIDS]:
        tasks.append(get_item(session, kid_id))

    kids = await asyncio.gather(*tasks)
    for kid in kids:
        if kid:
            kid_data = {
                'id': kid.get('id'),
                'text': kid.get('text'),
                'time': kid.get('time'),
                'author': kid.get('by'),
                'depth': DEPTH - depth + 1,
                'children': await get_kids_hierarchical(session, kid, depth - 1)
            }
            kids_hierarchy.append(kid_data)
    return kids_hierarchy

def load_progress(filename):
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            return json.load(f)
    return {'processed_story_max_id': -1, 'stories': []}

def save_progress(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

async def retrieve_stories_comments(stories_source_filename, comments_dest_filename):
    stories_dest = load_progress(comments_dest_filename)
    progress_story_max_id = stories_dest['processed_story_max_id']
    stories_comments_dest = stories_dest['stories']
    curr_num_stories = len(stories_comments_dest)

    stories_source = load_progress(stories_source_filename)
    total_num_stories = len(stories_source['stories'])

    async with aiohttp.ClientSession() as session:
        pbar = tqdm(total=total_num_stories, desc="Fetching comments from stories")
        pbar.update(curr_num_stories)

        for story in stories_source['stories']:
            try:
                if story['id'] <= progress_story_max_id:
                    continue

                story['kids_text'] = await get_kids_hierarchical(session, story, depth=DEPTH)
                stories_comments_dest.append(story)

                pbar.update(1)
                progress_story_max_id = story['id']
                save_progress({'processed_story_max_id': progress_story_max_id,
                               'stories': stories_comments_dest}, comments_dest_filename)
            except Exception as e:
                print(f"Error processing story {story['id']}: {e}")
                save_progress({'processed_story_max_id': progress_story_max_id,
                               'stories': stories_comments_dest}, comments_dest_filename)
        pbar.close()
    return stories_comments_dest

In [10]:
try:
    # Load the original JSON file
    with open(HN_STORIES_DATASET_FILENAME, "r") as file:
        data = json.load(file)

    # Process the stories and fetch comments
    processed_data = await retrieve_stories_comments(HN_STORIES_DATASET_FILENAME, HN_COMMENTS_DATASET_FILENAME)

    print('Done')
except Exception as e:
    print(f"Error: {e}")

Exception ignored in: <function tqdm.__del__ at 0x00000207B6066290>
Traceback (most recent call last):
  File "c:\Users\prach\AppData\Local\Programs\Python\Python310\lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\prach\AppData\Local\Programs\Python\Python310\lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Error: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# counting

In [18]:
def count_comments(data):
    return len(data), sum(len(story['kids_text']) for story in data)

num_stories, num_comments = count_comments(processed_data)
print(f"Total number of stories: {num_stories}")
print(f"Total number of comments: {num_comments}")

Total number of stories: 399
Total number of comments: 14837


# convert from json to csv

In [11]:
import json
import csv
import html
from datetime import datetime
import re
from bs4 import BeautifulSoup

def read_json_file(filename):
    with open(filename, 'r') as f:
        return json.load(f)

def extract_github_urls(text):
    github_url_pattern = r"https://github\.com/[a-zA-Z0-9-]+/[a-zA-Z0-9-_.]+(?:/(?:issues|pull)/\d+)?"
    return list(set(re.findall(github_url_pattern, text)))

def is_github_repo_or_issue(url):
    if not url.startswith("https://github.com/"):
        return "Not GitHub"
    if "/issues/" in url:
        return "GitHub Issue"
    if "/pull/" in url:
        return "GitHub PR"
    return "GitHub Repo"

def unix_to_datetime(unix_timestamp):
    return datetime.fromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M:%S')

def clean_html(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

def flatten_comments(comments, discussion_id, title, url, discussion_date, parent_id=None, depth=0):
    flattened = []
    for comment in comments:
        comment_id = comment.get('id', '')
        comment_text = clean_html(comment.get('text', ''))
        comment_date = unix_to_datetime(comment.get('time', 0))
        comment_author = comment.get('author', '')

        flattened.append({
            'discussion_id': discussion_id,
            'title': title,
            'url': url,
            'discussion_date': discussion_date,
            'comment_id': comment_id,
            'parent_id': parent_id,
            'depth': depth,
            'comment_text': comment_text,
            'comment_date': comment_date,
            'comment_author': comment_author,
        })

        if 'children' in comment:
            flattened.extend(flatten_comments(comment['children'], discussion_id, title, url, discussion_date, comment_id, depth + 1))

    return flattened

def process_stories(discussions):
    rows = []
    github_urls = set()

    for discussion in discussions['stories']:
        discussion_id = discussion.get('id')
        title = discussion.get('title', '')
        url = discussion.get('url', '')
        discussion_date = unix_to_datetime(discussion.get('time', 0))

        github_urls.update(extract_github_urls(title))
        github_urls.update(extract_github_urls(url))

        if 'comments_hierarchy' in discussion:
            rows.extend(flatten_comments(discussion['comments_hierarchy'], discussion_id, title, url, discussion_date))
        else:
            rows.append({
                'discussion_id': discussion_id,
                'title': title,
                'url': url,
                'discussion_date': discussion_date,
                'comment_id': '',
                'parent_id': '',
                'depth': 0,
                'comment_text': '',
                'comment_date': '',
                'comment_author': '',
            })

    return rows, github_urls

def write_csv(filename, data, fieldnames):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)


In [16]:
chatgpt_data = read_json_file(HN_COMMENTS_DATASET_FILENAME)

comments_rows, comments_github_urls = process_stories(chatgpt_data)

fieldnames = [
    'discussion_id',
    'title',
    'url',
    'date',
    'post_id',
    'post_text',]

write_csv('chatgpt_comments.csv', comments_rows, fieldnames)

TypeError: object of type 'NoneType' has no len()