# process comments dataset

## RQ2: remove stories before ChatGPT release

In [None]:
# Paths for the RQ2 date filter. These are notebook-level names that the
# filter_comments(...) call below reads.
# NOTE(review): input_filename is assigned twice in this cell; the second
# assignment (the rq1_comments_all.csv path) is the one that takes effect.
# Confirm which input file is intended and drop the other.
input_filename = '/content/drive/MyDrive/datasets/data/hn_comments_dataset_final.csv' # Replace with your input CSV file
output_filename = '/content/drive/MyDrive/datasets/data/rq2/hn_comments_dataset_final_after_chatgpt_release.csv' # Replace with your desired output CSV file
input_filename = '/content/drive/MyDrive/datasets/muict-naist-senior/rq1/rq1_comments_all.csv'

In [None]:
import csv
from datetime import datetime

def filter_comments(input_file, output_file, date_threshold):
    """
    Copy the rows dated on or after ``date_threshold`` into a new CSV file.

    The input must start with a header row that contains a
    ``discussion_date`` column whose values look like ``YYYY-MM-DD HH:MM:SS``.
    The header is written to the output unchanged, followed by every row
    whose date is greater than or equal to ``date_threshold``.

    Rows whose date is missing or cannot be parsed are reported on stdout and
    skipped; they never abort the run. File-level problems are printed rather
    than raised, so the function always returns ``None``.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the CSV file to create or overwrite.
        date_threshold (datetime): Earliest date (inclusive) to keep.
    """
    date_column = 'discussion_date'
    date_format = '%Y-%m-%d %H:%M:%S'

    try:
        # newline='' lets the csv module handle line endings itself, which
        # keeps newlines embedded in quoted fields intact.
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames

            # Validate the header before opening the output file so that an
            # empty or wrongly-shaped input does not truncate existing output.
            if not fieldnames or date_column not in fieldnames:
                print(f"Error: Column '{date_column}' not found in '{input_file}'.")
                return

            with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                writer.writeheader()

                for row in reader:
                    try:
                        comment_date = datetime.strptime(row[date_column], date_format)
                    except (ValueError, TypeError):
                        # ValueError: text that does not match date_format.
                        # TypeError: a short row, where DictReader fills the
                        # missing column with None.
                        print(f"Skipping row due to invalid date format: {row}")
                        continue

                    if comment_date >= date_threshold:
                        try:
                            writer.writerow(row)
                        except ValueError:
                            # DictWriter rejects rows carrying more fields
                            # than the header declares.
                            print(f"Skipping row with unexpected extra fields: {row}")
        print(f"Filtered comments written to {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


In [None]:
# Cutoff for the RQ2 subset: filter_comments keeps rows whose discussion_date
# is on or after this instant (2022-12-01 00:00:00, compared with >=).
threshold_date = datetime(2022, 12, 1) # After ChatGPT release date

# Reads input_filename and writes the filtered rows to output_filename, both
# set in the configuration cell above.
filter_comments(input_filename, output_filename, threshold_date)

Filtered comments written to /content/drive/MyDrive/datasets/data/rq2/hn_comments_dataset_final_after_chatgpt_release.csv


# process story dataset

## RQ1 and RQ3: remove non-GitHub repository rows

In [None]:
import csv
import re
from urllib.parse import urlparse, unquote

In [None]:
# First path segments that GitHub uses for its own site pages rather than for
# a user or organisation, so "<segment>/<anything>" is not a repository.
_GITHUB_NON_OWNER_SEGMENTS = frozenset({
    'about',
    'apps',
    'collections',
    'contact',
    'customer-stories',
    'enterprise',
    'explore',
    'features',
    'login',
    'marketplace',
    'notifications',
    'organizations',
    'orgs',
    'pricing',
    'readme',
    'search',
    'security',
    'settings',
    'site',
    'sponsors',
    'topics',
    'trending',
    'users',
})


def is_github_repo_url(url):
    """
    Check if a URL is a GitHub repository root URL.

    Returns True for URLs like 'https://github.com/username/repository'
    (a trailing slash, a 'www.' prefix, an explicit port and any letter case
    in the host name are accepted).
    Returns False for URLs pointing to specific files, commits, pulls, gists,
    etc., for GitHub site pages such as 'https://github.com/topics/python',
    for other hosts, and for anything that is not a parseable string.

    Args:
        url (str): The URL to test.

    Returns:
        bool: True if the URL points at a repository root, False otherwise.
    """
    if not isinstance(url, str):
        return False

    try:
        # The URL is percent-decoded before parsing, so an encoded '/' counts
        # as a path separator.
        parsed = urlparse(unquote(url))
        host = (parsed.hostname or '').lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return False

    if host.startswith('www.'):
        host = host[len('www.'):]
    if host != 'github.com':
        return False

    # Split the path into its non-empty components; this also absorbs a
    # trailing slash.
    parts = [p for p in parsed.path.split('/') if p]

    # A repository root has exactly two components (owner/repository). Deeper
    # paths such as /blob/, /tree/, /pull/ or /issues/ are rejected here.
    if len(parts) != 2:
        return False

    return parts[0].lower() not in _GITHUB_NON_OWNER_SEGMENTS

def filter_github_urls(input_file, output_file, url_column):
    """
    Filter CSV file to keep only rows with valid GitHub repository URLs.

    The header row is copied to the output unchanged. Each data row is kept
    only if is_github_repo_url() accepts the whitespace-stripped value of
    ``url_column``; rows with no value in that column are dropped.

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to output CSV file
        url_column (str): Name of the column containing URLs

    Raises:
        ValueError: If the input has no header row or the header does not
            contain ``url_column``. The output file is not touched in that
            case.
    """
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(input_file, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames

        # fieldnames is None for an empty file; treat that like a missing
        # column instead of failing on the membership test.
        if not fieldnames or url_column not in fieldnames:
            raise ValueError(f"Column '{url_column}' not found in CSV file")

        with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                # DictReader fills columns missing from a short row with None.
                url = (row[url_column] or '').strip()
                if is_github_repo_url(url):
                    writer.writerow(row)


In [None]:
# Paths for the story-dataset filter. This rebinds the notebook-level
# input_filename / output_filename names that the comments cells also use.
input_filename = '/content/drive/MyDrive/datasets/data/hn_stories_dataset_gh_final.csv'
output_filename = '/content/drive/MyDrive/datasets/data/hn_stories_dataset_gh_final_v2.csv'
url_column = "url"  # Change this to match your CSV column name

# Writes the header plus every row whose url_column value passes
# is_github_repo_url() to output_filename.
filter_github_urls(input_filename, output_filename, url_column)

# brief stats

## comments

In [None]:
# prompt: import csv and count the number of entries (excluding headers). just in case it's useful, comments are defined by comment_id column

import csv

def count_entries(filename):
    """
    Count the data rows of a CSV file, excluding the header row.

    Blank lines are not counted, and an empty file simply yields 0.

    Args:
        filename (str): Path to the CSV file.

    Returns:
        int: Number of non-blank rows after the header. 0 is also returned,
        after printing a message, when the file is missing or cannot be read.
    """
    try:
        # newline='' lets the csv module handle line endings itself, so a
        # quoted field containing a newline still counts as one row.
        with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header row; the default avoids StopIteration on an
            # empty file.
            next(reader, None)
            # csv.reader yields an empty list for a blank line; those are
            # not entries.
            return sum(1 for row in reader if row)
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return 0
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0

# Count the rows of the comments dataset and print the total.
# NOTE(review): no cell visible in this section writes
# hn_comments_dataset_final_v2.csv - confirm where this file is produced.
filename = '/content/drive/MyDrive/datasets/data/hn_comments_dataset_final_v2.csv'
entry_count = count_entries(filename)
print(f"{entry_count} comments")

2963 comments


## stories

In [None]:
# prompt: import csv and count the number of entries (excluding headers). just in case it's useful, stories are defined by discussion_id column

def count_stories(filename):
    """
    Return how many distinct ``discussion_id`` values a CSV file contains.

    Args:
        filename (str): Path to a CSV file with a header row.

    Returns:
        int: Number of unique values in the ``discussion_id`` column. 0 is
        returned, after printing a message, when the file is missing or any
        other error occurs while reading it (for example when the column is
        absent).
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            # A set keeps one copy of each id, so its size is the story count.
            unique_ids = {record['discussion_id'] for record in csv.DictReader(handle)}
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return 0
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0
    return len(unique_ids)

# Count the distinct discussion_id values in the filtered story dataset (the
# file written by the filter_github_urls cell above) and print the total.
filename = '/content/drive/MyDrive/datasets/data/hn_stories_dataset_gh_final_v2.csv'
story_count = count_stories(filename)
print(f"{story_count} stories")

300 stories
