In [None]:
import pandas as pd
import requests
import re
from datetime import datetime, timezone
import html
import json
from difflib import SequenceMatcher
from tqdm.notebook import tqdm

# Load the datasets
hn_stories = pd.read_csv('./hn-stories-gh-ai-[no-dupes].csv')
github_metadata = pd.read_csv('./hn-stories-gh-ai-metadata.csv')

def get_hn_item(item_id):
    url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

def get_hn_user(username):
    url = f"https://hacker-news.firebaseio.com/v0/user/{username}.json"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

def extract_identifiers(text):
    if not text:
        return set()

    # Decode HTML entities
    text = html.unescape(text)

    # Extract GitHub usernames
    github_patterns = [
        r'github\.com/(\w+)',
        r'github\.com/orgs/(\w+)',
        r'github\.com/organizations/(\w+)',
        r'@(\w+) on GitHub'
    ]

    # Extract email addresses
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

    # Extract social media handles
    social_patterns = [
        r'x\.com/(\w+)',
        r'twitter\.com/(\w+)',
        r'facebook\.com/(\w+)',
        r'linkedin\.com/in/(\w+)',
        r'instagram\.com/(\w+)'
    ]

    extracted = set()

    for pattern in github_patterns + social_patterns:
        extracted.update(re.findall(pattern, text, re.IGNORECASE))

    extracted.update(re.findall(email_pattern, text))

    # Extract potential company names or projects (sequences of 2+ words)
    extracted.update(re.findall(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b', text))

    # Extract domain names from URLs
    extracted.update(re.findall(r'https?://(?:www\.)?([a-zA-Z0-9.-]+)\.(?:[a-zA-Z]{2,})', text))

    return extracted

def unix_to_utc(timestamp):
    return datetime.fromtimestamp(timestamp, tz=timezone.utc)

def str_to_utc(date_str):
    return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S%z").astimezone(timezone.utc)

def string_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def is_likely_same_user(hn_username, github_repo_owner, extracted_identifiers, about_section):
    # Convert all to lowercase for case-insensitive comparison
    hn_username = hn_username.lower()
    github_repo_owner = github_repo_owner.lower()
    extracted_identifiers = {id.lower() for id in extracted_identifiers}
    about_section = about_section.lower()

    # Direct match
    if hn_username == github_repo_owner:
        return True

    # Check if github_repo_owner is in extracted identifiers
    if github_repo_owner in extracted_identifiers:
        return True

    # Check for partial matches and high similarity
    for identifier in extracted_identifiers:
        if (github_repo_owner in identifier or
            identifier in github_repo_owner or
            string_similarity(identifier, github_repo_owner) > 0.8):
            return True

    # Check for email domain match
    email_domains = [id.split('@')[-1].split('.')[0] for id in extracted_identifiers if '@' in id]
    if any(domain == github_repo_owner for domain in email_domains):
        return True

    # Check for company name or project name match
    words = re.findall(r'\b\w+\b', github_repo_owner)
    if len(words) > 1:
        company_name = ' '.join(words)
        if company_name in about_section:
            return True

    return False


In [9]:
results = []
for _, story in tqdm(hn_stories.iterrows(), total=hn_stories.shape[0], desc="Finding matching username"):
    hn_item = get_hn_item(story['id'])
    if not hn_item:
        continue

    hn_username = hn_item.get('by')
    hn_user = get_hn_user(hn_username)

    about_section = hn_user.get('about', '') if hn_user else ''
    extracted_identifiers = extract_identifiers(about_section)

    github_repo_owner = story['url'].split('github.com/')[1].split('/')[0]

    is_same_user = is_likely_same_user(hn_username, github_repo_owner, extracted_identifiers, about_section)

    repo_data = github_metadata[github_metadata['repo_full_name'].str.startswith(github_repo_owner + '/')]
    # repo_created_at = str_to_utc(repo_data['repo_creation_date'].iloc[0]) if not repo_data.empty else None

    # hn_submission_date = unix_to_utc(story['time'])

    results.append({
        'id': story['id'],
        'hn_username': hn_username,
        'github_repo_owner': github_repo_owner,
        'is_likely_same_user': is_same_user,
        # 'repo_created_at': repo_created_at,
        # 'hn_submission_date': hn_submission_date,
        # 'submitted_before_creation': hn_submission_date < repo_created_at if repo_created_at else None,
        'extracted_identifiers': json.dumps(list(extracted_identifiers))  # Store as JSON for CSV compatibility
    })

result_df = pd.DataFrame(results)
result_df.to_csv('hn_github_user_comparison.csv', index=False)

same_user_count = result_df['is_likely_same_user'].sum()
total_stories = len(result_df)
percentage_same_user = (same_user_count / total_stories) * 100

print(f"Total stories analyzed: {total_stories}")
print(f"Stories where HN poster is likely the same as GitHub repo owner: {same_user_count}")
print(f"Percentage of stories with likely same user: {percentage_same_user:.2f}%")

submitted_before_creation = result_df['submitted_before_creation'].sum()
percentage_before_creation = (submitted_before_creation / total_stories) * 100

print(f"Stories submitted before repo creation: {submitted_before_creation}")
print(f"Percentage of stories submitted before repo creation: {percentage_before_creation:.2f}%")

Finding matching username:   0%|          | 0/1814 [00:00<?, ?it/s]

Total stories analyzed: 1814
Stories where HN poster is likely the same as GitHub repo owner: 471
Percentage of stories with likely same user: 25.96%


KeyError: 'submitted_before_creation'