# Retrieving GitHub Metrics for each discussion

setup

In [11]:
import pandas as pd
import requests
import time
from datetime import datetime
import json
import os
from urllib.parse import urlparse
from tqdm.notebook import tqdm
import traceback

In [2]:
# GitHub API credentials
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
print(GITHUB_TOKEN)

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


In [12]:
# File paths
INPUT_CSV = '../../data/hn_stories_dataset_gh_final.csv'
OUTPUT_JSON = '../../data/hn_gh.csv'
PROGRESS_FILE = 'progress.json'
ERROR_LOG = 'error_log.txt'

# Maximum number of retries
MAX_RETRIES = 3

In [13]:
def is_github_repo_url(url):
    """Check if the URL is a valid GitHub repository URL."""
    parsed = urlparse(url)
    return parsed.netloc == 'github.com' and len(parsed.path.split('/')) == 3

def get_repo_stats(owner, repo):
    """Retrieve star and fork history for a repository."""
    stars_url = f'https://api.github.com/repos/{owner}/{repo}/stargazers'
    forks_url = f'https://api.github.com/repos/{owner}/{repo}/forks'

    stars_history = []
    forks_history = []

    for url in [stars_url, forks_url]:
        page = 1
        while True:
            response = requests.get(f'{url}?page={page}&per_page=100', headers=HEADERS)
            if response.status_code == 200:
                data = response.json()
                if not data:
                    break
                for item in data:
                    timestamp = datetime.strptime(item['starred_at'] if 'starred_at' in item else item['created_at'], '%Y-%m-%dT%H:%M:%SZ').timestamp()
                    if 'starred_at' in item:
                        stars_history.append((timestamp, page * 100 + len(stars_history) + 1))
                    else:
                        forks_history.append((timestamp, page * 100 + len(forks_history) + 1))
                page += 1
            elif response.status_code == 403:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                raise Exception(f"Rate limit exceeded. Need to sleep for {sleep_time} seconds.")
            else:
                raise Exception(f"HTTP Error: {response.status_code}, {response.text}")

    return {'stars_history': stars_history, 'forks_history': forks_history}

def log_error(url, error_msg):
    """Log errors to a file."""
    with open(ERROR_LOG, 'a') as f:
        f.write(f"{datetime.now().isoformat()} - URL: {url} - Error: {error_msg}\n")

def process_repos():
    """Process repositories from the CSV file and retrieve their stats."""
    df = pd.read_csv(INPUT_CSV)

    # Load progress if it exists
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as f:
            progress = json.load(f)
    else:
        progress = {'processed': 0, 'stats': {}, 'valid_urls': 0, 'invalid_urls': 0, 'error_urls': 0}

    # Create tqdm progress bar
    pbar = tqdm(total=len(df), initial=progress['processed'], desc="Processing repositories")

    for index, row in df.iloc[progress['processed']:].iterrows():
        url = row['url']
        if not is_github_repo_url(url):
            progress['invalid_urls'] += 1
            pbar.update(1)
            continue

        progress['valid_urls'] += 1
        owner, repo = url.split('/')[-2:]

        for attempt in range(MAX_RETRIES):
            try:
                stats = get_repo_stats(owner, repo)
                progress['stats'][url] = {
                    'discussion_id': row['discussion_id'],
                    'title': row['title'],
                    'date': row['date'],
                    'stars_history': stats['stars_history'],
                    'forks_history': stats['forks_history']
                }
                break
            except Exception as e:
                error_msg = f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
                if attempt == MAX_RETRIES - 1:
                    pbar.write(f"Failed to process {url} after {MAX_RETRIES} attempts.")
                    pbar.write(f"Error: {error_msg}")
                    log_error(url, error_msg)
                    progress['error_urls'] += 1
                else:
                    pbar.write(f"Attempt {attempt + 1} failed for {url}. Retrying...")
                    pbar.write(f"Error: {error_msg}")
                    if "Rate limit exceeded" in str(e):
                        sleep_time = int(str(e).split()[-2])
                        pbar.write(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
                        time.sleep(sleep_time)
                    else:
                        time.sleep(5)

        progress['processed'] = index + 1

        # Save progress
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(progress, f)

        pbar.update(1)
        pbar.set_postfix({
            'Valid': progress['valid_urls'],
            'Invalid': progress['invalid_urls'],
            'Errors': progress['error_urls']
        })

    pbar.close()

    # Save final results
    with open(OUTPUT_JSON, 'w') as f:
        json.dump(progress, f)

    print(f"Processing complete. Valid URLs: {progress['valid_urls']}, Invalid URLs: {progress['invalid_urls']}, Error URLs: {progress['error_urls']}")
    print(f"Detailed error log saved to {ERROR_LOG}")

In [14]:
# Run the processing
process_repos()

Processing repositories:   2%|2         | 8/354 [00:00<?, ?it/s]

Attempt 1 failed for https://github.com/chidiwilliams/buzz. Retrying...
Error: KeyError: 'created_at'
Traceback (most recent call last):
  File "C:\Users\prach\AppData\Local\Temp\ipykernel_14716\4281034150.py", line 69, in process_repos
    stats = get_repo_stats(owner, repo)
  File "C:\Users\prach\AppData\Local\Temp\ipykernel_14716\4281034150.py", line 23, in get_repo_stats
    timestamp = datetime.strptime(item['starred_at'] if 'starred_at' in item else item['created_at'], '%Y-%m-%dT%H:%M:%SZ').timestamp()
KeyError: 'created_at'

Attempt 2 failed for https://github.com/chidiwilliams/buzz. Retrying...
Error: KeyError: 'created_at'
Traceback (most recent call last):
  File "C:\Users\prach\AppData\Local\Temp\ipykernel_14716\4281034150.py", line 69, in process_repos
    stats = get_repo_stats(owner, repo)
  File "C:\Users\prach\AppData\Local\Temp\ipykernel_14716\4281034150.py", line 23, in get_repo_stats
    timestamp = datetime.strptime(item['starred_at'] if 'starred_at' in item else it

KeyboardInterrupt: 