# info

This notebook searches GitHub for repositories related to AI/ML/LLM topics and saves the resulting list of repository URLs. The list serves as the control dataset for the propensity score matching (PSM) step of the RQ3 causal-inference analysis.

# setup

## setup output paths

In [1]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import time
from urllib.parse import urlparse
import os
from tqdm.notebook import tqdm

In [2]:
# Output locations (v2 file set) on the mounted Google Drive.
# All three files live in the same folder, so the folder is spelled out once.
DATA_DIR = "/content/drive/MyDrive/datasets/hn_rq3_treatment"

# JSON checkpoint file for the control-repository collection.
CHECKPOINT_PATH = f"{DATA_DIR}/hn_rq3_repos_control_checkpoint_v2.json"
# CSV output for the control repositories.
# NOTE(review): the trailing underscore in "v2_.csv" looks unintentional — confirm before renaming.
OUTPUT_PATH = f"{DATA_DIR}/hn_rq3_repos_control_v2_.csv"
# CSV holding the list of control repository URLs.
URLS_PATH = f"{DATA_DIR}/hn_rq3_repos_control_urls_v2.csv"


## setup github api

In [3]:
# Fetch the value stored under the key 'GITHUB_TOKEN' through google.colab's
# userdata helper. This import only resolves when the notebook runs inside
# Google Colab.
from google.colab import userdata
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')

In [4]:
# GitHub API authentication
# Fail fast on a missing/empty token so later API calls do not run unauthenticated.
# The token comes from google.colab userdata, not from an environment variable,
# so the message points the user at the right place to fix it.
if not GITHUB_TOKEN:
    raise ValueError(
        "GITHUB_TOKEN is empty: add it to the Colab secrets (read via userdata.get) "
        "before running this notebook"
    )

# Default request headers: token authentication plus the v3 JSON media type.
headers = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

# v5: collect AI/ML repository URLs via GitHub search

In [8]:
# Search terms used to find AI/ML repositories; each term is sent to the
# GitHub repository search as its own query.
AI_ML_SEARCH_QUERIES = [
    # broad field names
    'machine-learning', 'artificial-intelligence', 'deep-learning',
    'neural-networks', 'data-science',
    # AI / LLM related terms
    'ai', 'ai-research', 'llm', 'transformer', 'openai', 'ml-models',
    # frameworks and libraries
    'pytorch', 'tensorflow', 'scikit-learn', 'keras',
    # application areas
    'nlp', 'generative-ai',
]

def generate_ai_ml_repo_list(
    created_date_start: str = '2022-06-01',
    created_date_end: str = '2024-06-01',
    min_stars: int = 1,
    max_stars: int = 50000,
    min_forks: int = 1,
    total_repos: int = 500
) -> list:
    """
    Generate a list of AI/ML GitHub repositories.

    Sends one GitHub repository search per entry of AI_ML_SEARCH_QUERIES,
    visiting the entries in random order. Only the first result page of each
    search is requested (up to 100 repositories, sorted by stars descending).
    Unique repository URLs are collected until ``total_repos`` have been
    gathered or every query has been tried. A query whose request fails is
    reported with ``print`` and skipped.

    :param created_date_start: Start of the ``created:`` date range (e.g. '2022-06-01')
    :param created_date_end: End of the ``created:`` date range (e.g. '2024-06-01')
    :param min_stars: Lower bound of the ``stars:`` range
    :param max_stars: Upper bound of the ``stars:`` range
    :param min_forks: Minimum number of forks for a repository
    :param total_repos: Total number of repositories to collect
    :return: List of unique GitHub repository URLs (at most ``total_repos``),
             in the order they were first seen
    """
    import random

    # Nothing to collect: return before issuing any API request.
    if total_repos <= 0:
        return []

    # GitHub Search API base URL
    base_url = 'https://api.github.com/search/repositories'

    # GitHub token for higher rate limits
    headers = {
        'Accept': 'application/vnd.github.v3+json',
        'Authorization': f'token {GITHUB_TOKEN}'
    }

    # Seconds to wait for GitHub before giving up on a query; without a
    # timeout a stalled connection would block the notebook indefinitely.
    request_timeout = 30

    # A dict is used as an insertion-ordered set, so the returned order is
    # the order of discovery instead of arbitrary set iteration order.
    ai_ml_repos = {}

    # Shuffle a copy to get diverse results; the module-level list is left
    # untouched so repeated calls do not reorder shared state.
    shuffled_queries = random.sample(AI_ML_SEARCH_QUERIES, k=len(AI_ML_SEARCH_QUERIES))

    for query in shuffled_queries:
        # Construct search query
        search_query = f'{query} created:{created_date_start}..{created_date_end} stars:{min_stars}..{max_stars} forks:>={min_forks}'

        params = {
            'q': search_query,
            'sort': 'stars',
            'order': 'desc',
            'per_page': 100  # Max allowed by GitHub API
        }

        try:
            response = requests.get(
                base_url, headers=headers, params=params, timeout=request_timeout
            )
            response.raise_for_status()
            repos = response.json().get('items', [])
        except requests.RequestException as e:
            # Best effort: report the failed query and move on to the next one.
            print(f"Error fetching repositories for query {query}: {e}")
            continue

        # Extract repository URLs (duplicates across queries collapse on the key)
        for repo in repos:
            ai_ml_repos[repo['html_url']] = None

            # Stop as soon as enough repositories have been collected
            if len(ai_ml_repos) >= total_repos:
                break

        # Stop querying once the target has been reached
        if len(ai_ml_repos) >= total_repos:
            break

    # Convert to list and truncate if necessary
    return list(ai_ml_repos)[:total_repos]


In [None]:
# Collect the control repository URLs with the default search settings and
# persist them as a single-column ('url') CSV at URLS_PATH.
control_repos = generate_ai_ml_repo_list()
df = pd.DataFrame(control_repos, columns=['url'])
df.to_csv(URLS_PATH, index=False)

In [None]:
# Number of URLs saved; the collector returns at most `total_repos` entries.
df.shape[0]

1277