In [1]:
import urllib
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
import urllib.request


TOP_PACKEGES_URL = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"

def get_top_packages():
    response = urllib.request.urlopen(TOP_PACKEGES_URL)

    data = response.read()

    packages = json.loads(data)
    metadata = packages['last_update']
    rows = packages['rows']

    data = pd.DataFrame(rows)

    print(f"Data from {metadata}")

    return data


packages = get_top_packages()

packages.head()

Data from 2024-10-01 11:28:00


Unnamed: 0,download_count,project
0,1295883942,boto3
1,615425351,urllib3
2,518765013,botocore
3,515350549,requests
4,505945583,setuptools


In [3]:
packages.size

16000

In [11]:
import urllib.request
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup as bs
import asyncio
import os

base_pypl_url = "https://pypi.org/project/{}"

def get_repo_info(repo_name):
    url = base_pypl_url.format(repo_name)

    try:
        response = urllib.request.urlopen(url)
        html = response.read()

        bsObj = bs(html, features="html.parser")
        
        github = bsObj.find('a', href=lambda href: href and "github.com" in href)
        if github is not None:
            github_url = github['href']
        else:
            github_url = None

        if isinstance(github_url, list):
            return github_url[0]

        return github_url
    except Exception as e:
        print(f"Error fetching {repo_name}: {e}")
        return None

def strip(url: str):
    if not url:
        return None
    url = url.removeprefix("https://github.com/")
    
    # Remove anything after the first two slashes
    url_parts = url.split("/")
    if len(url_parts) > 2:
        url = "/".join(url_parts[:2])
    else:
        url = "/".join(url_parts)
    
    return url

async def fetch_all_repos(repo_names):
    loop = asyncio.get_event_loop()
    # Run the get_repo_info function in a thread pool
    tasks = [loop.run_in_executor(None, get_repo_info, repo_name) for repo_name in repo_names]
    repos = await asyncio.gather(*tasks)
    return [strip(repo) for repo in repos if repo is not None]

# Setup
SLICE_START = int(os.getenv("SLICE_START", 0))
SLICE_END = int(os.getenv("SLICE_END", 10))

# Assume you have a `packages` DataFrame with 'project' column
repo_names = packages['project'][SLICE_START:SLICE_END]

# Run the tasks asynchronously
repos = await fetch_all_repos(repo_names)

print(f"Found {len(repos)} repos")

Found 50 repos


In [12]:
repos

['boto/boto3',
 'urllib3/urllib3',
 'boto/botocore',
 'psf/requests',
 'pypa/setuptools',
 'certifi/python-certifi',
 'kjd/idna',
 'Ousret/charset_normalizer',
 'python/typing_extensions',
 'dateutil/dateutil',
 'boto/s3transfer',
 'pypa/packaging',
 'pypi/warehouse',
 'aio-libs/aiobotocore',
 'benjaminp/six',
 'yaml/pyyaml',
 'numpy/numpy',
 'http:/',
 'fsspec/filesystem_spec',
 'pyca/cryptography',
 'python/importlib_metadata',
 'pypa/pip',
 'python-cffi/cffi',
 'pandas-dev/pandas',
 'jaraco/zipp',
 'pydantic/pydantic',
 'googleapis/python-api-core',
 'eliben/pycparser',
 'pypa/wheel',
 'jmespath/jmespath.py',
 'sponsors/hynek',
 'pypi/warehouse',
 'sybrenstuvel/python-rsa',
 'pallets/click',
 'pyasn1/pyasn1',
 'aws/aws-cli',
 'tox-dev/platformdirs',
 'stub42/pytz',
 'tartley/colorama',
 'pallets/jinja',
 'pallets/markupsafe',
 'jpadilla/pyjwt',
 'googleapis/python-api-common-protos',
 'hukkin/tomli',
 'tox-dev/py-filelock',
 'pydantic/pydantic-core',
 'tkem/cachetools',
 'GrahamDump

In [13]:
TOKEN = os.getenv("GITHUB_TOKEN")

def get_auth_headers():
    return {
        "Authorization": f"token {TOKEN}"
    }

In [None]:
# get repo info
base_github_url = "https://api.github.com/repos/{}"

# fetch repo info

def get_repo_info(repo_name):
    try:
        url = base_github_url.format(repo_name)

        request = urllib.request.Request(url, headers=get_auth_headers())

        response = urllib.request.urlopen(request)

        data = response.read()

        repo_info = json.loads(data)

        return repo_info
    except:
        return None

repo_info = [get_repo_info(repo) for repo in tqdm(repos) if repo is not None]

In [21]:
repo_info = [ri for ri in repo_info if ri is not None]
repo_info = pd.DataFrame(repo_info)

In [None]:
repo_info.head()

In [22]:

def get_branch_info(repo) -> dict | None:
    url = repo['branches_url'].replace("{/branch}", "")
    default_branch = repo['default_branch']

    PREFERED_BRANCHES = ["master", "main", "develop"]
    request = urllib.request.Request(url, headers={"Authorization": f"token {TOKEN}"})
    response = urllib.request.urlopen(request)

    data = response.read()

    branches = json.loads(data)

    branches = [branch for branch in branches if branch['name'] in PREFERED_BRANCHES]

    if len(branches) == 0:
        branches = [branch for branch in branches if branch['name'] == default_branch]

    if len(branches) == 0:
        return None

    return branches[0]

branches = [get_branch_info(repo) for repo in tqdm(repo_info.to_dict(orient="records")) if repo is not None]


  0%|          | 0/48 [00:00<?, ?it/s]

In [23]:
branch_name = [(branch['name'] if branch is not None else None) for branch in branches]
branch_url = [(branch['commit']['url'] if branch is not None else None) for branch in branches]


repo_info['branch_name'] = branch_name
repo_info['branch_url'] = branch_url

In [24]:
# save as csv
repo_info.to_csv(f"repo_info-{SLICE_START}-{SLICE_END}.csv")
repo_info.head()

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,permissions,temp_clone_token,custom_properties,organization,network_count,subscribers_count,parent,source,branch_name,branch_url
0,24774658,MDEwOlJlcG9zaXRvcnkyNDc3NDY1OA==,boto3,boto/boto3,False,"{'login': 'boto', 'id': 327752, 'node_id': 'MD...",https://github.com/boto/boto3,AWS SDK for Python,False,https://api.github.com/repos/boto/boto3,...,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'boto', 'id': 327752, 'node_id': 'MD...",1872,233,,,develop,https://api.github.com/repos/boto/boto3/commit...
1,2410676,MDEwOlJlcG9zaXRvcnkyNDEwNjc2,urllib3,urllib3/urllib3,False,"{'login': 'urllib3', 'id': 26825299, 'node_id'...",https://github.com/urllib3/urllib3,urllib3 is a user-friendly HTTP client library...,False,https://api.github.com/repos/urllib3/urllib3,...,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'urllib3', 'id': 26825299, 'node_id'...",1145,101,,,main,https://api.github.com/repos/urllib3/urllib3/c...
2,6670942,MDEwOlJlcG9zaXRvcnk2NjcwOTQy,botocore,boto/botocore,False,"{'login': 'boto', 'id': 327752, 'node_id': 'MD...",https://github.com/boto/botocore,"The low-level, core functionality of boto3 and...",False,https://api.github.com/repos/boto/botocore,...,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'boto', 'id': 327752, 'node_id': 'MD...",1089,69,,,develop,https://api.github.com/repos/boto/botocore/com...
3,1362490,MDEwOlJlcG9zaXRvcnkxMzYyNDkw,requests,psf/requests,False,"{'login': 'psf', 'id': 50630501, 'node_id': 'M...",https://github.com/psf/requests,"A simple, yet elegant, HTTP library.",False,https://api.github.com/repos/psf/requests,...,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'psf', 'id': 50630501, 'node_id': 'M...",9316,1327,,,main,https://api.github.com/repos/psf/requests/comm...
4,54980593,MDEwOlJlcG9zaXRvcnk1NDk4MDU5Mw==,setuptools,pypa/setuptools,False,"{'login': 'pypa', 'id': 647025, 'node_id': 'MD...",https://github.com/pypa/setuptools,Official project repository for the Setuptools...,False,https://api.github.com/repos/pypa/setuptools,...,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'pypa', 'id': 647025, 'node_id': 'MD...",1184,99,,,,
