In [1]:
import urllib
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
import urllib.request


TOP_PACKEGES_URL = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"

def get_top_packages():
    response = urllib.request.urlopen(TOP_PACKEGES_URL)

    data = response.read()

    packages = json.loads(data)
    metadata = packages['last_update']
    rows = packages['rows']

    data = pd.DataFrame(rows)

    print(f"Data from {metadata}")

    return data


packages = get_top_packages()

packages.head()

Data from 2024-10-01 11:28:00


Unnamed: 0,download_count,project
0,1295883942,boto3
1,615425351,urllib3
2,518765013,botocore
3,515350549,requests
4,505945583,setuptools


In [3]:
packages.size

16000

In [4]:
import urllib.request
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup as bs
import asyncio
import os

base_pypl_url = "https://pypi.org/project/{}"

def get_repo_info(repo_name):
    url = base_pypl_url.format(repo_name)

    try:
        response = urllib.request.urlopen(url)
        html = response.read()

        bsObj = bs(html, features="html.parser")
        
        github = bsObj.find('a', href=lambda href: href and "github.com" in href)
        if github is not None:
            github_url = github['href']
        else:
            github_url = None

        if isinstance(github_url, list):
            return github_url[0]

        return github_url
    except Exception as e:
        print(f"Error fetching {repo_name}: {e}")
        return None

def strip(url: str):
    if not url:
        return None
    url = url.removeprefix("https://github.com/")
    
    # Remove anything after the first two slashes
    url_parts = url.split("/")
    if len(url_parts) > 2:
        url = "/".join(url_parts[:2])
    else:
        url = "/".join(url_parts)
    
    return url

async def fetch_all_repos(repo_names):
    loop = asyncio.get_event_loop()
    # Run the get_repo_info function in a thread pool
    tasks = [loop.run_in_executor(None, get_repo_info, repo_name) for repo_name in repo_names]
    repos = await asyncio.gather(*tasks)
    return [strip(repo) for repo in repos if repo is not None]

# Setup
SLICE_START = int(os.getenv("SLICE_START", 0))
SLICE_END = int(os.getenv("SLICE_END", 10))

# Assume you have a `packages` DataFrame with 'project' column
repo_names = packages['project'][SLICE_START:SLICE_END]

# Run the tasks asynchronously
repos = await fetch_all_repos(repo_names)

print(f"Found {len(repos)} repos")

Found 99 repos


In [5]:
repos

['msgpack/msgpack-python',
 'python/mypy_extensions',
 'Textualize/rich',
 'pexpect/pexpect',
 'python/importlib_resources',
 'quantopian/zipline',
 'pexpect/ptyprocess',
 'chardet/chardet',
 'grpc/grpc',
 'cloudpipe/cloudpickle',
 'jd/tenacity',
 'uqfoundation/dill',
 'aio-libs/aiohappyeyeballs',
 'encode/httpx',
 'sponsors/Julian',
 'python-poetry/poetry-core',
 'sponsors/Julian',
 'pallets/flask',
 'matplotlib/matplotlib',
 'googleapis/python-cloud-core',
 'theskumar/python-dotenv',
 'AzureAD/microsoft-authentication-library-for-python',
 'encode/httpcore',
 'python-jsonschema/jsonschema-specifications',
 'psycopg/psycopg2',
 'executablebooks/markdown-it-py',
 'jaraco/keyring',
 'pyca/bcrypt',
 'googleapis/google-resumable-media-python',
 'python-poetry/poetry-plugin-export',
 'executablebooks/mdurl',
 'scikit-learn/scikit-learn',
 'pypi/warehouse',
 'cpburnz/python-pathspec',
 'snowflakedb/snowflake-connector-python',
 'paramiko/paramiko',
 'astanin/python-tabulate',
 'gitpython-de

In [6]:
TOKEN = os.getenv("GITHUB_TOKEN")

def get_auth_headers():
    return {
        "Authorization": f"token {TOKEN}"
    }

In [7]:
# get repo info
base_github_url = "https://api.github.com/repos/{}"

# fetch repo info

def get_repo_info(repo_name):
    try:
        url = base_github_url.format(repo_name)

        request = urllib.request.Request(url, headers=get_auth_headers())

        response = urllib.request.urlopen(request)

        data = response.read()

        repo_info = json.loads(data)

        return repo_info
    except:
        return None

repo_info = [get_repo_info(repo) for repo in tqdm(repos) if repo is not None]

  0%|          | 0/99 [00:00<?, ?it/s]

In [8]:
repo_info = [ri for ri in repo_info if ri is not None]
repo_info = pd.DataFrame(repo_info)

In [9]:
repo_info.head()

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,forks,open_issues,watchers,default_branch,permissions,temp_clone_token,custom_properties,organization,network_count,subscribers_count
0,2242705,MDEwOlJlcG9zaXRvcnkyMjQyNzA1,msgpack-python,msgpack/msgpack-python,False,"{'login': 'msgpack', 'id': 198264, 'node_id': ...",https://github.com/msgpack/msgpack-python,MessagePack serializer implementation for Pyth...,False,https://api.github.com/repos/msgpack/msgpack-p...,...,230,7,1916,main,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'msgpack', 'id': 198264, 'node_id': ...",230,47
1,165752308,MDEwOlJlcG9zaXRvcnkxNjU3NTIzMDg=,mypy_extensions,python/mypy_extensions,False,"{'login': 'python', 'id': 1525981, 'node_id': ...",https://github.com/python/mypy_extensions,Extensions for mypy,False,https://api.github.com/repos/python/mypy_exten...,...,32,6,134,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'python', 'id': 1525981, 'node_id': ...",32,16
2,220809393,MDEwOlJlcG9zaXRvcnkyMjA4MDkzOTM=,rich,Textualize/rich,False,"{'login': 'Textualize', 'id': 93378883, 'node_...",https://github.com/Textualize/rich,Rich is a Python library for rich text and bea...,False,https://api.github.com/repos/Textualize/rich,...,1718,194,49296,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'Textualize', 'id': 93378883, 'node_...",1718,537
3,12902206,MDEwOlJlcG9zaXRvcnkxMjkwMjIwNg==,pexpect,pexpect/pexpect,False,"{'login': 'pexpect', 'id': 5480175, 'node_id':...",https://github.com/pexpect/pexpect,A Python module for controlling interactive pr...,False,https://api.github.com/repos/pexpect/pexpect,...,477,161,2607,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'pexpect', 'id': 5480175, 'node_id':...",477,91
4,306147182,MDEwOlJlcG9zaXRvcnkzMDYxNDcxODI=,importlib_resources,python/importlib_resources,False,"{'login': 'python', 'id': 1525981, 'node_id': ...",https://github.com/python/importlib_resources,Backport of the importlib.resources module,False,https://api.github.com/repos/python/importlib_...,...,44,4,64,main,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'python', 'id': 1525981, 'node_id': ...",44,12


In [10]:

def get_branch_info(repo) -> dict | None:
    url = repo['branches_url'].replace("{/branch}", "")
    default_branch = repo['default_branch']

    PREFERED_BRANCHES = ["master", "main", "develop"]
    request = urllib.request.Request(url, headers={"Authorization": f"token {TOKEN}"})
    response = urllib.request.urlopen(request)

    data = response.read()

    branches = json.loads(data)

    branches = [branch for branch in branches if branch['name'] in PREFERED_BRANCHES]

    if len(branches) == 0:
        branches = [branch for branch in branches if branch['name'] == default_branch]

    if len(branches) == 0:
        return None

    return branches[0]

branches = [get_branch_info(repo) for repo in tqdm(repo_info.to_dict(orient="records")) if repo is not None]


  0%|          | 0/95 [00:00<?, ?it/s]

In [11]:
branch_name = [(branch['name'] if branch is not None else None) for branch in branches]
branch_url = [(branch['commit']['url'] if branch is not None else None) for branch in branches]


repo_info['branch_name'] = branch_name
repo_info['branch_url'] = branch_url

In [12]:
# save as csv
repo_info.to_csv(f"repo_info-{SLICE_START}-{SLICE_END}.csv")
repo_info.head()

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,watchers,default_branch,permissions,temp_clone_token,custom_properties,organization,network_count,subscribers_count,branch_name,branch_url
0,2242705,MDEwOlJlcG9zaXRvcnkyMjQyNzA1,msgpack-python,msgpack/msgpack-python,False,"{'login': 'msgpack', 'id': 198264, 'node_id': ...",https://github.com/msgpack/msgpack-python,MessagePack serializer implementation for Pyth...,False,https://api.github.com/repos/msgpack/msgpack-p...,...,1916,main,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'msgpack', 'id': 198264, 'node_id': ...",230,47,main,https://api.github.com/repos/msgpack/msgpack-p...
1,165752308,MDEwOlJlcG9zaXRvcnkxNjU3NTIzMDg=,mypy_extensions,python/mypy_extensions,False,"{'login': 'python', 'id': 1525981, 'node_id': ...",https://github.com/python/mypy_extensions,Extensions for mypy,False,https://api.github.com/repos/python/mypy_exten...,...,134,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'python', 'id': 1525981, 'node_id': ...",32,16,master,https://api.github.com/repos/python/mypy_exten...
2,220809393,MDEwOlJlcG9zaXRvcnkyMjA4MDkzOTM=,rich,Textualize/rich,False,"{'login': 'Textualize', 'id': 93378883, 'node_...",https://github.com/Textualize/rich,Rich is a Python library for rich text and bea...,False,https://api.github.com/repos/Textualize/rich,...,49296,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'Textualize', 'id': 93378883, 'node_...",1718,537,master,https://api.github.com/repos/Textualize/rich/c...
3,12902206,MDEwOlJlcG9zaXRvcnkxMjkwMjIwNg==,pexpect,pexpect/pexpect,False,"{'login': 'pexpect', 'id': 5480175, 'node_id':...",https://github.com/pexpect/pexpect,A Python module for controlling interactive pr...,False,https://api.github.com/repos/pexpect/pexpect,...,2607,master,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'pexpect', 'id': 5480175, 'node_id':...",477,91,master,https://api.github.com/repos/pexpect/pexpect/c...
4,306147182,MDEwOlJlcG9zaXRvcnkzMDYxNDcxODI=,importlib_resources,python/importlib_resources,False,"{'login': 'python', 'id': 1525981, 'node_id': ...",https://github.com/python/importlib_resources,Backport of the importlib.resources module,False,https://api.github.com/repos/python/importlib_...,...,64,main,"{'admin': False, 'maintain': False, 'push': Fa...",,{},"{'login': 'python', 'id': 1525981, 'node_id': ...",44,12,main,https://api.github.com/repos/python/importlib_...
