In [1]:
# Modules
import requests
import json
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import re
import xmlrpc.client as xc
from tqdm import tqdm
import pickle

In [5]:
# Connect to the PyPI XML-RPC server through its HTTPS endpoint on pypi.org.
# xmlrpc.client does not follow HTTP redirects, so the canonical URL is used directly
# instead of the legacy plain-HTTP pypi.python.org host.
# NOTE(review): PyPI discourages heavy XML-RPC use and rate-limits it - confirm that
# list_packages() is still served before relying on this cell.
with xc.ServerProxy('https://pypi.org/pypi') as client:
    # Get a list of all the packages
    pypi_packages = client.list_packages()

# Save the list of packages so later cells do not have to hit the server again
with open("data/packages.pkl", "wb") as f:
    pickle.dump(pypi_packages, f)

In [2]:
def get_github_link(packages: list) -> list:
    """
    Function that takes a list of python packages and returns a list of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
    
    packages: list
        List of python packages to search for.
        
    return: list
        List of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
        The GitHub entry has the form "github.com/..." (no scheme) and is None when the page
        shows no GitHub link. Packages whose page could not be fetched (network error or a
        non-200 status) are reported with print and left out of the list.
    """
    # "github.com" followed by any number of "/segment" or "-segment" pieces.
    # The dot is escaped so that look-alike hosts such as "githubxcom" are not matched.
    # NOTE(review): \w does not match ".", so a repository name containing a dot is cut
    # at the dot - confirm whether downstream code needs the full name.
    github_pattern = re.compile(r"github\.com(?:[/-]\w*)*")
    
    all_links = []
    for i, package in enumerate(packages):
        # The link to the python package
        LINK = f"https://pypi.org/project/{package}/"
        
        # Get the HTML content of the page; a timeout and the except clause keep one
        # unreachable page from hanging or aborting the whole run
        try:
            r = requests.get(LINK, timeout=30)
        except requests.RequestException as error:
            print(f"Request failed for {package, i}: {error}")
            continue
        
        # If the request was not successful, alert the user
        if r.status_code != 200:
            print(f"Request failed for {package, i}: {r.status_code}")
            continue
        
        # Parse the HTML content of the page (explicit parser: same result on every machine)
        soup = BeautifulSoup(r.content, "html.parser")
        
        # Get sidebar with links; find() returns None when the page has no such element
        sidebar = soup.find("div", {"class": "vertical-tabs__tabs"})
        if sidebar is None:
            references = []
        else:
            references = [link.get("href") for link in sidebar.find_all("a")]
        
        # Join into one string to regex in (anchors without href are skipped)
        reference_text = " ".join(reference for reference in references if reference is not None)
        
        # Collect every GitHub link that points further than the bare domain
        github_links = [
            match.group()
            for match in github_pattern.finditer(reference_text)
            if match.group() not in ("github.com/", "github.com")
        ]
        
        # If there are no links, alert the user and use None
        if not github_links:
            print(f"No GitHub link found for {package, i}")
            github_link = None
        
        # If there's several take the shortest and alert the user
        elif len(github_links) > 1:
            print(f"Several GitHub links found for {package, i}: {github_links}")
            github_link = min(github_links, key=len)
        
        # If there is just one link, take that out of the list
        else:
            github_link = github_links[0]
        
        # Append the triplet to the list
        all_links.append((package, LINK, github_link))
    
    return all_links

In [None]:
# Load packages
# NOTE: pickle.load can execute arbitrary code - only load a packages.pkl written by this notebook
with open("data/packages.pkl", "rb") as f:
    pypi_packages = pickle.load(f)


def _links_for_package(package: str) -> list:
    """Scrape a single package; returns the (possibly empty) list produced by get_github_link."""
    return get_github_link([package])


# Run the function with threadpool executor to speed up the process - still takes a loooong time so be aware.
# The executor maps over the individual packages: passing [pypi_packages] would submit the whole
# list as one task, which runs on a single thread and returns a nested list.
# (The index shown in get_github_link's messages is therefore always 0.)
with ThreadPoolExecutor() as executor:
    links_per_package = list(tqdm(executor.map(_links_for_package, pypi_packages), total=len(pypi_packages)))

# Flatten the per-package results into one list of (package, pypi link, github link) triplets
all_links = [triplet for links in links_per_package for triplet in links]

# Save the list to a json file
with open("data/all_links_github.json", "w") as f:
    json.dump(all_links, f)

# Clean the list of None links
all_links_c = [(p, l, g) for p, l, g in all_links if g is not None]

with open("data/all_links_github_c.json", "w") as f:
    json.dump(all_links_c, f)

In [27]:
# How many package names were listed on PyPI
print("Number of packages on pypi:", len(pypi_packages))

# Every saved triplet, including those whose GitHub link is null
with open('data/all_links_github.json', 'r') as f:
    data = json.loads(f.read())
print("Number of packages to successfully access the webpage:", len(data))

# Only the triplets that kept a GitHub link after cleaning
with open('data/all_links_github_c.json', 'r') as f:
    data_clean = json.loads(f.read())
print("Number of packages to successfully get the github link from:", len(data_clean))

Number of packages on pypi: 528577
Number of packages to successfully access the webpage: 512780
Number of packages to successfully get the github link from: 344841


In [None]:
# For each package, go to the GitHub page and get the README text if there's a README.md
def get_readme_text(github_link: str):
    """
    Intended to take a GitHub link and return the text of the README.md file.
    
    NOTE(review): the visible body is an unfinished stub - nothing is fetched and
    None is returned for every input.
    
    github_link: str
        Link to the GitHub page, or None when no link is known.
        
    return: str
        Intended: text of the README.md file. As written: always None.
    """
    # If there's no link, return None
    if github_link is None:
        return None
    
    # Placeholder for the link to the raw README.md file; 404 is a dummy value that is never used
    raw_link = 404
    return