In [3]:
# Modules
import requests
import json
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import re
import xmlrpc.client as xc
from tqdm import tqdm

In [4]:
# Set client to the PyPI XML-RPC server
client = xc.ServerProxy('http://pypi.python.org/pypi')

# Get a list of all the packages
packages = client.list_packages()

In [None]:
def get_github_link(list_of_packages: list) -> list:
    """
    Function that takes a list of python packages and returns a list of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
    
    list_of_packages: list
        List of python packages to search for.
        
    return: list
        List of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
    """
    all_links_list = []
    for i, package in enumerate(packages):
        # The link to the python package
        LINK = f"https://pypi.org/project/{package}/"
        
        # Get the HTML content of the page
        r = requests.get(LINK)
        
        # If the request was not successful, alert the user
        if r.status_code != 200:
            print(f"Request failed for {package, i}: {r.status_code}")
            continue
        
        # Parse the HTML content of the page
        soup = BeautifulSoup(r.content)
        
        # Get sidebar with links
        sidebar = soup.find("div", {"class": "vertical-tabs__tabs"})
        
        # Get all the links in the sidebar
        references = [link.get("href") for link in sidebar.find_all("a")]
        
        # Join into one string to regex in
        reference_text = " ".join([reference for reference in references if reference is not None])
        
        # Find the first link that contains the word "github.com"
        github_links = []
        for link in re.finditer(r"github.com(/\w*|/\W|[-]\w*|[-]\W*)*", reference_text):
            if link.group() != "github.com/" and link.group() != "github.com":
                github_links.append(link.group())
        
        # If there are no links, append None
        if len(github_links) == 0:
            github_link = None
        
        # If there's several take the shortest and alert the user
        elif len(github_links) > 1:
            print(f"Several GitHub links found for {package, i}: {github_links}")
            github_link = min(github_links, key=len)
        
        elif len(github_links) == 1:
            github_link = github_links[0]
        
        else:
            print(f"No GitHub link found for {package, i}")
            github_link = None
            
        all_links_list.append((package, LINK, github_link))
    
    return all_links_list

In [None]:
# Run the function with threadpool executor to speed up the process
with ThreadPoolExecutor() as executor:
    all_links = list(tqdm(executor.map(get_github_link, [packages]), total=len(packages)))
    
# Flatten the list
all_links = [item for sublist in all_links for item in sublist]

# Save the list to a json file
with open("all_links_github.json", "w") as f:
    json.dump(all_links, f)


In [32]:
# For each package go to the GitHub page and get the readme.text if theres a README.md

def get_readme_text(github_link: str):
    """
    Function that takes a GitHub link and returns the text of the README.md file.
    
    github_link: str
        Link to the GitHub page.
        
    return: str
        Text of the README.md file.
    """
    # If there's no link, return None
    if github_link is None:
        return None
    
    github_link = github_link.replace("https://github.com", "https://raw.githubusercontent.com")
    
    response = requests.get(f"{github_link}/main/README.md")

    if response.status_code != 200:
        response = requests.get(f"{github_link}/master/README.md")
        if response.status_code != 200:
            return None
    
    readme_text = response.text

    return readme_text


def get_requirements_text(github_link: str) -> list:
    """
    Function that takes a GitHub link and returns the text of the requirements.txt file.
    
    github_link: str
        Link to the GitHub page.
        
    return: str
        Text of the requirements.txt file.
    """
    # If there's no link, return None
    if github_link is None:
        return None
    
    github_link = github_link.replace("https://github.com", "https://raw.githubusercontent.com")
    
    response = requests.get(f"{github_link}/main/requirements.txt")

    txt_bool = False

    # if response.status_code != 200:
    #     response = requests.get(f"{github_link}/master/requirements.txt")
    #     if response.status_code != 200:
    #         response = requests.get(f"{github_link}/main/requirements-dev.txt")
    #         if response.status_code != 200:
    #             response = requests.get(f"{github_link}/master/requirements-dev.txt")
    #             if response.status_code != 200:
    #                 txt_bool = False
    response = requests.get(f"{github_link}/main/environment.yml")
    if response.status_code != 200:
        response = requests.get(f"{github_link}/master/environment.yml")
        if response.status_code != 200:
            return None
    
    requirements_text = response.text

    # Clean the text using regex
    if txt_bool:
        # Example: 
            # versioneer[toml]
            # cython~=3.0.5
            # meson[ninja]==1.2.1
            # meson-python==0.13.1
            # pytest>=7.3.2
            # pytest-cov
            # pytest-xdist>=2.2.0
            # pytest-qt>=4.2.0
        # We only want the package name and not the version or extras
        requirements_text = re.sub(r"==.*|>=.*|~=.*|\[.*\]|;.*|<.*", "", requirements_text)
        # Remove comments
        requirements_text = re.sub(r"#.*", "", requirements_text)
        # Convert to list
        requirements_text = requirements_text.split("\n")
        # Remove empty strings
        requirements_text = [requirement for requirement in requirements_text if requirement != ""]
    else:
        # Example:
            # name: myenv
            # channels:
            #   - defaults
            # dependencies:
            #   - numpy
            #   - pandas
            #   - pip
            #   - pip:
            #     - matplotlib
        
        # Remove comments
        requirements_text = re.sub(r"#.*", "", requirements_text)
        # Only get the dependencies which start with '- '
        requirements_text = re.findall(r"- .*", requirements_text)
        # Clean the text using regex
        requirements_text = [re.sub(r"=.*|>=.*|~=.*|\[.*\]|;.*|<.*", "", requirement) for requirement in requirements_text]
        # Convert to list
        requirements_text = [requirement[2:] for requirement in requirements_text]
        # Remove empty strings
        requirements_text = [requirement for requirement in requirements_text if requirement != ""]
           

    return requirements_text

text = get_readme_text("https://github.com/pandas-dev/pandas")
get_req_text = get_requirements_text("https://github.com/pandas-dev/pandas")

print(get_req_text)


['conda-forge', 'python', 'pip', 'versioneer', 'cython', 'meson', 'meson-python', 'pytest', 'pytest-cov', 'pytest-xdist', 'pytest-qt', 'pytest-localserver', 'pyqt', 'coverage', 'python-dateutil', 'numpy', 'pytz', 'beautifulsoup4', 'blosc', 'bottleneck', 'fastparquet', 'fsspec', 'html5lib', 'hypothesis', 'gcsfs', 'ipython', 'jinja2', 'lxml', 'matplotlib', 'numba', 'numexpr', 'openpyxl', 'odfpy', 'py', 'psycopg2', 'pyarrow', 'pymysql', 'pyreadstat', 'pytables', 'python-calamine', 'pyxlsb', 's3fs', 'scipy', 'sqlalchemy', 'tabulate', 'xarray', 'xlrd', 'xlsxwriter', 'zstandard', 'dask-core', 'seaborn-base', 'moto', 'flask', 'asv', 'c-compiler', 'cxx-compiler', 'flake8', 'mypy', 'tokenize-rt  ', 'pre-commit', 'gitpython  ', 'gitdb', 'google-auth', 'natsort  ', 'numpydoc', 'pydata-sphinx-theme', 'pytest-cython  ', 'sphinx', 'sphinx-design', 'sphinx-copybutton', 'types-python-dateutil', 'types-PyMySQL', 'types-pytz', 'types-PyYAML', 'types-setuptools', 'nbconvert', 'nbsphinx', 'pandoc', 'ipywi