In [None]:
from typing import List, Dict, Any, Tuple, Union, Optional
# Modules
import requests
import json
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from bs4 import BeautifulSoup
import re
import xmlrpc.client as xc
from tqdm import tqdm
import pickle
import networkx as nx
from collections import defaultdict

In [None]:
# Set client to the PyPI XML-RPC server
client = xc.ServerProxy('http://pypi.python.org/pypi')

# Get a list of all the packages
pypi_packages = client.list_packages()

# lowercase all the package names
pypi_packages = [package.lower() for package in pypi_packages]

# Save the list of packages
with open("data/packages.pkl", "wb") as f:
    pickle.dump(pypi_packages, f)

In [None]:
def get_github_link(packages: list) -> list:
    """
    Function that takes a list of python packages and returns a list of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
    
    list_of_packages: list
        List of python packages to search for.
        
    return: list
        List of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
    """
    
    all_links = []
    for i, package in enumerate(packages):
        # The link to the python package
        LINK = f"https://pypi.org/project/{package}/"
        
        # Get the HTML content of the page
        r = requests.get(LINK)
        
        # If the request was not successful, alert the user
        if r.status_code != 200:
            print(f"Request failed for {package, i}: {r.status_code}")
            continue
        
        # Parse the HTML content of the page
        soup = BeautifulSoup(r.content)
        
        # Get sidebar with links
        sidebar = soup.find("div", {"class": "vertical-tabs__tabs"})
        
        # Get all the links in the sidebar
        references = [link.get("href") for link in sidebar.find_all("a")]
        
        # Join into one string to regex in
        reference_text = " ".join([reference for reference in references if reference is not None])
        
        # Find the first link that contains the word "github.com"
        github_links = []
        for link in re.finditer(r"github\.com(/\w*|/\W|[-]\w*|[-]\W*)*", reference_text):
            if link.group() != "github.com/" and link.group() != "github.com":
                github_links.append(link.group())
        
        # If there are no links, append None
        if len(github_links) == 0:
            github_link = None
        
        # If there's several take the shortest and alert the user
        elif len(github_links) > 1:
            print(f"Several GitHub links found for {package, i}: {github_links}")
            github_link = min(github_links, key=len)
        
        # If there is just one link, take that out of the list
        elif len(github_links) == 1:
            github_link = github_links[0]
        
        # Else alert the user no githublink is found
        else:
            print(f"No GitHub link found for {package, i}")
            github_link = None
        
        # Append the triplet to the list
        all_links.append((package, LINK, github_link))
    
    return all_links

In [None]:
# Load packages
with open("data/packages.pkl", "rb") as f:
    pypi_packages = pickle.load(f)

# Run the function with threadpool executor to speed up the process - still takes a loooong time so be aware
with ThreadPoolExecutor() as executor:
    all_links = list(tqdm(executor.map(get_github_link, [pypi_packages]), total=len(pypi_packages)))

# Save the list to a json file
with open("data/all_links_github.json", "w") as f:
    json.dump(all_links, f)

# Clean the list of None links
all_links_c = [(p, l, g) for p, l, g in all_links if g is not None]

with open("data/all_links_github_c.json", "w") as f:
    json.dump(all_links_c, f)

In [None]:
with open("data/packages.pkl", "rb") as f:
    pypi_packages = pickle.load(f)
print("Number of packages on pypi:", len(pypi_packages))

with open('data/all_links_github.json', 'r') as f:
    data = json.load(f)
print("Number of packages to successfully access the webpage:", len(data))

with open('data/all_links_github_c.json', 'r') as f:
    data_clean = json.load(f)
print("Number of packages to successfully get the github link from:", len(data_clean))

In [None]:
# For each package go to the GitHub page and get the readme.text if theres a README.md
def get_readme_text(github_link: str) -> list:
    """
    Function that takes a GitHub link and returns the text of the README.md file.
    
    github_link: str
        Link to the GitHub page.
        
    return: str
        Text of the README.md file.
    """
    # If there's no link, return None
    if github_link is None:
        return None
    
    github_link = github_link.replace("github.com", "https://raw.githubusercontent.com")
    
    try:
        response = requests.get(f"{github_link}/main/README.md")
        if response.status_code != 200:
            response = requests.get(f"{github_link}/master/README.md")
            if response.status_code != 200:
                response = requests.get(f"{github_link}/main/REAMDE.rst")
                if response.status_code != 200:
                    response = requests.get(f"{github_link}/master/REAMDE.rst")
                    if response.status_code != 200:
                        response = requests.get(f"{github_link}/main/README.txt")
                        if response.status_code != 200:
                            response = requests.get(f"{github_link}/master/README.txt")
                            if response.status_code != 200:
                                return None
                            
    except Exception as e:
        print(e)
        return None
    
    readme_text = response.text

    # Remove links which start with http
    readme_text = re.sub(r"http.*", "", readme_text)
    # Remove links to files in the repository which start with / or ./ or ../
    readme_text = re.sub(r"/.*|./.*|../.*", "", readme_text)
    # Convert /n to space
    readme_text = re.sub(r"\n", " ", readme_text)
    # Make all text lowercase
    readme_text = readme_text.lower()
    # Only keep Alphanumeric characters and - and _
    readme_text = re.sub(r"[^a-z0-9-_ ]", "", readme_text)
    # Remove multiple spaces
    readme_text = re.sub(r" +", " ", readme_text)
    # Remove empty strings
    readme_text = [line for line in readme_text.split(" ") if line != ""]

    return readme_text


def get_requirements_text(github_link: str) -> list:
    """
    Function that takes a GitHub link and returns the text of the requirements.txt file.
    
    github_link: str
        Link to the GitHub page.
        
    return: str
        Text of the requirements.txt file.
    """
    # If there's no link, return None
    if github_link is None:
        return None
    
    github_link = github_link.replace("github.com", "https://raw.githubusercontent.com")
    

    txt_bool = True
    pyproject_bool = False

    try:
        response = requests.get(f"{github_link}/main/requirements-dev.txt")
        if response.status_code != 200:
            response = requests.get(f"{github_link}/master/requirements-dev.txt")
            if response.status_code != 200:
                response = requests.get(f"{github_link}/main/dev-requirements.txt")
                if response.status_code != 200:
                    response = requests.get(f"{github_link}/master/dev-requirements.txt")
                    if response.status_code != 200:
                        txt_bool = False
                        response = requests.get(f"{github_link}/main/environment.yml")
                        if response.status_code != 200:
                            response = requests.get(f"{github_link}/master/environment.yml")
                            if response.status_code != 200:
                                pyproject_bool = True
                                response = requests.get(f"{github_link}/main/pyproject.toml")
                                if response.status_code != 200:
                                    response = requests.get(f"{github_link}/master/pyproject.toml")
                                    if response.status_code != 200:
                                        pyproject_bool = False
                                        txt_bool = True
                                        response = requests.get(f"{github_link}/main/requirements.txt")
                                        if response.status_code != 200:
                                            response = requests.get(f"{github_link}/master/requirements.txt")
                                            if response.status_code != 200:
                                                response = requests.get(f"{github_link}/main/requirements.txt")
                                                if response.status_code != 200:
                                                    return None

    except Exception as e:
        print(e)
        return None                                

    requirements_text = response.text

    # Clean the text using regex
    cleaning_reg = r"=.*|>.*|~.*|\[.*\]|;.*|<.*|!.*"

    if txt_bool:
        # Example: 
            # versioneer[toml]
            # cython~=3.0.5
            # meson[ninja]==1.2.1
            # meson-python==0.13.1
            # pytest>=7.3.2
            # pytest-cov
            # pytest-xdist>=2.2.0
            # pytest-qt>=4.2.0
        # We only want the package name and not the version or extras
        requirements_text = re.sub(r"\[.*\]", "", requirements_text)
        # Remove comments
        requirements_text = re.sub(r"#.*", "", requirements_text)
        # Clean the text using regex
        requirements_text = re.sub(f"{cleaning_reg}", "", requirements_text)
        # lower case
        requirements_text = requirements_text.lower()
        # Convert to list
        requirements_text = requirements_text.split("\n")
        # Remove trailing spaces
        requirements_text = [requirement.strip() for requirement in requirements_text]
        # Remove empty strings
        requirements_text = [requirement for requirement in requirements_text if requirement != ""]

    elif pyproject_bool:
        # Example:
            # [project]
            # name = "pydata-sphinx-theme"
            # description = "Bootstrap-based Sphinx theme from the PyData community"
            # readme = "README.md"
            # requires-python = ">=3.9"
            # dependencies = [
            # "Babel",
            # "pygments>=2.7",
            # "accessible-pygments",
            # "typing-extensions"
            # ]
            # [project.optional-dependencies]
            # doc = [
            # "numpydoc",
            # "linkify-it-py", # for link shortening
            # "rich",
            # # For examples section
            # "myst-parser"
            # ]

        # Remove comments
        requirements_text = re.sub(r"#.*", "", requirements_text)
        dependencies = re.findall(r'dependencies = \[\n(.*?)\n\]', requirements_text, re.DOTALL)
        optional_dependencies = re.findall(r'optional-dependencies\]\n.*? = \[\n(.*?)\n\]', requirements_text, re.DOTALL)
        if len(dependencies) == 0:
            return None
        if len(optional_dependencies) == 0:
            optional_dependencies = [""]
        
        dependencies = re.findall(r'".*"', dependencies[0])
        optional_dependencies = re.findall(r'".*"', optional_dependencies[0])
        requirements_text = dependencies + optional_dependencies
        # Remove double quotes
        requirements_text = [requirement[1:-1] for requirement in requirements_text]
        # Clean the text using regex
        requirements_text = [re.sub(f"{cleaning_reg}", "", requirement) for requirement in requirements_text]
        # lower case
        requirements_text = [requirement.lower() for requirement in requirements_text]
        # Remove trailing spaces
        requirements_text = [requirement.strip() for requirement in requirements_text]
        # Remove empty strings
        requirements_text = [requirement for requirement in requirements_text if requirement != ""]

    else:
        # Example:
            # name: myenv
            # channels:
            #   - defaults
            # dependencies:
            #   - numpy
            #   - pandas
            #   - pip
            #   - pip:
            #     - matplotlib
        
        # Remove comments
        requirements_text = re.sub(r"#.*", "", requirements_text)
        # Only get the dependencies which start with '- '
        requirements_text = re.findall(r"- .*", requirements_text)
        # Clean the text using regex
        requirements_text = [re.sub(f"{cleaning_reg}", "", requirement) for requirement in requirements_text]
        # lower case
        requirements_text = [requirement.lower() for requirement in requirements_text]
        # Convert to list
        requirements_text = [requirement[2:] for requirement in requirements_text]
        # Remove trailing spaces
        requirements_text = [requirement.strip() for requirement in requirements_text]
        # Remove empty strings
        requirements_text = [requirement for requirement in requirements_text if requirement != ""]

           
    return requirements_text


def node_creator(data: Tuple[str, str, str]) -> Dict[str, Any]:
    """
    Function that takes a list of tuples with the package name, the link to the PyPI page and the link to the GitHub page and returns a dictionary with the package name as the key and the value is a dictionary with the package name, the link to the PyPI page, the link to the GitHub page, the text of the README.md file and the text of the requirements.txt file.
    
    data: list
        List of tuples with the package name, the link to the PyPI page and the link to the GitHub page.
        
    return: dict
        Dictionary with the package name as the key and the value is a dictionary with the package name, the link to the PyPI page, the link to the GitHub page, the text of the README.md file and the text of the requirements.txt file.
    """
    node = {}
    
    package, link, github_link = data

    readme_text = get_readme_text(github_link)
    requirements_text = get_requirements_text(github_link)
    if requirements_text is None:
        return None
    node[package] = {"package": package, "link": link, "github_link": github_link, "readme_text": readme_text, "requirements_text": requirements_text}
    
    return node

# Test the function
test_data = ("numpy", "https://pypi.org/project/numpy/", "github.com/numpy/numpy")
test_node = node_creator(test_data)
print(test_node)

In [None]:
# Load clean data
with open('data/all_links_github_c.json', 'r') as f:
    data_clean = json.load(f)

# Run the function with threadpool executor to speed up the process
with ThreadPoolExecutor() as executor:
    nodes = list(tqdm(executor.map(node_creator, data_clean[110000:220000]), total=len(data_clean[110000:220000])))

# Save the list to a json file
with open("data/nodes_anton.json", "w") as f:
    json.dump(nodes, f)

In [78]:
#### THIS IS THE FINAL CLEANING OF THE DATA IF NEEDED ####
with open('data/nodes_anton_clean.json', 'r') as f:
    data = json.load(f)

cleaned_data = []

for node in data:
    # lower the keys
    node = {key.lower(): value for key, value in node.items()}
    for key, value in node.items():
        value["package"] = value["package"].lower()
        value["requirements_text"] = [requirement.strip() for requirement in value["requirements_text"]]
        value["requirements_text"] = [requirement for requirement in value["requirements_text"] if requirement != ""]
        value["requirements_text"] = [requirement.lower() for requirement in value["requirements_text"]]
        value["requirements_text"] = [re.sub(r"==.*|>=.*|<=.*|~=.*|!=.*|>.*|<.*", "", requirement) for requirement in value["requirements_text"]]

    cleaned_data.append(node)


with open("data/nodes_anton_clean.json", "w") as f:
    json.dump(cleaned_data, f)

In [80]:
# Now we make the edgelist 
# Load the data
with open('data/nodes_anton_clean.json', 'r') as f:
    nodes = json.load(f)

edge_list = []

packages_not_in_pypi = set()

for node in tqdm(nodes):
    if node is None:
        continue
    for package in node:
        if node[package]["requirements_text"] is None:
            continue
        for requirement in node[package]["requirements_text"]:
            if requirement not in pypi_packages:
                packages_not_in_pypi.add(requirement)
                continue
            edge_list.append((package, requirement))

print("Number of packages not in PyPI:", len(packages_not_in_pypi))

# Save the edge list
with open("data/edge_list_anton.pkl", "wb") as f:
    pickle.dump(edge_list, f)

edge_list[:10]

100%|██████████| 10524/10524 [03:58<00:00, 44.16it/s]

Number of packages not in PyPI: 1735





[('biotracks', 'datapackage'),
 ('biotracks', 'jsontableschema'),
 ('biotracks', 'jsontableschema-pandas'),
 ('windio', 'defaults'),
 ('windio', 'jsonschema'),
 ('windio', 'numpy'),
 ('windio', 'pyyaml'),
 ('windio', 'pytest'),
 ('windio', 'xarray'),
 ('python-etcd-lock', 'nose')]

In [87]:
# Convert the list of dictionaries to a dictionary
nodes_dict = {}
for node in nodes:
    if node is None:
        continue
    for key, value in node.items():
        nodes_dict[key] = value

nodes_dict['windio']

{'package': 'windio',
 'link': 'https://pypi.org/project/windIO/',
 'github_link': 'github.com/IEAWindTask37/windIO',
 'readme_text': ['build',
  'status',
  'documentation',
  'status',
  'windio',
  'frameworks',
  'defining',
  'the',
  'inputs',
  'and',
  'outputs',
  'for',
  'systems',
  'engineering',
  'mdao',
  'of',
  'wind',
  'turbine',
  'and',
  'plants',
  'the',
  'framework',
  'was',
  'developed',
  'by',
  'the',
  'iea',
  'wind',
  'task',
  '37',
  'team',
  'within',
  'work',
  'package',
  '1',
  'author',
  'iea',
  'wind',
  'task',
  '37',
  'teammailtopietrobortolottinrelgov',
  'version',
  'this',
  'software',
  'is',
  'a',
  'version',
  '10',
  'documentation',
  'and',
  'citation',
  'the',
  'online',
  'documentation',
  'can',
  'be',
  'accessed',
  'here',
  'if',
  'you',
  'use',
  'this',
  'model',
  'in',
  'your',
  'research',
  'or',
  'publications',
  'please',
  'cite',
  'this',
  'iea',
  'technical',
  'report',
  'articleosti_1

In [83]:
# Load the edge list
with open("data/edge_list_anton.pkl", "rb") as f:
    edge_list = pickle.load(f)
    
G = nx.Graph()
G.add_edges_from(edge_list)
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Number of connected components:", nx.number_connected_components(G))

Number of nodes: 16146
Number of edges: 67881
Number of connected components: 172


In [84]:
# Get the largest connected component
largest_cc = max(nx.connected_components(G), key=len)
G = G.subgraph(largest_cc).copy()
print("Number of nodes in the largest connected component:", G.number_of_nodes())

Number of nodes in the largest connected component: 15750
