Fetch Website Content

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install --upgrade requests



In [2]:
import requests
import time
import random

In [3]:
# Base address of the site; the search URLs below are built by appending to it.
web_url = 'https://libraries.io'

In [4]:
# Always bound the request: without a timeout requests may wait forever on a stalled server.
response = requests.get(web_url, timeout=30)

In [5]:
response.status_code

200

In [6]:
len(response.text)

21707

In [9]:
page_contents = response.text

Parse Data

In [7]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install --upgrade --quiet beautifulsoup4

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
from bs4 import BeautifulSoup

In [10]:
doc = BeautifulSoup(page_contents, 'html.parser')

In [15]:
pkg_manager_tags = doc.find_all('div', { 'class': 'blurb'})

In [37]:
# Build one ranked-search URL per package-manager blurb, using the text of the
# blurb's first link as the `platforms` query value.
pkg_managers_url = [
  web_url + "/search?order=desc&platforms=" + blurb.find('a').text + "&sort=rank"
  for blurb in pkg_manager_tags
]
pkg_managers_url

'https://libraries.io/search?order=desc&platforms=npm&sort=rank'

Method 1: Package Manager Popular Library Page

In [62]:
# Names (40 in total) that the Method 1 scan compares project titles against.
required_pkgs = [
    'wxpython', 'tensorflow', 'tfds', 'tensorboard', 'pytorch',
    'numpy', 'scipy', 'scikit-learn', 'pandas', 'keras',
    'flask', 'tensorflow-transform', 'zappa', 'subprocess', 'sqlalchemy',
    'psycopg', 'flask-sqlalchemy', 'airflow', 'fftw', 'tensorflow-core-platform',
    'cuda', 'cudnn', 'python', 'ubuntu', 'windows',
    'macos', 'centos', 'android', 'ios', 'debian',
    'bazel', 'glibc', 'jetpack', 'gcc', 'gzip',
    'coremltools', 'postgres', 'mysql', 'blender', 'emr',
]

In [41]:
len(required_pkgs)

40

In [None]:
# Scan each package manager's ranked listing and collect the project titles
# (among the first 100 entries) that appear in required_pkgs.
libs_tag = []
for pkg_manager_url in pkg_managers_url:
  listing = requests.get(pkg_manager_url)
  if listing.status_code == 200:
    listing_doc = BeautifulSoup(listing.text, 'html.parser')
    for project_div in listing_doc.find_all('div', {'class': 'project'})[:100]:
      title = project_div.find('a').text
      if title in required_pkgs:
        print(title)
        libs_tag.append(title)
  else:
    # Non-200 answer: report it and move on to the next package manager.
    print('error')
len(libs_tag)

Method 2: Search for Certain Packages

In [65]:
# Packages whose version page is taken from the first search hit.
revised_required_pkgs = [
    'wxpython', 'tensorflow', 'tensorflow-datasets', 'tensorboard',
    'pytorch', 'numpy', 'scipy', 'scikit-learn',
    'pandas', 'keras', 'flask', 'zappa',
    'subprocess', 'sqlalchemy', 'psycopg', 'flask-sqlalchemy',
    'tensorflow-core-platform',
]
# Packages whose version page is taken from the second search hit instead.
outlier_required_pkgs = [
    'tensorflow-transform',
    'airflow',
    'fftw',
]

In [118]:
def get_response(url, timeout=30):
  """Fetch ``url`` with ``requests.get``, retrying once on a non-200 status.

  Parameters
  ----------
  url : str
      Address to request.
  timeout : float, optional
      Value handed to ``requests.get`` as ``timeout`` for each attempt
      (defaults to 30 seconds).

  Returns
  -------
  The response object of the first attempt that answered with status 200,
  or ``-1`` when both attempts answered with a different status.
  """
  response = requests.get(url, timeout=timeout)
  if response.status_code == 200:
    return response

  print("Fetch Error, retrying ...")
  # Wait a random 0-5 seconds before the single retry.
  time.sleep(random.uniform(0, 5))
  response = requests.get(url, timeout=timeout)
  if response.status_code != 200:
    print("FETCH ERROR AGAIN")
    return -1
  # A successful retry must hand back its response rather than fall through
  # to an implicit None.
  return response

In [134]:
import re

def contains_non_numeric_chars(string):
    """Tell whether ``string`` holds any character other than the digits 0-9 and '.'.

    Returns True if at least one such character is present and False
    otherwise; an empty string therefore yields False.
    """
    return bool(re.search(r'[^0-9.]', string))

In [135]:
def obtain_max_vertion(doc):
  """Return the text of the first purely numeric link in the first ``table.table`` of ``doc``.

  The links of that table are examined in the order ``find_all('a')`` yields
  them; any link text for which ``contains_non_numeric_chars`` is true is
  skipped. (Presumably the table lists the newest release first, which is
  what makes the result the maximum version - TODO confirm.)

  Parameters
  ----------
  doc : object exposing ``find`` the way a parsed BeautifulSoup document does.

  Raises
  ------
  ValueError
      If ``doc`` has no matching table, or none of its links is purely numeric.
  """
  table = doc.find('table', { 'class': 'table'})
  if table is None:
    raise ValueError("no versions table found in document")
  # Collect the links once instead of re-searching the document on every step.
  for link in table.find_all('a'):
    if not contains_non_numeric_chars(link.text):
      return link.text
  raise ValueError("no purely numeric version found in versions table")

In [136]:
def obtain_min_version(doc):
  """Return the text of the last purely numeric row link in the first ``table.table`` of ``doc``.

  Rows (``tr``) are walked from the bottom of the table upwards; for each row
  the first link is taken, and link texts for which
  ``contains_non_numeric_chars`` is true are skipped. (Presumably the table
  lists the oldest release last, which is what makes the result the minimum
  version - TODO confirm.)

  Parameters
  ----------
  doc : object exposing ``find`` the way a parsed BeautifulSoup document does.

  Raises
  ------
  ValueError
      If ``doc`` has no matching table, or no row holds a purely numeric link.
  """
  table = doc.find('table', { 'class': 'table'})
  if table is None:
    raise ValueError("no versions table found in document")
  # Fetch the rows once instead of re-searching the document on every step.
  for row in reversed(table.find_all('tr')):
    link = row.find('a')
    # A row without any link carries no version text; skip it instead of
    # failing on ``None.text``.
    if link is None:
      continue
    if not contains_non_numeric_chars(link.text):
      return link.text
  raise ValueError("no purely numeric version found in versions table")

In [137]:
def _fetch_soup(url):
  """Download ``url`` through ``get_response`` and parse it with 'html.parser'.

  Raises RuntimeError when ``get_response`` hands back something without a
  ``text`` attribute (it reports failure with a non-response value such as -1).
  """
  reply = get_response(url)
  if not hasattr(reply, 'text'):
    raise RuntimeError("could not fetch " + url)
  return BeautifulSoup(reply.text, 'html.parser')


def url_to_ver(pkg_url):
  """Return ``(min_version, max_version)`` scraped from a package's versions page.

  The maximum comes from the page at ``pkg_url``. The minimum comes from the
  same page when it has no ``li.prev.disabled`` pagination element;
  otherwise the number of ``li`` elements between that element and the
  ``li.next`` element is used as the index of the last page
  (``pkg_url + '?page=N'``), and the minimum is read from that page.

  Parameters
  ----------
  pkg_url : str
      URL of the versions listing (without a ``page`` query parameter).

  Raises
  ------
  RuntimeError
      If a required page could not be fetched.
  """
  pkg_doc = _fetch_soup(pkg_url)
  max_version = obtain_max_vertion(pkg_doc)

  prev_li = pkg_doc.find('li', {'class': 'prev disabled'})
  if prev_li is None:
    # No pagination controls: the single page holds every version.
    return obtain_min_version(pkg_doc), max_version

  next_li = pkg_doc.find('li', {'class': 'next'})
  # Count the page entries sitting between the "prev" and "next" controls.
  page_count = 0
  for sibling in prev_li.next_siblings:
    if sibling == next_li:
      break
    if sibling.name == "li":
      page_count += 1

  min_doc = _fetch_soup(pkg_url + '?page=' + str(page_count))
  min_version = obtain_min_version(min_doc)
  return min_version, max_version

In [138]:
# Maps package name -> (min_version, max_version); filled by the lookup loop.
look_up_dic = dict()

In [139]:
def _top_result_href(search_doc, rank):
  """Return the ``href`` of the ``rank``-th (0-based) project hit of a search page, or None.

  None is returned when the page has fewer than ``rank + 1`` ``div.project``
  entries, or when the chosen entry has no link inside an ``h5`` carrying an
  ``href`` attribute.
  """
  hits = search_doc.find_all('div', {'class': 'project'})
  if rank >= len(hits):
    return None
  heading = hits[rank].find('h5')
  link = heading.find('a') if heading is not None else None
  return link.get('href') if link is not None else None


fetch_count = 0
# Regular packages use the first search hit (rank 0); for the outliers the
# second hit (rank 1) is used instead.
for pkgs, rank in ((revised_required_pkgs, 0), (outlier_required_pkgs, 1)):
  for pkg in pkgs:
    fetch_count += 1
    # Pause for a random time (up to 200 s) on every fifth request.
    if fetch_count % 5 == 0:
      time.sleep(random.uniform(0, 200))
    search_url = "https://libraries.io/search?q=" + pkg
    search_response = get_response(search_url)
    if not hasattr(search_response, 'text'):
      # get_response did not hand back a response object; skip this package
      # instead of crashing on `.text`.
      print(pkg + " search failed.")
      continue
    search_doc = BeautifulSoup(search_response.text, 'html.parser')
    topResult_url = _top_result_href(search_doc, rank)
    if topResult_url is None:
      print(pkg + " not find.")
      continue
    look_up_dic[pkg] = url_to_ver(web_url + topResult_url + "/versions")
look_up_dic

{'wxpython': ('2.8.11.0', '4.2.1'),
 'tensorflow': ('0.10.0', '2.12.0'),
 'tensorflow-datasets': ('0.0.1', '4.9.2'),
 'tensorboard': ('1.6.0', '2.14.0'),
 'pytorch': ('0.2.0', '2.0.1'),
 'numpy': ('1.0', '1.25.2'),
 'scipy': ('0.4.4', '1.11.1'),
 'scikit-learn': ('0.9', '1.3.0'),
 'pandas': ('0.1', '2.0.3'),
 'keras': ('0.2.0', '2.13.1'),
 'flask': ('0.1', '2.3.2'),
 'zappa': ('0.1.0', '0.57.0'),
 'subprocess': ('0.1.0', '0.2.9'),
 'sqlalchemy': ('0.1.0', '2.0.19'),
 'psycopg': ('1.1.21', '3.1.10'),
 'flask-sqlalchemy': ('0.5', '3.0.5'),
 'tensorflow-core-platform': ('0.2.0', '0.5.0'),
 'tensorflow-transform': ('0.1.0', '1.13.0'),
 'airflow': ('1.10.3', '2.4.3'),
 'fftw': ('3.3.4', '3.3.10')}

Convert Dict to JSON File

In [140]:
import json

In [141]:
# Maps package name -> {'min': ..., 'max': ...}; this is what gets written to JSON.
COMPONENT_VERSION_RANGE = dict()

In [142]:
# Re-shape every (min, max) tuple into a {'min': ..., 'max': ...} mapping,
# ignoring packages whose lookup value is None.
for name, bounds in look_up_dic.items():
    if bounds is None:
        continue
    lowest, highest = bounds
    COMPONENT_VERSION_RANGE[name] = {'min': lowest, 'max': highest}

In [143]:
COMPONENT_VERSION_RANGE

{'wxpython': {'min': '2.8.11.0', 'max': '4.2.1'},
 'tensorflow': {'min': '0.10.0', 'max': '2.12.0'},
 'tensorflow-datasets': {'min': '0.0.1', 'max': '4.9.2'},
 'tensorboard': {'min': '1.6.0', 'max': '2.14.0'},
 'pytorch': {'min': '0.2.0', 'max': '2.0.1'},
 'numpy': {'min': '1.0', 'max': '1.25.2'},
 'scipy': {'min': '0.4.4', 'max': '1.11.1'},
 'scikit-learn': {'min': '0.9', 'max': '1.3.0'},
 'pandas': {'min': '0.1', 'max': '2.0.3'},
 'keras': {'min': '0.2.0', 'max': '2.13.1'},
 'flask': {'min': '0.1', 'max': '2.3.2'},
 'zappa': {'min': '0.1.0', 'max': '0.57.0'},
 'subprocess': {'min': '0.1.0', 'max': '0.2.9'},
 'sqlalchemy': {'min': '0.1.0', 'max': '2.0.19'},
 'psycopg': {'min': '1.1.21', 'max': '3.1.10'},
 'flask-sqlalchemy': {'min': '0.5', 'max': '3.0.5'},
 'tensorflow-core-platform': {'min': '0.2.0', 'max': '0.5.0'},
 'tensorflow-transform': {'min': '0.1.0', 'max': '1.13.0'},
 'airflow': {'min': '1.10.3', 'max': '2.4.3'},
 'fftw': {'min': '3.3.4', 'max': '3.3.10'}}

In [144]:
# Serialise the version ranges as 4-space-indented JSON into version.json.
file_path = 'version.json'
with open(file_path, 'w') as out_file:
    out_file.write(json.dumps(COMPONENT_VERSION_RANGE, indent=4))