Fetch Website Content

In [23]:
!pip install requests --upgrade



In [24]:
import requests
import time
import random

In [25]:
web_url = "https://libraries.io"

In [26]:
response = requests.get(web_url)

In [27]:
response.status_code

200

In [28]:
len(response.text)

21765

In [29]:
page_contents = response.text

Parse Data

In [30]:
!pip install beautifulsoup4 --upgrade --quiet

In [31]:
from bs4 import BeautifulSoup

In [32]:
doc = BeautifulSoup(page_contents, 'html.parser')

Method1: Package Manager Popular Library Page

In [33]:
required_pkgs = ['wxpython', 'tensorflow', 'tfds', 'tensorboard', 'pytorch', 'numpy',
    'scipy', 'scikit-learn', 'pandas', 'keras', 'flask', 'tensorflow-transform', 'zappa',
    'subprocess', 'sqlalchemy', 'psycopg', 'flask-sqlalchemy', 'airflow', 'fftw', 'tensorflow-core-platform',
                 'cuda', 'cudnn', 'python', 'ubuntu', 'windows', 'macos', 'centos', 'android', 'ios', 'debian',
                 'bazel', 'glibc', 'jetpack', 'gcc', 'gzip', 'coremltools', 'postgres', 'mysql', 'blender', 'emr']

In [34]:
len(required_pkgs)

40

Method2: Search for Certain Packages

In [22]:
def get_response(url):
  response = requests.get(url)
  if (response.status_code != 200):
    print("Fetch Error, retrying ...")
    time.sleep(random.uniform(0, 5))
    response = requests.get(url)
    if (response.status_code != 200):
      print("FETCH ERROR AGAIN")
      return -1
  else:
    return response

Trial Example for Tensorflow

In [79]:
search_url = "https://libraries.io/search?q=" + "tensorflow"
search_response = get_response(search_url)
search_doc = BeautifulSoup(search_response.text, 'html.parser')
topResult_url = search_doc.find('div', {'class': 'project'}).find('h5').find('a')['href']
lib_url = web_url + topResult_url
print(lib_url)

https://libraries.io/conda/tensorflow


In [56]:
lib_search_response = get_response(lib_url)
lib_search_doc = BeautifulSoup(lib_search_response.text, 'html.parser')
detail_card = lib_search_doc.find('dl', {'class': 'row detail-card'})
key_list = detail_card.find_all('dt', {'class': 'col-xs-8'})
value_list = detail_card.find_all('dd', {'class': 'col-xs-4'})
href_dic = {}
print(len(key_list), len(value_list))

for i in range(len(key_list)):
    a_element = value_list[i].find('a')
    if (a_element):
        href_dic[key_list[i].text.strip()] = a_element['href']
print(href_dic)


12 12
{'Total releases': '/conda/tensorflow/versions', 'Stars': 'https://github.com/tensorflow/tensorflow/stargazers', 'Forks': 'https://github.com/tensorflow/tensorflow/network', 'Watchers': 'https://github.com/tensorflow/tensorflow/watchers', 'Contributors': 'https://github.com/tensorflow/tensorflow/graphs/contributors', 'SourceRank': '/conda/tensorflow/sourcerank'}


In [86]:
stats = obtain_stats(lib_url)
stats

{'Dependencies': {'value': '5'},
 'Dependent packages': {'value': '59'},
 'Dependent repositories': {'value': '0'},
 'Total releases': {'value': '44', 'href': '/conda/tensorflow/versions'},
 'Latest release': {'value': 'May  2, 2023'},
 'First release': {'value': 'Jan 19, 2018'},
 'Stars': {'value': '177K',
  'href': 'https://github.com/tensorflow/tensorflow/stargazers'},
 'Forks': {'value': '88.9K',
  'href': 'https://github.com/tensorflow/tensorflow/network'},
 'Watchers': {'value': '7,693',
  'href': 'https://github.com/tensorflow/tensorflow/watchers'},
 'Contributors': {'value': '888',
  'href': 'https://github.com/tensorflow/tensorflow/graphs/contributors'},
 'Repository size': {'value': '887 MB'},
 'SourceRank': {'value': '19', 'href': '/conda/tensorflow/sourcerank'},
 'keywords': ['deep-learning',
  'deep-neural-networks',
  'distributed',
  'machine-learning',
  'ml',
  'neural-network',
  'python',
  'tensorflow'],
 'license': ['Apache-2.0'],
 'install_command': 'conda install

In [57]:
import json
stats_data = {}
for key, value in zip(key_list, value_list):
    key_text = key.text.strip()
    value_text = value.text.strip()

    if key_text in href_dic:
        stats_data[key_text] = {
            'value': value_text,
            'href': href_dic[key_text]
        }
    else:
        stats_data[key_text] = {
            'value': value_text
        }
stats_data

{'Dependencies': {'value': '5'},
 'Dependent packages': {'value': '59'},
 'Dependent repositories': {'value': '0'},
 'Total releases': {'value': '44', 'href': '/conda/tensorflow/versions'},
 'Latest release': {'value': 'May  2, 2023'},
 'First release': {'value': 'Jan 19, 2018'},
 'Stars': {'value': '177K',
  'href': 'https://github.com/tensorflow/tensorflow/stargazers'},
 'Forks': {'value': '88.9K',
  'href': 'https://github.com/tensorflow/tensorflow/network'},
 'Watchers': {'value': '7,693',
  'href': 'https://github.com/tensorflow/tensorflow/watchers'},
 'Contributors': {'value': '888',
  'href': 'https://github.com/tensorflow/tensorflow/graphs/contributors'},
 'Repository size': {'value': '887 MB'},
 'SourceRank': {'value': '19', 'href': '/conda/tensorflow/sourcerank'}}

In [74]:
main_column = lib_search_doc.find('div', {'class': 'col-md-8'}).find('dl')
keywords_list = main_column.find('dd').find_all('a')
license_list = main_column.find_all('dd')[1].find_all('a')
install_command = main_column.find('dd', {"class": "well"}).find('code').text.strip()

keywords = []
for keyword in keywords_list:
    keywords.append(keyword.text.strip())
license = []
for lic in license_list:
    license.append(lic.text.strip())
install_command


deep-learning
deep-neural-networks
distributed
machine-learning
ml
neural-network
python
tensorflow


'conda install -c anaconda tensorflow'

In [77]:
keyword_title = main_column.find('dt').text
license_title = main_column.find_all('dt')[1].text
license_title

'\n      License\n    '

In [67]:
stats_data['keywords'] = keywords
stats_data['license'] = license
stats_data['install_command'] = install_command
stats_data

{'Dependencies': {'value': '5'},
 'Dependent packages': {'value': '59'},
 'Dependent repositories': {'value': '0'},
 'Total releases': {'value': '44', 'href': '/conda/tensorflow/versions'},
 'Latest release': {'value': 'May  2, 2023'},
 'First release': {'value': 'Jan 19, 2018'},
 'Stars': {'value': '177K',
  'href': 'https://github.com/tensorflow/tensorflow/stargazers'},
 'Forks': {'value': '88.9K',
  'href': 'https://github.com/tensorflow/tensorflow/network'},
 'Watchers': {'value': '7,693',
  'href': 'https://github.com/tensorflow/tensorflow/watchers'},
 'Contributors': {'value': '888',
  'href': 'https://github.com/tensorflow/tensorflow/graphs/contributors'},
 'Repository size': {'value': '887 MB'},
 'SourceRank': {'value': '19', 'href': '/conda/tensorflow/sourcerank'},
 'keywords': ['deep-learning',
  'deep-neural-networks',
  'distributed',
  'machine-learning',
  'ml',
  'neural-network',
  'python',
  'tensorflow'],
 'license': ['Apache-2.0'],
 'install_command': 'conda install

Functions for all packages

In [35]:
revised_required_pkgs = ['wxpython', 'tensorflow', 'tensorflow-datasets', 'tensorboard', 'pytorch', 'numpy',
    'scipy', 'scikit-learn', 'pandas', 'keras', 'flask', 'zappa',
    'subprocess', 'sqlalchemy', 'psycopg', 'flask-sqlalchemy', 'tensorflow-core-platform']
outlier_required_pkgs = ['tensorflow-transform', 'airflow', 'fftw']

In [68]:
def obtain_lib_url(pkg):
    search_url = "https://libraries.io/search?q=" + pkg
    search_response = get_response(search_url)
    search_doc = BeautifulSoup(search_response.text, 'html.parser')
    topResult_url = search_doc.find('div', {'class': 'project'}).find('h5').find('a')['href']
    lib_url = web_url + topResult_url
    return lib_url

In [85]:
def obtain_stats(lib_url):
    lib_search_response = get_response(lib_url)
    lib_search_doc = BeautifulSoup(lib_search_response.text, 'html.parser')
    detail_card = lib_search_doc.find('dl', {'class': 'row detail-card'})
    key_list = detail_card.find_all('dt', {'class': 'col-xs-8'})
    value_list = detail_card.find_all('dd', {'class': 'col-xs-4'})
    href_dic = {}

    for i in range(len(key_list)):
        a_element = value_list[i].find('a')
        if (a_element):
            href_dic[key_list[i].text.strip()] = a_element['href']
    stats_data = {}
    for key, value in zip(key_list, value_list):
        key_text = key.text.strip()
        value_text = value.text.strip()

        if key_text in href_dic:
            stats_data[key_text] = {
                'value': value_text,
                'href': href_dic[key_text]
            }
        else:
            stats_data[key_text] = {
                'value': value_text
            }
    
    main_column = lib_search_doc.find('div', {'class': 'col-md-8'}).find('dl')
    main_titles = main_column.find_all('dt')
    keyword_index= check_keyword_index(main_titles)
    license_index = check_license_index(main_titles)

    if (keyword_index != -1):
        keyword_list = main_column.find_all('dd')[keyword_index].find_all('a')
        keywords = []
        for keyword in keywords_list:
            keywords.append(keyword.text.strip())
        stats_data['keywords'] = keywords
    else:
        stats_data['keywords'] = []
    
    if (license_index != -1):
        license_list = main_column.find_all('dd')[license_index].find_all('a')
        license = []
        for lic in license_list:
            license.append(lic.text.strip())
        stats_data['license'] = license

    try:
        install_command = main_column.find('dd', {"class": "well"}).find('code').text.strip()
    except AttributeError:
        install_command = ""
    stats_data['install_command'] = install_command
    
    return stats_data
    

In [83]:
def check_keyword_index(main_titles):
    for i in range(len(main_titles)):
        if (main_titles[i].text.strip() == "Keywords"):
            return i
    return -1

In [84]:
def check_license_index(main_titles):
    for i in range(len(main_titles)):
        if (main_titles[i].text.strip() == "License"):
            return i
    return -1

In [87]:
look_up_dic = {}

In [88]:
fetch_count = 0
for pkg in revised_required_pkgs:
  print(pkg)
  fetch_count += 1
  if fetch_count % 5 == 0:
    time.sleep(random.uniform(0, 200))
  lib_url = obtain_lib_url(pkg)
  look_up_dic[pkg] = obtain_stats(lib_url)

for pkg in outlier_required_pkgs:
  fetch_count += 1
  if fetch_count % 5 == 0:
    time.sleep(random.uniform(0, 200))
  lib_url = obtain_lib_url(pkg)
  look_up_dic[pkg] = obtain_stats(lib_url)
look_up_dic

wxpython
tensorflow
tensorflow-datasets
tensorboard
pytorch
numpy
scipy
scikit-learn
pandas
keras
flask
zappa
subprocess
sqlalchemy
psycopg
flask-sqlalchemy
tensorflow-core-platform


{'wxpython': {'Dependencies': {'value': '2'},
  'Dependent packages': {'value': '272'},
  'Dependent repositories': {'value': '695'},
  'Total releases': {'value': '21', 'href': '/pypi/wxPython/versions'},
  'Latest release': {'value': 'Jun  8, 2023'},
  'First release': {'value': 'Nov  1, 2010'},
  'Stars': {'value': '2.03K',
   'href': 'https://github.com/wxWidgets/Phoenix/stargazers'},
  'Forks': {'value': '507',
   'href': 'https://github.com/wxWidgets/Phoenix/network'},
  'Watchers': {'value': '109',
   'href': 'https://github.com/wxWidgets/Phoenix/watchers'},
  'Contributors': {'value': '116',
   'href': 'https://github.com/wxWidgets/Phoenix/graphs/contributors'},
  'Repository size': {'value': '44.8 MB'},
  'SourceRank': {'value': '20', 'href': '/pypi/wxPython/sourcerank'},
  'keywords': ['deep-learning',
   'deep-neural-networks',
   'distributed',
   'machine-learning',
   'ml',
   'neural-network',
   'python',
   'tensorflow'],
  'license': ['wxWindows'],
  'install_command'

Convert Dic to Json File

In [89]:
import json

In [90]:
STATS = {}

In [91]:
for key, value in look_up_dic.items():
    if value is not None:
        STATS[key] = value

In [92]:
STATS

{'wxpython': {'Dependencies': {'value': '2'},
  'Dependent packages': {'value': '272'},
  'Dependent repositories': {'value': '695'},
  'Total releases': {'value': '21', 'href': '/pypi/wxPython/versions'},
  'Latest release': {'value': 'Jun  8, 2023'},
  'First release': {'value': 'Nov  1, 2010'},
  'Stars': {'value': '2.03K',
   'href': 'https://github.com/wxWidgets/Phoenix/stargazers'},
  'Forks': {'value': '507',
   'href': 'https://github.com/wxWidgets/Phoenix/network'},
  'Watchers': {'value': '109',
   'href': 'https://github.com/wxWidgets/Phoenix/watchers'},
  'Contributors': {'value': '116',
   'href': 'https://github.com/wxWidgets/Phoenix/graphs/contributors'},
  'Repository size': {'value': '44.8 MB'},
  'SourceRank': {'value': '20', 'href': '/pypi/wxPython/sourcerank'},
  'keywords': ['deep-learning',
   'deep-neural-networks',
   'distributed',
   'machine-learning',
   'ml',
   'neural-network',
   'python',
   'tensorflow'],
  'license': ['wxWindows'],
  'install_command'

In [93]:
file_path = 'stats.json'
with open(file_path, 'w') as json_file:
    json.dump(STATS, json_file, indent=4)