In [1]:
%pip install bs4 requests 'polars[all]' lxml pandas

Note: you may need to restart the kernel to use updated packages.


In [20]:
import requests
from bs4 import BeautifulSoup
import polars as pl
from polars import Config
import re
import json5
import pandas as pd
import os
from pathlib import Path

In [21]:
# Raise polars' display limit so that up to 100 rows are drawn when a table is shown.
Config.set_tbl_rows(100)

polars.config.Config

In [3]:
url = 'https://distrowatch.com/index.php?dataspan=1'
# requests applies no timeout unless one is given, so a stalled connection
# would otherwise hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page into empty lists.
response.raise_for_status()

In [4]:
# Parse the fetched page with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features='html.parser')
# <td class="phr2"> cells: their text and link href are read in the next cell.
distros = soup.find_all('td', class_='phr2')
# <td class="phr3"> cells: their text is converted to integer hit counts in the next cell.
hits = soup.find_all('td', class_='phr3')

In [5]:
# Text and link href of every cell collected in `distros`, in page order.
distributions = [cell.get_text() for cell in distros]
ids = [cell.a['href'] for cell in distros]

# The cells in `hits` hold their counters as text; keep them as integers.
page_hits = [int(cell.get_text()) for cell in hits]

In [6]:
# Assemble the scraped lists column by column, then persist them as a JSON
# array with one record per row.
top_distros = {}
top_distros['distribution'] = distributions
top_distros['id'] = ids
top_distros['page hit(s)'] = page_hits

df = pd.DataFrame(data=top_distros)
df.to_json("top_distros.json", orient="records")

In [22]:
# Load the ranking back from the JSON file written by the previous cell, this
# time with polars. NOTE: this rebinds `df`, which held the pandas frame before.
df = pl.read_json("top_distros.json")
df

distribution,id,page hit(s)
str,str,i64
"""MX Linux""","""mx""",2738
"""Mint""","""mint""",1963
"""Fedora""","""fedora""",1646
"""EndeavourOS""","""endeavour""",1640
"""Debian""","""debian""",1372
"""Manjaro""","""manjaro""",1095
"""Ubuntu""","""ubuntu""",1056
"""Garuda""","""garuda""",1029
"""Pop!_OS""","""popos""",904
"""Zorin""","""zorin""",730


In [7]:
def add_hyphen(string):
    """Insert a hyphen wherever a digit is immediately followed by an ASCII letter.

    For example ``"22.04LTS"`` becomes ``"22.04-LTS"``. Text without such a
    digit/letter boundary is returned unchanged.
    """
    digit_then_letter = re.compile(r'(\d)([a-zA-Z])')
    return digit_then_letter.sub(r'\1-\2', string)

In [27]:
_REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given
_PACKAGE_COUNT_LABEL = "Number of packages:"


def _fetch_soup(url):
    """Download ``url`` and return its body parsed with the lxml parser."""
    reply = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail with the HTTP error itself rather than with a parse error further down.
    reply.raise_for_status()
    return BeautifulSoup(reply.content, 'lxml')


def _row_values(text, blank_to_na=True):
    """Split one table row's text into its values, without the row header.

    The text is split on newlines and the first and last pieces are discarded
    (presumably the empty strings left by the row's leading and trailing
    newlines -- TODO confirm against the page markup). Of what remains, the
    first cell is the row header; a trailing cell that repeats it is dropped.

    Args:
        text: Text of a ``<tr>`` element.
        blank_to_na: When true, cells consisting of a single non-breaking
            space are replaced with ``"NA"``.

    Returns:
        The cells after the header, as a list of strings.

    Raises:
        IndexError: If the row yields no cells at all.
    """
    cells = text.split("\n")[1:-1]
    if blank_to_na:
        cells = ["NA" if cell == "\xa0" else cell for cell in cells]
    if cells[0] == cells[-1]:
        cells.pop()
    return cells[1:]


def _link_targets(row):
    """Return, for every ``<td>`` in ``row``, the href of its first link, or ``"NA"`` if it has none."""
    return [td.a["href"] if td.a else "NA" for td in row.find_all("td")]


def _package_count(href):
    """Return the package count published on the page at ``href`` as a digit string.

    ``"NA"`` is passed through unchanged, and is also returned when the page
    has no ``<b>`` element starting with the "Number of packages:" label.
    """
    if href == "NA":
        return "NA"
    pkg_soup = _fetch_soup(f'https://distrowatch.com/{href}')
    # Every page is searched from its first <b>, so the label's position on
    # one page cannot influence the lookup on the next.
    for bold in pkg_soup.select("b"):
        text = bold.get_text()
        if text.startswith(_PACKAGE_COUNT_LABEL):
            return text.strip()[len(_PACKAGE_COUNT_LABEL):].strip().replace(",", "")
    return "NA"


def scrape_features_as_json(id):
    """Scrape one distribution's feature table and write it to ``features_{id}.json``.

    The page ``https://distrowatch.com/table.php?distribution={id}`` is
    downloaded, the table row whose text starts with "Feature" is located, and
    the sixteen rows that follow it are read by position. The result is written
    with pandas as a JSON array holding one record per entry of the "Feature"
    row (the ``version`` column). One extra request is made for every linked
    package-list page to read its package count.

    Args:
        id: Distribution identifier used in the page URL and in the output
            file name. (The name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        None. The only result is the file ``features_{id}.json`` in the
        current working directory.

    Raises:
        IndexError: If no "Feature" row is found or an expected row is missing.
        requests.RequestException: On a timeout, connection failure, or HTTP
            error status.
        ValueError: From pandas, if the scraped columns differ in length.
    """
    dist_soup = _fetch_soup(f'https://distrowatch.com/table.php?distribution={id}')
    # Replacing <br> nodes below never adds or removes <tr> elements, so the
    # row list can be computed once and indexed by position afterwards.
    rows = dist_soup.select('tr')

    for counter, row in enumerate(rows):
        # A <br> contributes no text of its own; swap it for '-' so the two
        # halves of a cell stay separated. This is applied to every row
        # scanned on the way to the header row, and to everything nested in it.
        for br in row.find_all('br'):
            br.replace_with('-')
        features = row.get_text().strip().split("\n")
        if features[0] == "Feature":
            break
    else:
        raise IndexError(f"no 'Feature' header row found for distribution {id!r}")

    features = [add_hyphen(feature) for feature in features]
    # Drop a trailing cell that repeats the leading "Feature" header.
    if features[0] == features[-1]:
        features.pop()
    versions = features[1:]

    def row_text(offset):
        """Text of the row ``offset`` positions after the "Feature" row."""
        return rows[counter + offset].get_text()

    # Assumption carried by this parser: a newline has to be inserted after
    # every non-breaking space in this row so blank cells split like the
    # others; doubled newlines are then collapsed and ';' characters removed.
    end_of_life_text = (
        row_text(2)
        .replace('\xa0', '\xa0\n')
        .replace('\xa0\n\n', '\xa0\n')
        .replace(';', '')
    )

    result_json = {
        "distribution_name": [id] * len(versions),
        "version": versions,
        # Release dates are taken as-is: blank cells are not mapped to "NA".
        "release_date": _row_values(row_text(1), blank_to_na=False),
        "end_of_life": _row_values(end_of_life_text),
        "price": _row_values(row_text(3)),
        "image_size": _row_values(row_text(4)),
        "download_link": _link_targets(rows[counter + 5]),
        "installation_method": _row_values(row_text(6)),
        "default_desktop": _row_values(row_text(7)),
        "package_management": _row_values(row_text(8)),
        "release_model": _row_values(row_text(9)),
        "office_suite": _row_values(row_text(10)),
        "processor_architecture": _row_values(row_text(11)),
        "init_system": _row_values(row_text(12)),
        "file_system": _row_values(row_text(13)),
        "multilingual": _row_values(row_text(14)),
        "asian_language_support": _row_values(row_text(15)),
        # This row holds links; each linked page is fetched for its package count.
        "number_of_packages": [
            _package_count(href) for href in _link_targets(rows[counter + 16])
        ],
    }

    pd.DataFrame(result_json).to_json(f'features_{id}.json', orient="records")

In [34]:
# Show the kernel's working directory: the cells in this notebook read and
# write their JSON files through relative paths.
os.getcwd()

'/home/sachin/Documents/Projects/DSAI/Distro_rec'

In [28]:
# Scrape every distribution in `ids`, printing a numbered progress line after each one.
# The loop variable is deliberately not named `id`: at notebook top level that
# would replace the builtin id() for every later cell.
for position, distro_id in enumerate(ids, start=1):
    scrape_features_as_json(distro_id)
    print(f"{position}. scraped: {distro_id}")

1. scraped: mx
2. scraped: mint
3. scraped: fedora
4. scraped: endeavour
5. scraped: debian
6. scraped: manjaro
7. scraped: ubuntu
8. scraped: garuda
9. scraped: popos
10. scraped: zorin
11. scraped: opensuse
12. scraped: biglinux
13. scraped: kdeneon
14. scraped: nobara
15. scraped: xero
16. scraped: freebsd
17. scraped: antix
18. scraped: alma
19. scraped: elementary
20. scraped: lite
21. scraped: vanilla
22. scraped: nixos
23. scraped: mageia
24. scraped: tuxedo
25. scraped: smartos
26. scraped: ghostbsd
27. scraped: pclinuxos
28. scraped: kali
29. scraped: relianoid
30. scraped: nutyx
31. scraped: puppy
32. scraped: kubuntu
33. scraped: clonezilla
34. scraped: sparky
35. scraped: peppermint
36. scraped: q4os
37. scraped: tails
38. scraped: devuan
39. scraped: arco
40. scraped: alpine
41. scraped: gnoppix
42. scraped: solus
43. scraped: cachyos
44. scraped: slackware
45. scraped: blendos
46. scraped: reactos
47. scraped: alt
48. scraped: arch
49. scraped: bodhi
50. scraped: truenas


In [24]:
# Resumable variant of the scrape loop: a distribution whose features_<id>.json
# already exists in the working directory is not fetched again, so an
# interrupted run can be restarted without repeating finished work.
# The loop variable is not named `id`, which would replace the builtin id()
# in the notebook namespace.
for distro_id in ids:
    if not Path(f"features_{distro_id}.json").exists():
        scrape_features_as_json(distro_id)
    # Printed for skipped and freshly scraped distributions alike.
    print(f"scraped: {distro_id}")

scraped: mx
scraped: mint
scraped: fedora
scraped: endeavour
scraped: debian
scraped: manjaro
scraped: ubuntu
scraped: garuda
scraped: popos
scraped: zorin
scraped: opensuse
scraped: biglinux
scraped: kdeneon
scraped: nobara
scraped: xero
scraped: freebsd
scraped: antix
scraped: alma
scraped: elementary
scraped: lite
scraped: vanilla
scraped: nixos
scraped: mageia
scraped: tuxedo
scraped: smartos
scraped: ghostbsd
scraped: pclinuxos
scraped: kali
scraped: relianoid
scraped: nutyx
scraped: puppy
scraped: kubuntu
scraped: clonezilla
scraped: sparky
scraped: peppermint
scraped: q4os
scraped: tails
scraped: devuan
scraped: arco
scraped: alpine
scraped: gnoppix
scraped: solus
scraped: cachyos
scraped: slackware
scraped: blendos
scraped: reactos
scraped: alt
scraped: arch
scraped: bodhi
scraped: truenas
scraped: voyager
scraped: lubuntu
scraped: slackel
scraped: centos
scraped: pureos
scraped: gentoo
scraped: rocky
scraped: easyos
scraped: parrot
scraped: openmamba
scraped: nitrux
scraped: x

In [10]:
# Load one per-distribution export for inspection.
# NOTE(review): assumes features_artix.json already exists; "artix" does not
# appear in the scrape logs printed above -- confirm where this file comes from.
data = pl.read_json("features_artix.json")

In [11]:
# Render the loaded frame as the cell output.
data

distribution_name,version,release_date,end_of_life,price,image_size,download_link,installation_method,default_desktop,package_management,release_model,office_suite,processor_architecture,init_system,file_system,multilingual,asian_language_support,number_of_packages
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""artix""","""20230814""","""2023-08-14""","""NA""","""Free""","""700-1700""","""https://downlo…","""Graphical""","""Cinnamon, LXDE…","""Pacman""","""Rolling""","""--""","""x86_64""","""Dinit, OpenRC,…","""ext3, ext4""","""NA""","""NA""","""1255"""
"""artix""","""20220713""","""2022-07-17""","""NA""","""Free""","""700-1700""","""https://downlo…","""Graphical""","""Cinnamon, LXDE…","""Pacman""","""Rolling""","""--""","""x86_64""","""Dinit, OpenRC,…","""ext3, ext4""","""NA""","""NA""","""1252"""
"""artix""","""20210726""","""2021-07-28""","""NA""","""Free""","""600-1400""","""https://downlo…","""Graphical""","""Cinnamon, LXDE…","""Pacman""","""Rolling""","""--""","""x86_64""","""OpenRC, runit,…","""ext3, ext4""","""NA""","""NA""","""2302"""
"""artix""","""20200210""","""2020-02-10""","""NA""","""Free""","""600-1200""","""https://downlo…","""Graphical""","""Cinnamon, LXDE…","""Pacman""","""Rolling""","""--""","""x86_64""","""OpenRC, runit,…","""ext3, ext4""","""NA""","""NA""","""982"""
"""artix""","""20181004""","""2018-10-04""","""NA""","""Free""","""400-900""","""https://source…","""Graphical""","""LXQt""","""Pacman""","""Rolling""","""--""","""x86_64""","""OpenRC, runit""","""ext3, ext4""","""NA""","""NA""","""644"""
"""artix""","""20170808""","""2017-08-08""","""NA""","""Free""","""300-700""","""https://source…","""Graphical""","""i3, LXQt""","""Pacman""","""Rolling""","""--""","""x86_64""","""OpenRC""","""ext3, ext4""","""NA""","""NA""","""644"""
