In [None]:
# %pip install bs4 requests 'polars[all]' lxml

In [None]:
import requests
from bs4 import BeautifulSoup
import polars as pl
import re

In [None]:
# Download the DistroWatch index page that the following cells parse.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the next
# cells parse an error page.
url = 'https://distrowatch.com/index.php?dataspan=4'
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
# Parse the downloaded page, then collect the <td> cells carrying the "phr2"
# class (read below for names and links) and the "phr3" class (read below as
# integer hit counts).
soup = BeautifulSoup(response.content, 'html.parser')
distros, hits = (
    soup.find_all('td', class_=css_class) for css_class in ('phr2', 'phr3')
)

In [None]:
# One entry per "phr2" cell: its visible text and the href of its first link.
distributions = [cell.get_text() for cell in distros]
ids = [cell.a['href'] for cell in distros]

# One integer per "phr3" cell.
page_hits = [int(cell.get_text()) for cell in hits]

In [None]:
# Pair every column label with the list scraped for it, build the table and
# persist it next to the notebook as CSV.
top_distros = dict(
    zip(
        ('distribution', 'id', 'page hit(s)'),
        (distributions, ids, page_hits),
    )
)
df = pl.DataFrame(top_distros)
df.write_csv("top_distros.csv", separator=",")

In [None]:
def add_hyphen(string):
    """Insert a hyphen wherever a digit is directly followed by an ASCII letter.

    Example: ``"12bookworm"`` becomes ``"12-bookworm"``. Text without such a
    digit/letter boundary is returned unchanged.
    """
    return re.sub(r'(\d)([a-zA-Z])', r'\1-\2', string)

In [None]:
_DISTROWATCH_URL = 'https://distrowatch.com'
_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the notebook


def _fetch_soup(url):
    """Download ``url`` and return it parsed with the lxml parser (raises on HTTP errors)."""
    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'lxml')


def _row_cells(row, blank_as_na=True):
    """Return the newline-separated texts of a table row as ``[label, value, ...]``.

    The first and last fragments of the split text are dropped, cells that hold
    only a non-breaking space become "NA" (unless ``blank_as_na`` is false),
    and a trailing copy of the row label is removed.
    """
    cells = row.get_text().split("\n")[1:-1]
    if blank_as_na:
        cells = ["NA" if cell == "\xa0" else cell for cell in cells]
    if len(cells) > 1 and cells[0] == cells[-1]:
        cells.pop()
    return cells


def _row_links(row):
    """Return the first link target of every <td> in ``row``, "NA" for cells without a link."""
    # The href is read from the cell itself so a link-less cell cannot shift
    # the links of the cells that follow it.
    return [cell.a["href"] if cell.a else "NA" for cell in row.find_all("td")]


def _scrape_release_packages(distro_id, listing_href, release_label):
    """Return the package count of one release and save its package list as CSV.

    The count is read from the first <b> element whose text starts with
    "Number of packages:"; "NA" is returned when there is none. When the page
    links to a ``resource/<distro_id>...`` file, that file is downloaded and
    written to ``pkglist_<distro_id>_<release_label>.csv``.
    """
    pkg_soup = _fetch_soup(f'{_DISTROWATCH_URL}/{listing_href}')

    count = "NA"
    for bold in pkg_soup.select("b"):
        text = bold.get_text()
        if text.startswith("Number of packages:"):
            # Skip the 19-character label plus the blank after it.
            count = text.strip()[20:].replace(",", "")
            break

    resource_prefix = f"resource/{distro_id}"
    suffix = next(
        (
            anchor["href"]
            for anchor in pkg_soup.select("a")
            if anchor.get("href", "").startswith(resource_prefix)
        ),
        None,
    )
    if suffix is None:
        return count

    paragraph = _fetch_soup(f"{_DISTROWATCH_URL}/" + suffix).find('p')
    if paragraph is None:
        return count

    pkg_names = []
    pkg_versions = []
    for line in paragraph.get_text().strip().split("\n"):
        fields = line.split("^")
        if len(fields) < 2:
            # Not a "name^version" line; nothing to record.
            continue
        pkg_names.append(fields[0])
        pkg_versions.append(fields[1])

    packages = pl.DataFrame({
        "Package Name": pkg_names,
        "Package Version": pkg_versions,
    })
    packages.write_csv(f'pkglist_{distro_id}_{release_label}.csv', separator=",")
    return count


def scrape_features(id):
    """Scrape the feature table of one distribution and save it as ``features_<id>.csv``.

    The page ``table.php?distribution=<id>`` is downloaded and its <tr> rows
    are scanned for the row whose first text is "Feature". The 16 rows after
    it are read by position. For every release that links to a package
    listing, the package count is fetched and the package list is written to
    ``pkglist_<id>_<release>.csv``.

    Args:
        id: Value placed in the ``distribution`` query parameter; also used in
            the output file names.

    Raises:
        IndexError: If no "Feature" row exists or fewer than 16 rows follow it.
        requests.RequestException: If a download fails, times out or returns
            an HTTP error status.
    """
    rows = _fetch_soup(f'{_DISTROWATCH_URL}/table.php?distribution={id}').select('tr')

    for header_idx, row in enumerate(rows):
        features = row.get_text().strip().split("\n")
        if features[0] == "Feature":
            break
    else:
        # Same exception type the positional scan raised when it ran off the end.
        raise IndexError(f"no 'Feature' row found for distribution {id!r}")

    features = [add_hyphen(feature) for feature in features]
    if features[0] == features[-1]:
        features.pop()

    # Rows are addressed by their offset from the "Feature" header row.
    release = _row_cells(rows[header_idx + 1], blank_as_na=False)
    leading_rows = [_row_cells(rows[header_idx + offset]) for offset in (2, 3, 4)]

    download_row = rows[header_idx + 5]
    free_download_list = [download_row.th.get_text()] + _row_links(download_row)

    trailing_rows = [_row_cells(rows[header_idx + offset]) for offset in range(6, 16)]

    # The row at offset 16 supplies both the release labels (its text) and the
    # links to the per-release package listings.
    package_row = rows[header_idx + 16]
    version = _row_cells(package_row)

    num_packages_list = ["Number of Packages"]
    for idx, listing_href in enumerate(_row_links(package_row)):
        if listing_href == "NA":
            num_packages_list.append("NA")
        else:
            num_packages_list.append(
                _scrape_release_packages(id, listing_href, version[idx + 1])
            )

    labelled_rows = [
        release,
        *leading_rows,
        free_download_list,
        *trailing_rows,
        num_packages_list,
    ]
    # First element of every row is its label, the rest are per-release values.
    result = {cells[0]: cells[1:] for cells in labelled_rows}

    df = pl.DataFrame(result)
    df.transpose(
        include_header=True,
        header_name=features[0],
        column_names=features[1:],
    ).write_csv(f'features_{id}.csv', separator=",")

In [None]:
# Scrape the feature table of every distribution link collected above.
# The loop variable is not named `id`, so the builtin id() stays usable in
# the cells that follow.
for distro_id in ids:
    scrape_features(distro_id)

In [None]:
# Example for MX Linux: read back "features_mx.csv" (the file name
# scrape_features writes for the id "mx") and display the table.
mx = pl.read_csv("features_mx.csv")
mx

In [70]:
import weaviate

# Connect to the Weaviate instance expected at localhost:8080.
client = weaviate.Client(url="http://localhost:8080")

# NOTE(review): presumably removes every class and its objects from the
# instance — confirm before running against data you want to keep.
client.schema.delete_all()

WeaviateStartUpError: Weaviate did not start up in 5 seconds. Either the Weaviate URL http://localhost:8080 is wrong or Weaviate did not start up in the interval given in 'startup_period'.

In [33]:
import pandas as pd
# Load the Debian feature table from JSON and display it.
data = pd.read_json(path_or_buf="data/debian.json")
data

Unnamed: 0,unstablesid,testingtrixie,12-bookworm,11-bullseye,10-buster,9-stretch,8.0-jessie,7.0-wheezy,6.0-squeeze,5.0-lenny,4.0-etch,3.1-sarge,3.0-woody,2.2-potato,2.1-slink,2.0-hamm,1.3-bo,1.2-rex,1.1-buzz
Release Date,2023-10-07,2023-06-11,2023-06-10,2021-08-14,2019-07-06,2017-06-18,2015-04-26,2013-05-04,2011-02-06,2009-02-15,2007-04-08,2005-06-06,2002-07-19,2000-08-15,1999-03-09,1998-07-23,1997-06-05,1996-12-12,1996-06-17
End Of Life,,,,,,,2020-07,2018-05,2016-02,,,,,,,,,,
Price (US$),Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free,Free
Image Size (MB),,,,300-3700,300-3700,300-3700,,,,,,,,,,,,,
Free Download,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD,http://www.debian.org/CD
Installation,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Graphical,Text mode,Text mode,Text mode,Text mode,Text mode,Text mode,Text mode,Text mode
Default Desktop,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,GNOME,,,,,,
Package Management,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB,DEB
Release Model,Rolling,Semi,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed,Fixed
Office Suite,"Calligra, GOffice, LibreOffice","Calligra, GOffice, LibreOffice","Calligra, GOffice, LibreOffice","Calligra, GOffice, LibreOffice","Calligra, GOffice, LibreOffice","Calligra, GOffice, LibreOffice","Calligra, GOffice, KOffice, LibreOffice","Calligra, GOffice, KOffice, LibreOffice","GOffice, KOffice, OO.o","GOffice, KOffice, OO.o","GOffice, KOffice, OO.o",KOffice,KOffice,,,,,,


In [38]:
# Show every feature value recorded for the "testingtrixie" column.
data.loc[:, 'testingtrixie']

Release Date                                                     2023-06-11
End Of Life                                                              NA
Price (US$)                                                            Free
Image Size (MB)                                                          NA
Free Download                                      http://www.debian.org/CD
Installation                                                      Graphical
Default Desktop                                                       GNOME
Package Management                                                      DEB
Release Model                                                          Semi
Office Suite                                 Calligra, GOffice, LibreOffice
Processor Architecture    amd64, aarch64, armel, armhf, i686, mipsel, mi...
Init Software                                                       systemd
Journaled File Systems                Btrfs, ext3, ext4, JFS, ReiserFS, XFS
Multilingual

In [68]:
import weaviate
import json


# Settings for displaying the import progress.
# `interval`: print progress every this many records; should be bigger than the batch_size.
# NOTE(review): neither name is read by the code visible in this notebook.
counter, interval = 0, 20

def add_debian_release(release_name, release_data):
    """Add one Debian release to the ``Debian`` class via the module-level Weaviate ``client``.

    Args:
        release_name: Name of the release; accepted but not read here.
        release_data: Mapping from feature label (e.g. ``'Release Date'``) to
            its value. Every label listed below must be present, otherwise
            ``KeyError`` is raised.
    """
    # (Weaviate property name, label in release_data), in insertion order.
    field_map = (
        ('Release_Date', 'Release Date'),
        ('End_Of_Life', 'End Of Life'),
        ('Price', 'Price (US$)'),
        ('Image_Size', 'Image Size (MB)'),
        ('Free_Download', 'Free Download'),
        ('Installation', 'Installation'),
        ('Default_Desktop', 'Default Desktop'),
        ('Package_Management', 'Package Management'),
        ('Release_Model', 'Release Model'),
        ('Office_Suite', 'Office Suite'),
        ('Processor_Architecture', 'Processor Architecture'),
        ('Init_Software', 'Init Software'),
        ('Journaled_File_Systems', 'Journaled File Systems'),
        ('Multilingual', 'Multilingual'),
        ('Asian_Language_Support', 'Asian Language Support'),
        ('Number_of_Packages', 'Number of Packages'),
    )
    properties = {prop: release_data[label] for prop, label in field_map}

    # NOTE(review): the batch is configured and its context entered once per
    # object — presumably that sends each object on its own; confirm against
    # the weaviate client documentation.
    client.batch.configure(batch_size=100)
    with client.batch as batch:
        # When bringing your own vectors, pass them here as `vector=...`.
        batch.add_data_object(data_object=properties, class_name='Debian')


# Announce the import before doing the work, so the message matches what is
# actually happening.
print('Importing Debian releases into Weaviate...')

# Decode the JSON explicitly as UTF-8 rather than with the platform default.
with open('data/debian.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

for release_name, release_data in data.items():
    add_debian_release(release_name, release_data)

# Send anything that is still queued in the client's batch.
client.batch.flush()
print('Finished importing Debian releases.')


{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vector: API Key: no api key found neither in request header: X-Openai-Api-Key nor in environment variable under OPENAI_APIKEY'}]}
{'error': [{'message': 'update vec



In [72]:
# Verify the import by asking Weaviate how many objects the "Distributions"
# class holds. Failures are reported with their cause instead of being
# swallowed by a bare `except`.
try:
    response = (
        client.query
        .aggregate("Distributions")
        .with_meta_count()
        .do()
    )
    total_results = response["data"]["Aggregate"]["Distributions"][0]["meta"]["count"]
    if total_results > 0:
        print("Data has been successfully imported.")
    else:
        print("No data found in Weaviate.")
except Exception as exc:
    print(f"Weaviate query failed: {exc!r}")


Weaviate.


In [56]:
# Calls delete_all() on the client's schema.
# NOTE(review): presumably drops every class together with its stored
# objects — confirm before running against data you want to keep.
client.schema.delete_all()

In [71]:
from weaviate.util import generate_uuid5

# Read the homepage table and turn each row into a {"name", "description"}
# record for the "Distributions" class.
data = pd.read_csv('Homepage.csv')
class_name = "Distributions"

data_objs = [
    {"name": data['Name'][label], "description": data['Description'][label]}
    for label in data.index
]

# client.batch.configure(batch_size=100)

# Queue every record on the client's batch, deriving each UUID from the
# record itself; a failure on one record is reported and the loop continues.
with client.batch as batch:
    for data_obj in data_objs:
        try:
            batch.add_data_object(
                data_object=data_obj,
                class_name=class_name,
                uuid=generate_uuid5(data_obj),
            )
        except Exception as e:
            print(f"Error adding object with name {data_obj['name']}. Error: {e}")
        else:
            print(f"Added object with name: {data_obj['name']}")

Added object with name: MX Linux
Added object with name: Mint
Added object with name: EndeavourOS
Added object with name: Debian
Added object with name: Manjaro
Added object with name: Ubuntu
Added object with name: Pop!_OS
Added object with name: Fedora
Added object with name: openSUSE
Added object with name: Lite
Added object with name: Zorin
Added object with name: Garuda
Added object with name: KDE neon
Added object with name: elementary
Added object with name: antiX
Added object with name: Mageia
Added object with name: Kali
Added object with name: Puppy
Added object with name: AlmaLinux
Added object with name: Nobara
Added object with name: PCLinuxOS
Added object with name: Vanilla
Added object with name: SparkyLinux
Added object with name: EasyOS
Added object with name: ArcoLinux
Added object with name: Alpine
Added object with name: Q4OS
Added object with name: FreeBSD
Added object with name: NixOS
Added object with name: CachyOS
Added object with name: Peppermint
Added object 