# Imports

In [2]:
import pandas as pd

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

import numpy as np
import requests
import json
import re
import time
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup as bs4

from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import semantic_search


import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# Constants

In [3]:
BASELINK_DATAPORTAL = "https://opendata.swiss/de/dataset/"
CKAN_API_LINK = (
    "https://ckan.opendata.swiss/api/3/action/current_package_list_with_resources"
)

LANGUAGE = "de"

In [4]:
semantic_cols = [
    "title.de",
    "title.fr",
    "title.it",
    "title.en",
    "description.de",
    "description.fr",
    "description.it",
    "description.en",
    "tags",
    "keywords.de",
    "keywords.fr",
    "keywords.it",
    "keywords.en",
]

final_cols = semantic_cols + ["id", "organization.title", "organization.url"]

# Functions

In [135]:
def get_full_package_list(limit=500, sleep=2):
    """Get full package list from CKAN API"""
    offset = 0
    frames = []
    while True:
        print(f"{offset} packages retrieved.")
        url = CKAN_API_LINK + f"?limit={limit}&offset={offset}"
        res = requests.get(url)
        data = json.loads(res.content)
        if data["result"] == []:
            break
        data = pd.DataFrame(pd.json_normalize(data["result"]))
        frames.append(data)
        offset += limit
        time.sleep(sleep)
    data = pd.concat(frames)
    data = data.reset_index(drop=True)
    return data


def clean_features(data):
    """Clean various features"""
    # Reduce publisher data to name.
    data.publisher = data.publisher.apply(lambda x: json.loads(x)["name"])

    # Reduce tags to tag names.
    data.tags = data.tags.apply(lambda x: [tag["name"] for tag in x])

    # Replace empty urls with NA message.
    data[data["organization.url"] == ""]["organization.url"] = "None provided"

    # Remove HTML tags from description.
    data[f"description.{LANGUAGE}"] = data[f"description.{LANGUAGE}"].apply(
        lambda x: bs4(x, "html.parser").text
    )

    # Strip whitespace from title.
    data[f"title.{LANGUAGE}"] = data[f"title.{LANGUAGE}"].map(lambda x: x.strip())

    return data

# Create starter code files and README

**Get all metadata packages from CKAN API**

In [136]:
# all_packages = get_full_package_list()
# all_packages.to_parquet(f"metadata_240314.parq", engine="fastparquet")

**Reload data from disk and clean**

In [5]:
df = pd.read_parquet(f"metadata_240314.parq", engine="fastparquet")

# Reduce tags to text
df.tags = df.tags.apply(lambda x: x[0]["name"] if x not in [None, []] else "")

# Reduce keywords to text
for feature in ["keywords.fr", "keywords.de", "keywords.en", "keywords.it"]:
    df[feature] = df[feature].apply(
        lambda x: ", ".join(x) if x not in [None, []] else ""
    )

# Reduce all text features to one column
df["joined"] = df[semantic_cols].apply(lambda row: " ".join(row.astype(str)), axis=1)

# Create link from base link and name
df["link"] = df.name.apply(lambda x: BASELINK_DATAPORTAL + x)

In [6]:
def clean_text(text):
    """Clean text"""
    # Remove line breaks \n \r etc
    text = re.sub(r"\s+", " ", text)
    # Remove square and round brackets
    text = re.sub(r"\[.*?\]", " ", text)
    text = re.sub(r"\(.*?\)", " ", text)
    # Remove " and '
    text = text.replace('"', " ")
    text = text.replace("'", " ")
    # Replace hypens with space
    text = text.replace("-", " ")
    text = text.replace("–", " ")
    # Remove punctuation but keep dots
    text = re.sub(r"[^\w\s.]", "", text)
    # Remove multiple spaces
    text = re.sub(" +", " ", text)
    # Remove leading and trailing whitespace
    text = text.strip()
    return text


df.joined = df.joined.apply(clean_text)

In [7]:
empties = df[df["title.de"] == ""].index
df.loc[empties, "title.de"] = df.loc[empties, "title.en"]

empties = df[df["title.de"] == ""].index
df.loc[empties, "title.de"] = df.loc[empties, "title.fr"]

empties = df[df["title.de"] == ""].index
df.loc[empties, "title.de"] = df.loc[empties, "title.it"]

empties = df[df["title.de"] == ""].index
len(empties)

0

In [8]:
df.to_parquet(f"metadata_240314_cleaned.parq", engine="fastparquet")

In [9]:
df.head(1).T

Unnamed: 0,0
owner_org,ec2593fa-d844-4d20-9851-e3c2b24eff89
maintainer,"Bundesamt für Gesundheit, Sektion Datenmanagem..."
issued,2024-03-14T00:00:00
title_for_slug,Qualitätsindikatoren der Schweizer Akutspitäle...
qualified_relations,[]
private,False
maintainer_email,KUV-DMS@bag.admin.ch
num_tags,32
contact_points,"[{'name': 'Bundesamt für Gesundheit, Sektion D..."
temporals,"[{'start_date': '2022-01-01T00:00:00', 'end_da..."


# Embed texts

In [10]:
df = pd.read_parquet(f"metadata_240314_cleaned.parq", engine="fastparquet")

In [15]:
# model_path = "all-MiniLM-L6-v2"
# model_path = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# model_path = "distiluse-base-multilingual-cased-v2"
# model_path = "jinaai/jina-embeddings-v2-base-de"
# model = SentenceTransformer(model_path, device="mps")

# Encoding with the Nomic model without a CUDA device takes very long.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5", 
    trust_remote_code=True, 
    device="mps" # Only for Mac with Apple Silicon, otherwise remove this line
)
print(f"{model.max_seq_length}")

embeddings = model.encode(
    df.joined.values,
    batch_size=128,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Save embeddings with numpy.save
np.save("embeddings_nomic.npy", embeddings)

<All keys matched successfully>


8192


In [16]:
with open("embeddings_nomic.npy", "rb") as f:
    embeddings = np.load(f)

In [20]:
search_term = "velo"
query_embedding = model.encode(
    search_term, show_progress_bar=True, device="mps", normalize_embeddings=True
)
hits = util.semantic_search(query_embedding, embeddings, top_k=50)
hits = hits[0]
for hit in hits:
    print(
        df.iloc[hit["corpus_id"]]["title.de"],
        "(Score: {:.4f})".format(hit["score"], df.iloc[hit["corpus_id"]]["link"]),
    )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Velonetz (WFS) (Score: 0.6761)
Velonetz (WMS) (Score: 0.6761)
Velo (Score: 0.6554)
Velo-Fahrverbote (allgemein oder temporär) (Score: 0.6376)
Velopumpen (Score: 0.6372)
Velostrassen (Score: 0.6332)
Velo-Themen (Score: 0.6317)
Velo-Schlauchautomaten (Score: 0.6294)
Velo-Einbahnstrassen und -Gefahrenstellen (Score: 0.6276)
Teilrichtplan Velo (Score: 0.6273)
Velopumpen (Score: 0.6234)
Steile Velo-Strecken (Score: 0.6158)
Veloplan Stadt St.Gallen (Score: 0.6146)
Velonetz (Score: 0.6145)
Velopumpen - Standort und Informationen (Score: 0.6141)
Veloabstellplätze (Score: 0.6089)
Veloverleih (Score: 0.6057)
Velopumpenstandorte (Score: 0.6047)
Velo- und Motorradabstellplätze in der Stadt St.Gallen (Score: 0.6004)
Touristische Velorouten (Score: 0.5981)
Velopumpen - Standort und Informationen (WMS) (Score: 0.5972)
Velopumpen - Standort und Informationen (WFS) (Score: 0.5972)
Velostadtplan (Score: 0.5933)
Verkehrsreiche Strassen (50 km/h oder mehr) (Score: 0.5890)
Velopumpen (Score: 0.5828)
Velopr

# Call CKAN search endpoint

In [None]:
def call_ckan(query, limit=100):
    """Call CKAN API to search for query"""
    url = f"https://opendata.swiss/api/3/action/package_search?q={query}&rows={limit}"
    res = requests.get(url)
    data = json.loads(res.content)
    return data["result"]["results"]

In [None]:
query = "Bevölkerung"
url = f"https://ckan.opendata.swiss/api/3/action/package_search?q={query}&facet.limit=100&rows=100"

In [None]:
results[0]

{'owner_org': 'bc3fde05-b0bf-4035-a098-a2d6f89d47c3',
 'maintainer': 'stata@bs.ch',
 'groups': [{'display_name': {'fr': 'Population et société',
    'de': 'Bevölkerung und Gesellschaft',
    'en': 'Population and society',
    'it': 'Popolazione e società'},
   'description': {'fr': '', 'de': '', 'en': '', 'it': ''},
   'title': '{"de": "Bevölkerung und Gesellschaft", "en": "Population and society", "fr": "Population et société", "it": "Popolazione e società"}',
   'image_display_url': '',
   'id': 'd569c3de-4b8d-4de0-8f0e-9ac4c17f9003',
   'name': 'soci'},
  {'display_name': {'fr': 'Éducation, culture et sport',
    'de': 'Bildung, Kultur und Sport',
    'en': 'Education, culture and sport',
    'it': 'Istruzione, cultura e sport'},
   'description': {'fr': '', 'de': '', 'en': '', 'it': ''},
   'title': '{"de": "Bildung, Kultur und Sport", "en": "Education, culture and sport", "fr": "Éducation, culture et sport", "it": "Istruzione, cultura e sport"}',
   'image_display_url': '',
   'i