In [2]:
from util import load_eidc_data
from sentence_transformers import SentenceTransformer

# Load the catalogue metadata file through the project helper. Later cells
# read the 'description' and 'title' columns of the returned frame.
df = load_eidc_data.load_as_df('data/catalogue_metadata.json')

In [3]:
# Sentence-transformer checkpoints to compare against each other.
model_names = ['paraphrase-MiniLM-L3-v2', 'all-MiniLM-L6-v2', 'multi-qa-MiniLM-L6-cos-v1']
embedding_models = {}  # model name -> loaded SentenceTransformer
embeddings = {}        # model name -> embeddings of every dataset description

# encode() is documented to take a string or a list of strings. Hand it a
# plain list rather than the pandas Series so that any positional access
# inside the library cannot be confused by the DataFrame's index labels.
# The list is built once because it is the same for every model.
descriptions = df['description'].tolist()

for model_name in model_names:
    print(f'Building model for {model_name}...')
    embedding_models[model_name] = SentenceTransformer(model_name)
    print('Creating embeddings...')
    embeddings[model_name] = embedding_models[model_name].encode(descriptions)


Building model for paraphrase-MiniLM-L3-v2...




Creating embeddings...
Building model for all-MiniLM-L6-v2...
Creating embeddings...
Building model for multi-qa-MiniLM-L6-cos-v1...
Creating embeddings...


In [4]:
from scipy import spatial
import numpy as np


def get_top_n_datasets(query_embedding, datasets, dataset_embeddings, n):
    """Return the ``n`` rows of ``datasets`` most similar to a query embedding.

    Similarity is the cosine similarity between ``query_embedding`` and each
    row of ``dataset_embeddings``. Row ``i`` of the embeddings is matched to
    row ``i`` of ``datasets`` by position (``.iloc``), not by index label.

    Args:
        query_embedding: 1-D array-like embedding of the query.
        datasets: pandas object supporting ``.iloc`` with one row per embedding.
        dataset_embeddings: 2-D array-like, one embedding per row of
            ``datasets``.
        n: Maximum number of rows to return.

    Returns:
        The selected rows of ``datasets``, most similar first. Fewer than
        ``n`` rows are returned when fewer embeddings are supplied. Rows whose
        similarity is undefined (zero-length embedding) are ranked last.
    """
    query = np.asarray(query_embedding, dtype=float).ravel()
    doc_matrix = np.asarray(dataset_embeddings, dtype=float)
    if doc_matrix.size == 0:
        # Nothing to rank: return an empty selection of the same type.
        return datasets.iloc[:0]

    # Vectorised cosine similarity: one matrix-vector product instead of a
    # Python-level loop over every document embedding.
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query)
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity_scores = (doc_matrix @ query) / norms

    # A zero-length embedding yields 0/0 = NaN. argsort places NaN last, so
    # after reversing it would be reported as the *best* match; push such
    # rows to the bottom of the ranking instead.
    similarity_scores = np.where(np.isnan(similarity_scores), -np.inf, similarity_scores)

    top_positions = np.argsort(similarity_scores)[::-1][:n]
    return datasets.iloc[top_positions]

In [5]:
import pandas as pd

# Natural-language queries used to compare the models' retrieval results.
questions = [
    'Where is the wettest soil in the UK?',
    'Where is water quality worst in the UK?',
    'Where are bird populations declining in the UK?',
    'Where in the UK are bumblebees most at risk from neonicotinoids?',
    'Which county in the UK has the most rivers?',
]

# For each question, print a table with one column per model holding the
# titles of that model's three best-matching datasets, best match first.
for q in questions:
    print(q)
    titles_by_model = {}
    for model_name in model_names:
        question_embedding = embedding_models[model_name].encode(q)
        result = get_top_n_datasets(question_embedding, df, embeddings[model_name], 3)
        # Renumber rows 0..k-1 so every model's titles line up by rank.
        result = result.reset_index(drop=True)
        titles_by_model[model_name] = result['title']
    results = pd.DataFrame(titles_by_model, columns=model_names)
    print(results.to_markdown())

Where is the wettest soil in the UK?


|    | paraphrase-MiniLM-L3-v2                                                 | all-MiniLM-L6-v2                                                                                | multi-qa-MiniLM-L6-cos-v1                                                                                                                   |
|---:|:------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | Soil physico-chemical properties from eight UK agricultural sites, 2022 | Topsoil physico-chemical properties from the UKCEH Countryside Survey, Great Britain, 2020, v2  | Hydraulic and hydrological data from surface and subsurface soils across the Thames catchment, UK, 2021                                     |
|  1 | Soil physico-chemical properti