In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('notebooks/claims.csv')

In [3]:
df.columns = df.columns.str.lower()
df

Unnamed: 0,category,question,answer,section
0,Manage existing benefit,How do I update my benefit information?,You can update your benefit information online...,general claim benefits
1,Manage existing benefit,What if my circumstances change?,Report changes in circumstances immediately to...,general claim benefits
2,Manage existing benefit,Can I appeal a decision?,Yes you can appeal within one month of the dec...,general claim benefits
3,Manage existing benefit,How can I report a change in income?,Report changes in income using the online portal.,general claim benefits
4,Manage existing benefit,How do I stop claiming benefits?,You can stop claiming by contacting your benef...,general claim benefits
...,...,...,...,...
420,Sodium Valproate Claims,What are the legal implications of prescribing...,Prescribing sodium valproate during pregnancy ...,nhs claim benefits
421,Sodium Valproate Claims,Can fathers of children affected by sodium val...,Yes fathers can also claim compensation if the...,nhs claim benefits
422,Sodium Valproate Claims,What role do support groups play in sodium val...,Support groups provide resources information a...,nhs claim benefits
423,Sodium Valproate Claims,Can you claim for future care needs due to sod...,Yes future care needs including specialized ed...,nhs claim benefits


In [4]:
documents = df.to_dict(orient='records')

In [7]:
documents[0]

{'category': 'Manage existing benefit',
 'question': 'How do I update my benefit information?',
 'answer': 'You can update your benefit information online through your account.',
 'section': 'general claim benefits'}

In [6]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
len(model.encode("Getting size of model dim"))

384

In [8]:
sections = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["category"]).tolist()
    sections.append(doc)

In [17]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "question": {"type": "text"},
            "category": {"type": "keyword"},
            "text_vector": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"},
        }
    }
}
index_name = "uk-benefit-claims"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'uk-benefit-claims'})

In [18]:
for doc in sections:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [23]:
search_term = "How do I stop claiming benefits?"
vector_search_term = model.encode(search_term)

In [24]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [25]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'uk-benefit-claims',
  '_id': '6fJ9sZEBxoG-TnjP9m3j',
  '_score': 0.745834,
  '_source': {'question': 'How do I update my benefit information?'}},
 {'_index': 'uk-benefit-claims',
  '_id': '6vJ9sZEBxoG-TnjP920E',
  '_score': 0.745834,
  '_source': {'question': 'What if my circumstances change?'}},
 {'_index': 'uk-benefit-claims',
  '_id': '6_J9sZEBxoG-TnjP920c',
  '_score': 0.745834,
  '_source': {'question': 'Can I appeal a decision?'}},
 {'_index': 'uk-benefit-claims',
  '_id': '7PJ9sZEBxoG-TnjP920y',
  '_score': 0.745834,
  '_source': {'question': 'How can I report a change in income?'}},
 {'_index': 'uk-benefit-claims',
  '_id': '7fJ9sZEBxoG-TnjP921Q',
  '_score': 0.745834,
  '_source': {'question': 'How do I stop claiming benefits?'}}]

In [28]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}
response = es_client.search(
    index=index_name,
    query={
        "match": {"category": "Manage existing benefit"},
    },
    knn=knn_query,
    size=5
)
response["hits"]["hits"]

[{'_index': 'uk-benefit-claims',
  '_id': '6fJ9sZEBxoG-TnjP9m3j',
  '_score': 2.81046,
  '_source': {'category': 'Manage existing benefit',
   'question': 'How do I update my benefit information?',
   'answer': 'You can update your benefit information online through your account.',
   'text_vector': [-0.06169872730970383,
    0.06927156448364258,
    0.016122009605169296,
    0.0037233280017971992,
    0.03128599375486374,
    0.08697997033596039,
    0.028020402416586876,
    0.01912151463329792,
    -0.038937151432037354,
    -0.006217021960765123,
    -0.007982592098414898,
    0.056966401636600494,
    0.013437407091259956,
    -0.0007534001488238573,
    0.03850725293159485,
    0.029142413288354874,
    -0.021272001788020134,
    0.08256879448890686,
    -0.029913274571299553,
    -0.048439715057611465,
    -0.11138635873794556,
    -0.06767214089632034,
    -0.056599799543619156,
    0.017560778185725212,
    -0.006038482300937176,
    -0.012538759037852287,
    -0.0508014112710