In [13]:
from typing import Dict

import torch
import numpy as np
import glob
import pandas as pd

from os import path
from transformers import AutoModel, AutoTokenizer
from sentence_transformers.util import cos_sim



In [54]:
def split_string(input_string, chunk_size=1800):
    return [input_string[i:i + chunk_size] for i in range(0, len(input_string), chunk_size)]

In [55]:
def read_texts():

    texts, metadatas, results, file_names = [], [], [], []
    for file_path in glob.glob('data/*.txt'):

        file_name, _ = path.splitext(path.basename(file_path))
        data = dict(param.split('=') for param in file_name.split('&'))

        with open(file_path) as file:
            text = file.read()
            for splitted in split_string(text):
                data['text'] = splitted
                file_names.append(file_name)
                results.append(data)

    return file_names, results

In [56]:
file_names, results = read_texts()
df = pd.DataFrame.from_records(results, index=file_names)


# For retrieval you need to pass this prompt. Please find our more in our blog post.
def transform_query(query: str) -> str:
    """ For retrieval, add the prompt for query (not for documents).
    """
    return f'Represent this sentence for searching relevant passages: {query}'


# The model works really well with cls pooling (default) but also with mean pooling.
def pooling(outputs: torch.Tensor, inputs: Dict,  strategy: str = 'cls') -> np.ndarray:
    if strategy == 'cls':
        outputs = outputs[:, 0]
    elif strategy == 'mean':
        outputs = torch.sum(
            outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"])
    else:
        raise NotImplementedError
    return outputs.detach().cpu().numpy()

In [57]:
query = 'What are the incentives for installing solar on residential buildings? Single family homeowner in Santa Clara county with single filer'

In [None]:
# 1. load model
model_id = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

docs = [f'Represent this sentence for searching relevant passages: {query}'] + df['text'][:500].to_list()


# 2. encode
inputs = tokenizer(docs, padding=True, return_tensors='pt')
for k, v in inputs.items():
    inputs[k] = v
outputs = model(**inputs).last_hidden_state
embeddings = pooling(outputs, inputs, 'cls')
#df['embeddings'] = embeddings

similarities = cos_sim(embeddings[0], embeddings[1:])
#print('similarities:', similarities)

In [52]:
similarities

tensor([[0.4787, 0.4787, 0.4787, 0.4787, 0.4787, 0.4787, 0.4787, 0.4787, 0.4787,
         0.3461]])