In [1]:
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Optional
from datasets import Features, Sequence, Value, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
from pyserini.search import FaissSearcher, LuceneSearcher
from pyserini.search.faiss import AutoQueryEncoder

from src.hyde import Promptor, OpenAIGenerator, CohereGenerator, HyDE, AlpacaGenerator


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
  warn(msg)


In [3]:
import pandas as pd
dataset = load_dataset(
        "csv", data_files=["/workspace/alpaca-lora/dataset_fischer.csv"], split="train", delimiter="\t", column_names=["title", "text"]
)

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-b6b3f4ad325a31dd/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [130]:
import numpy as np

def split_text(text: str, n=1024, character=" ") -> List[str]:
    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]


def split_documents(documents: dict) -> dict:
    """Split documents into passages"""
    titles, texts = [], []
    for title, text in zip(documents["title"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(passage)
    for i in range(len(texts)):
        try:
            lines = texts[i].split("\n")
            name = "Prodcut name: " +lines[0]
            header = "Header: "+lines[1]
            desc = "Description: "+lines[2]
            rest = "\n".join(lines[3:])
            texts[i] = "\n".join([name, header, desc, rest])
        except:
            print(texts[i])
            continue
    return {"title": titles, "text": texts}

def encode(encoder, q):
    all_emb_c = []
    for c in [q]:
        c_emb = encoder.encode(c)
        all_emb_c.append(np.array(c_emb))
    all_emb_c = np.array(all_emb_c)
    avg_emb_c = np.mean(all_emb_c, axis=0)
    hyde_vector = avg_emb_c.reshape((1, len(avg_emb_c)))
    return hyde_vector

def embed(documents: dict, encoder) -> dict:
    """Compute the DPR embeddings of document passages"""
    embeddings = []
    texts = documents['text']
    for doc in texts:
        embeddings.append(encode(encoder, doc))
    embeddings = np.vstack(embeddings)
    return {"embeddings": embeddings}

In [131]:
dataset = dataset.map(split_documents, batched=True)

                                                   

text




In [None]:
promptor = Promptor('web search')
generator = AlpacaGenerator('decapoda-research/llama-7b-hf', 'tloen/alpaca-lora-7b', 'cuda')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
Loading checkpoint shards: 100%|██████████| 33/33 [00:23<00:00,  1.43it/s]


In [7]:
encoder = AutoQueryEncoder(encoder_dir='facebook/contriever', pooling='mean', device='cuda')


In [132]:
# hyde = HyDE(promptor, generator, encoder, searcher)

In [133]:
new_features = Features(
    {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
)  # optional, save as float32 instead of float64 to save space
dataset = dataset.map(
    partial(embed, encoder=encoder),
    batched=True,
    batch_size=16,
    features=new_features,
)

                                                             

In [134]:
dataset[1]['text'].split('\n')

['Prodcut name: Prodcut name: DuoPower',
 'Header: Header: The duo of power and intelligence',
 'Description: Description: The fischer DuoPower is an intelligent 2-component plug with three product functions. The intelligent universal plug is suitable for fixing in all building materials. This allows a variety of applications with just one plug. The fischer DuoPower adjusts itself automatically to the building material and transfers the highest loads through the three product functions of folding, expanding and knotting. The very good feedback from the plug when screwing the screw in generates extra security. The fischer DuoPower 6 x 50, 8 x 65 and 10 x 80, because of the larger anchorage depth, are particularly suitable for fixings in hollow building materials, aerated concrete and for bridge plaster.',
 'Applications: TV consoles,Lighting,Shelves,Mirror cabinets,Letter boxes,Pictures,Fixing blinds,Curtain rails,Wash basin fixings,Plumbing and heating fixings,Bath and toilet installat

In [135]:
dataset.save_to_disk("passages")

                                                                                            

In [136]:
import faiss

In [137]:
index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
dataset.add_faiss_index("embeddings", custom_index=index)

100%|██████████| 1/1 [00:00<00:00,  7.48it/s]


Dataset({
    features: ['text', 'title', 'embeddings'],
    num_rows: 486
})

In [138]:
import os
index_path = os.path.join("fischer_db")
dataset.get_index("embeddings").save(index_path)

In [139]:
hyde = HyDE(promptor, generator, encoder, None)

In [140]:
query = 'I need a screw with a trumpet shape head, a coarse thread and a needle tip'

In [182]:
WEB_SEARCH = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n"


SCIFACT = """Please write a scientific paper passage to support/refute the claim.
Claim: {}
Passage:"""


ARGUANA = """Please write a counter argument for the passage.
Passage: {}
Counter Argument:"""


TREC_COVID = """Please write a scientific paper passage to answer the question.
Question: {}
Passage:"""


FIQA = """Please write a financial article passage to answer the question.
Question: {}
Passage:"""


DBPEDIA_ENTITY = """Please write a passage to answer the question.
Question: {}
Passage:"""


TREC_NEWS = """Please write a news passage about the topic.
Topic: {}
Passage:"""


MR_TYDI = """Please write a passage in {} to answer the question in detail.
Question: {}
Passage:"""


class Promptor:
    def __init__(self, task: str, language: str = 'en'):
        self.task = task
        self.language = language
    
    def build_prompt(self, query: str):
        if self.task == 'web search':
            return WEB_SEARCH.format(query)
        elif self.task == 'scifact':
            return SCIFACT.format(query)
        elif self.task == 'arguana':
            return ARGUANA.format(query)
        elif self.task == 'trec-covid':
            return TREC_COVID.format(query)
        elif self.task == 'fiqa':
            return FIQA.format(query)
        elif self.task == 'dbpedia-entity':
            return DBPEDIA_ENTITY.format(query)
        elif self.task == 'trec-news':
            return TREC_NEWS.format(query)
        elif self.task == 'mr-tydi':
            return MR_TYDI.format(self.language, query)
        else:
            raise ValueError('Task not supported')


In [183]:
promptor = Promptor('web search')
hyde.promptor = promptor

In [187]:
"Please write a passage to answer the question.\nQuestion:"+query

'Please write a passage to answer the question.\nQuestion:Have you got a trumpet head screw with an extra fine thread?'

In [188]:
prompt = hyde.prompt("Please write a passage to answer the question.\nQuestion:"+query)
print(prompt)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Please write a passage to answer the question.
Question:Have you got a trumpet head screw with an extra fine thread?

### Response:



In [189]:

import json
import os.path as osp
from typing import Union


class PrompterAlpaca(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()

prompter_alpaca = PrompterAlpaca("/workspace/alpaca-lora/templates/alpaca")

In [190]:
query = "Have you got a trumpet head screw with an extra fine thread?"

In [193]:
hypothesis_documents = hyde.generate("Please write a passage to answer the question.\nQuestion:"+query)
hypothesis_documents = [doc.split('Passage:')[1].split("Question")[0].strip() for doc in hypothesis_documents]

In [195]:
prompter_alpaca.get_response(hypothesis_documents[0])

'Yes, I have a trumpet head screw with an extra fine thread. It is made of stainless steel and has a thread pitch of 0.015 inches.'

In [165]:
hyde.generator.n = 1

In [166]:
hypothesis_documents = hyde.generate("Please write a passage to answer the question.\nQuestion:"+query)
hypothesis_documents = [prompter_alpaca.get_response(doc) for doc in hypothesis_documents]
hypothesis_documents = list(set(hypothesis_documents))
hyde_vector = hyde.encode(query, hypothesis_documents)
orig_vector = hyde.encode(query, [])
hits_hyde = index.search(hyde_vector, k=2)[1][0].tolist()
hits_orig = index.search(orig_vector, k=2)[1][0].tolist()
hits = list(set(hits_orig + hits_hyde))
retrieved_documents = [
    dataset[int(hit)]['text'] for hit in hits
]

input_context = "\n".join(retrieved_documents)
instruction = "Answer the following question: {}".format(query)
prompt_alpaca = prompter_alpaca.generate_prompt(instruction, input_context)
output = hyde.generator.generate(prompt_alpaca)


In [167]:
chat_out = prompter_alpaca.get_response(output[0])

In [168]:
chat_out

'Yes, I have got a trumpet head screw with an extra fine thread.'

In [161]:
hypothesis_documents

['The chipboard screw should be used for working with hardwood without pre-drilling.\nThe chipboard screw should be used for working with hardwood without pre-drilling.']

In [61]:
prompt_alpaca + chat_out + "\nQ: I want to " + "\nContinue discussion:"

'Below is an question that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nQ: I am looking for a full fixing set when it comes to free-standing toilets and bidets. Do you sell such products? \n\n### Input:\nSanitary fixing set S 8 D 70 WCR\nComplete fixing sets for free-standing toilets and bidets\nThe fischer fixing set S 8 D 70 WCR contains all elements required for the safe fixing of free-standing toilets: Two fischer plugs S 8 made from high-quality nylon, two stainless steel screws 6 x 70 with hexagonal head, two cover caps in chrome and white, and two snap-fit sleeves. The set is suitable for pre-positioned and push-through installation.\nApplications: Free-standing toilets,Bidets\nAdvantages: The complete fixing set including stainless steel screws allows for quick and easy installation.,A pronounced rim of the fixing ring prevents contact between the screw and ceramics, thus en

In [38]:
hits_hyde = index.search(hyde_vector, k=1)[1][0].tolist()

In [153]:
retrieved_documents

['Prodcut name: Prodcut name: Sanitary fixing set S 8 RD WCR\nHeader: Header: Complete fixing sets for free-standing toilets and bidets\nDescription: Description: The fischer toilet fixing set S 8 RD 80 contains all elements required for the safe fixing of free-standing toilets: Two fischer plugs S 8 with shaft and pronounced rim made from high-quality nylon, two stainless steel screws 6 x 85 with hexagonal head and two cover caps in chrome and white. The set is suitable for push-through installation.\nApplications: Free-standing toilets,Bidets\nAdvantages: The complete fixing set including stainless steel screws allows for quick and easy installation.,A pronounced rim prevents contact between the screw and ceramics, thus ensuring nothing gets damaged during fixing.\nMaterials: Free-standing toilets,Bidets',
 'Prodcut name: Prodcut name: DuoSeal\nHeader: Header: The sealing plug for wet areas\nDescription: Description: The fischer DuoSeal is a 2-component plug which seals the drill hol

array([388, 392, 391, 389, 390])

['Pipe clip RC\nThe convenient pipe fixing\nThe fischer pipe clip RC is an easy-to-mount solution for pipe fixings. The fischer SF plus RC is ideal for fixing plastic insulation pipes in concrete and solid building materials. Two additional pipe clips can be added to the sides of a pre-fixed pipe clip. This saves time and money, and increases flexibility. The pipe clip is secured in the drill hole with the fischer ClipFix plus SD or the fischer Hammerfix N 6. In the case of mounting with the fischer ClipFix plus SD, no further screw is required for the fixing. The plastic pipes are inserted and secured by the pre-tensioning of the pipe clip. The high-quality nylon of the pipe clip provides additional safety.\nApplications: Flexible and rigid plastic insulating pipes\nAdvantages: The pipe clip RC can be used with pre-installed clip fixing SD, with Hammerfix N 6 or in 11 mm C-shaped profile-rails, and thus allows for a flexible and cost-effective installation.,The 6 mm-long hole allows f

{'text': 'Drywall screw FSN-TPR\nThe gypsum plasterboard screws with trumpet shape head, coarse thread, needle tip and PH cross drive',
 'title': 'Drywall screw FSN-TPR',
 'embeddings': [-0.022631892934441566,
  0.08117809891700745,
  -0.016862154006958008,
  -0.027642248198390007,
  0.011176199652254581,
  -0.020339298993349075,
  -0.0024805713910609484,
  0.018878331407904625,
  0.01418533455580473,
  0.0007856976008042693,
  -0.09008126705884933,
  -0.09138631820678711,
  -0.060276955366134644,
  0.024773133918642998,
  -0.0006657866179011762,
  -0.04839674010872841,
  0.002140309428796172,
  -0.004338051192462444,
  -0.009142357856035233,
  -0.03598582744598389,
  -0.04379086568951607,
  0.013270639814436436,
  -0.05980702117085457,
  -0.03862391412258148,
  0.06030304357409477,
  -0.17161838710308075,
  -0.04694158956408501,
  -0.024428686127066612,
  -0.03977556899189949,
  -0.05521037057042122,
  -0.03523493558168411,
  -0.03125155717134476,
  -0.14630326628684998,
  -0.04164431

### Build Zeroshot Prompt

Please write a passage to answer the question.
Question: how long does it take to remove wisdom tooth
Passage:


### Generate Hypothesis Documents

HyDE Generated Document: 0
There is no one-size-fits-all answer to this question, as the time it takes to remove a wisdom tooth can vary depending on the individual case. In general, however, the procedure usually takes around 30 minutes to an hour to complete.
HyDE Generated Document: 1
It generally takes around 30 to 45 minutes to remove a wisdom tooth. However, the time may vary depending on the position of the tooth and the amount of work required.
HyDE Generated Document: 2
It usually takes around 30 to 45 minutes to remove a wisdom tooth. However, the length of time may vary depending on the individual case.
HyDE Generated Document: 3
It can take anywhere from a few days to a few weeks to remove a wisdom tooth. The length of time will depend on the individual case and the severity of the tooth.
HyDE Generated Document: 4
The length of time it takes to remove a wisdom tooth varies depending on the tooth's position and the amount of bone surrounding it. The procedure can take anywh

### Encode HyDE vector

(1, 768)


### Search Relevant Documents

HyDE Retrieved Document: 0
4174313
The time it takes to remove the tooth will vary. Some procedures only take a few minutes, whereas others can take 20 minutes or longer. After your wisdom teeth have been removed, you may experience swelling and discomfort, both on the inside and outside of your mouth.This is usually worse for the first three days, but it can last for up to two weeks. Read more about how a wisdom tooth is removed and recovering from wisdom tooth removal.he time it takes to remove the tooth will vary. Some procedures only take a few minutes, whereas others can take 20 minutes or longer. After your wisdom teeth have been removed, you may experience swelling and discomfort, both on the inside and outside of your mouth.
HyDE Retrieved Document: 1
18103
Before having your wisdom teeth removed, you'll be given an injection of local anaesthetic to numb the tooth and surrounding area. If you're particularly anxious about the procedure, your dentist or surgeon may give you a se

### End to End Search

e2e search will directly go through all the steps descripted above.

In [42]:
hits = hyde.e2e_search(query, k=10)
for i, hit in enumerate(hits):
    print(f'HyDE Retrieved Document: {i}')
    print(hit.docid)
    print(json.loads(corpus.doc(hit.docid).raw())['contents'])

HyDE Retrieved Document: 0
4174313
The time it takes to remove the tooth will vary. Some procedures only take a few minutes, whereas others can take 20 minutes or longer. After your wisdom teeth have been removed, you may experience swelling and discomfort, both on the inside and outside of your mouth.This is usually worse for the first three days, but it can last for up to two weeks. Read more about how a wisdom tooth is removed and recovering from wisdom tooth removal.he time it takes to remove the tooth will vary. Some procedures only take a few minutes, whereas others can take 20 minutes or longer. After your wisdom teeth have been removed, you may experience swelling and discomfort, both on the inside and outside of your mouth.
HyDE Retrieved Document: 1
91493
The time it takes to remove the tooth will vary. Some procedures only take a few minutes, whereas others can take 20 minutes or longer. After your wisdom teeth have been removed, you may experience swelling and discomfort, b