In [1]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
import faiss
import torch
import os
import numpy as np
from collections import OrderedDict

In [2]:

def read_text_files(directory_path: str) -> list[str]:
	"""
	Reads all .txt files located directly in the specified directory and returns
	their contents, one string per file.

	Files are visited in sorted filename order so that the returned list (and any
	positional index derived from it) is the same on every run; `os.listdir` on
	its own returns entries in arbitrary order. Entries that end in '.txt' but
	are not regular files (e.g. a directory named 'x.txt') are skipped instead of
	making `open` fail.

	:param directory_path: directory to scan (sub-directories are not searched).
	:return: a list where each element is the content of a single text file.
	:raises FileNotFoundError: if `directory_path` does not exist.
	:raises UnicodeDecodeError: if a file cannot be decoded as UTF-8.
	"""
	file_contents = []

	for filename in sorted(os.listdir(directory_path)):
		file_path = os.path.join(directory_path, filename)
		if not filename.endswith('.txt') or not os.path.isfile(file_path):
			continue
		with open(file_path, 'r', encoding='utf-8') as file:
			file_contents.append(file.read())

	return file_contents


In [3]:
# one string per .txt file found in the local "data" directory
text_chunks = read_text_files(directory_path='data')

In [4]:

# Load the tokenizer plus the embedding and generation models (all GPT-2 checkpoints).
# The bare model supplies hidden states for embeddings; the LM-head model generates text.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
embedder = AutoModel.from_pretrained("gpt2") # noted alternative: sentence-transformers/all-MiniLM-L6-v2
generator = GPT2LMHeadModel.from_pretrained("gpt2")

def create_embeddings(text_chunks):
	"""
	Embeds every chunk with the module-level `tokenizer` and `embedder`.

	:param text_chunks: iterable of strings to embed.
	:return: a list with one tensor per chunk, in input order; each tensor is the
		embedder's last hidden state averaged over dimension 1.
	"""
	def embed_one(text):
		# truncation keeps over-long chunks within the tokenizer's length limit
		encoded = tokenizer(text, return_tensors="pt", truncation=True)
		# inference only, so skip gradient tracking
		with torch.no_grad():
			return embedder(**encoded).last_hidden_state.mean(dim=1)

	return [embed_one(text) for text in text_chunks]

# one embedding tensor per entry of text_chunks, in the same order
embeddings = create_embeddings(text_chunks=text_chunks)

In [5]:
# Set up FAISS: a flat L2 index sized to the embedding width, filled with all
# chunk embeddings stacked row-wise (row i corresponds to text_chunks[i]).
dimension = embeddings[0].shape[1]
embedding_matrix = np.vstack(embeddings)
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

In [6]:
def gen_answer(query: str, k: int = 3, max_length: int = 1_000, min_answer_tokens: int = 128):
	"""
	Answers `query` from the indexed text chunks and prints the answer.

	Workflow: embed the query, retrieve the `k` nearest chunks from the FAISS
	`index`, build a "Context / Query / Answer:" prompt and let `generator`
	continue it. Uses the module-level `tokenizer`, `embedder`, `generator`,
	`index` and `text_chunks`.

	:param query: the user question.
	:param k: number of chunks to retrieve as context.
	:param max_length: cap on prompt + generated tokens handed to `generate`;
		keep it within the generator's context window.
	:param min_answer_tokens: tokens kept free for the answer. The retrieved
		context is truncated to leave this room; the question itself is never cut.
	:raises ValueError: if the question leaves no room for any context.
	:return: None; the answer is printed with repeated lines removed.
	"""
	# embed the query the same way the chunks were embedded (mean over dimension 1);
	# no_grad avoids building an autograd graph for pure inference
	with torch.no_grad():
		query_inputs = tokenizer(query, return_tensors="pt", truncation=True)
		query_embedding = embedder(**query_inputs).last_hidden_state.mean(dim=1).numpy()

	# retrieve top-k relevant chunks; FAISS reports missing neighbours as -1, which
	# would otherwise silently select the last chunk through negative indexing
	distances, indices = index.search(query_embedding, k=k)
	context = " ".join(text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks))

	# tokenize the question part on its own so that truncation can only shorten the
	# context and never drops the "Query: ... Answer:" suffix the generator must continue
	question_ids = tokenizer(f"\n\nQuery: {query}\nAnswer:", return_tensors="pt")["input_ids"]
	context_budget = max_length - min_answer_tokens - question_ids.shape[1]
	if context_budget < 1:
		raise ValueError("query is too long to leave room for context within max_length")
	context_ids = tokenizer(
		f"Context: {context}", return_tensors="pt", truncation=True, max_length=context_budget
	)["input_ids"]
	input_ids = torch.cat([context_ids, question_ids], dim=1)
	# a single unpadded sequence: every position is a real token
	attention_mask = torch.ones_like(input_ids)

	# pad_token_id is passed explicitly instead of registering a made-up pad token on the
	# shared tokenizer (that mutated global state and made decoding strip the token's text)
	output = generator.generate(
		input_ids,
		attention_mask=attention_mask,
		max_length=max_length,
		pad_token_id=tokenizer.eos_token_id,
	)

	# decode only the newly generated tokens; splitting the full text on "Answer: " breaks
	# when the continuation does not start with a space or the context contains that marker
	answer = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True).strip()
	print("\n".join(OrderedDict.fromkeys(answer.split("\n")))) # remove duplicate answers


In [7]:
# example query: compensation for a cancelled flight
gen_answer(query="How much money can i get from a canceled flight?")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The airline will give you a written notice of the cancellation of your flight.

If you are unable to get the airline to give you a written notice, you can request a refund of the amount you paid for the item.
If you are unable to get the


In [8]:
# example query: delay threshold for compensation
gen_answer(query="How long must a flight be delayed before i get compensation?")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The airline must provide you with a choice between:

    reimbursement of your ticket and, if you have a connecting flight, a return flight to the airport of departure at the earliest opportunity
The airline must offer you, on a oneoff basis, a choice between:
    the reimbursement of your ticket and, if you have a connecting flight, a return flight to the airport of departure at the earliest opportunity
    the reimbursement of your ticket and, if you have a connecting flight,
