In [1]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
import faiss
import torch
import os
import numpy as np
from collections import OrderedDict

In [3]:

def read_text_files(directory_path: str) -> list[str]:
	"""
	Reads all .txt files in the specified directory, stores each file's content as a single string,
	and appends it to a main list.
	
	:return: a list where each element is the content of a single text file.
	"""
	file_contents = []
	
	for filename in os.listdir(directory_path):
		if filename.endswith('.txt'):
			file_path = os.path.join(directory_path, filename)
			with open(file_path, 'r', encoding='utf-8') as file:
				content = file.read()
				file_contents.append(content)
	
	return file_contents


In [4]:
text_chunks = read_text_files('data')

In [5]:

# load embedding and generation models
embedder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = GPT2LMHeadModel.from_pretrained("gpt2")

def create_embeddings(text_chunks):
	chunk_embeddings = []
	
	for i, chunk in enumerate(text_chunks):

		try:
			# tokenize with truncation
			inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
			
			# generate embeddings
			with torch.no_grad():
				embedding = embedder(**inputs).last_hidden_state.mean(dim=1).numpy()
			chunk_embeddings.append(embedding)
		
		except IndexError as e:
			print(f"Error at chunk {i}: {e}")
		except Exception as e:
			print(f"Unexpected error at chunk {i}: {e}")
	
	return chunk_embeddings

embeddings = create_embeddings(text_chunks)

Error at chunk 0: index out of range in self
Error at chunk 1: index out of range in self
Error at chunk 2: index out of range in self
Error at chunk 3: index out of range in self
Error at chunk 5: index out of range in self
Error at chunk 6: index out of range in self
Error at chunk 7: index out of range in self
Error at chunk 8: index out of range in self
Error at chunk 9: index out of range in self
Error at chunk 10: index out of range in self
Error at chunk 11: index out of range in self
Error at chunk 12: index out of range in self


In [6]:
# set up FAISS
dimension = embeddings[0].shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.vstack(embeddings))

In [7]:
def gen_answer(query: str):
	
	# query workflow
	query_embedding = embedder(**tokenizer(query, return_tensors="pt")).last_hidden_state.mean(dim=1).detach().numpy()
	distances, indices = index.search(query_embedding, k=3)  # retrieve top-k relevant chunks
	tokenizer.add_special_tokens({"pad_token": "--"})
	context = " ".join([text_chunks[i] for i in indices[0]])
	input_text = f"Context: {context}\n\nQuery: {query}\nAnswer:"
	inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
	attention_mask = inputs['attention_mask']
	
	# generate output with attention mask
	output = generator.generate(inputs['input_ids'], attention_mask=attention_mask, max_length=1_000)
	answer = tokenizer.decode(output[0], skip_special_tokens=True)
	answer = answer.split(sep='Answer:')[1] # remove context
	print("\n".join(list(OrderedDict.fromkeys(answer.split("\n"))))) # remove duplicate answers


In [8]:
gen_answer("How much money can i get from a canceled flight?")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 The airline will give you a written notice of your EU air passenger rights.

If you are not able to get compensation, you can ask the airline to refund your flight.
If you are unable to get compensation, you can ask the airline to refund your flight.
If you are


In [9]:
gen_answer("How long until i get money for a delayed flight?")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 The airline will provide you with a written notice of your EU air passenger rights.

If you are not able to get your money within the time limit, you can request a refund from the airline.
If you are unable to get your money within the time limit, you can request a refund from the airline.
If you are unable to get your money within the
