In [1]:
%pip install faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
import faiss
import torch
import os
import numpy as np

In [3]:

def read_text_files(directory_path):
	"""
	Reads all .txt files in the specified directory, stores each file's content as a single string,
	and appends it to a main list.
	
	Args:
	- directory_path (str): Path to the directory containing text files.
	
	Returns:
	- list of str: A list where each element is the content of a single text file.
	"""
	file_contents = []  # List to hold the contents of each file as a single string
	
	# Iterate through all files in the directory
	for filename in os.listdir(directory_path):
		# Check if the file is a .txt file
		if filename.endswith('.txt'):
			file_path = os.path.join(directory_path, filename)
			# Open and read the file's content
			with open(file_path, 'r', encoding='utf-8') as file:
				content = file.read()  # Read the entire content of the file
				file_contents.append(content)  # Append to the main list
	
	return file_contents


In [4]:
text_chunks = read_text_files('data')

In [5]:

# Load embedding and generation models
embedder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = GPT2LMHeadModel.from_pretrained("gpt2")

def create_embeddings(text_chunks):
	chunk_embeddings = []
	
	for i, chunk in enumerate(text_chunks):
		# Skip empty chunks
		if not chunk.strip():
			print(f"Skipping empty chunk at index {i}")
			continue
		
		try:
			# Tokenize with truncation
			inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
			
			# Generate embeddings
			with torch.no_grad():
				embedding = embedder(**inputs).last_hidden_state.mean(dim=1).numpy()
			chunk_embeddings.append(embedding)
		
		except IndexError as e:
			print(f"Error at chunk {i}: {e}")
		except Exception as e:
			print(f"Unexpected error at chunk {i}: {e}")
	
	return chunk_embeddings

# Example usage
chunk_embeddings = create_embeddings(text_chunks)

Error at chunk 0: index out of range in self
Error at chunk 1: index out of range in self
Error at chunk 2: index out of range in self
Error at chunk 3: index out of range in self
Error at chunk 5: index out of range in self
Error at chunk 6: index out of range in self
Error at chunk 7: index out of range in self
Error at chunk 8: index out of range in self
Error at chunk 9: index out of range in self
Error at chunk 10: index out of range in self
Error at chunk 11: index out of range in self
Error at chunk 12: index out of range in self


In [6]:
# Set up FAISS
dimension = chunk_embeddings[0].shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.vstack(chunk_embeddings))

In [7]:
def gen_answer(query: str):
	
	# Query workflow

	query_embedding = embedder(**tokenizer(query, return_tensors="pt")).last_hidden_state.mean(dim=1).detach().numpy()
	distances, indices = index.search(query_embedding, k=3)  # retrieve top-k relevant chunks
	tokenizer.add_special_tokens({"pad_token": "--"})
	context = " ".join([text_chunks[i] for i in indices[0]])
	input_text = f"Context: {context}\n\nQuery: {query}\nAnswer:"
	inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
	attention_mask = inputs['attention_mask']
	
	# Set pad_token_id to eos_token_id if it’s not already set
	if generator.config.pad_token_id is None:
		generator.config.pad_token_id = generator.config.eos_token_id

	# Generate output with attention mask
	output = generator.generate(inputs['input_ids'], attention_mask=attention_mask, max_length=1_000)
	answer = tokenizer.decode(output[0], skip_special_tokens=True)
	return answer


In [8]:
print(gen_answer("How much money can i get from a canceled flight"))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Context: Your flight was overbooked

If you have presented yourself on time for the check-in with a valid flight reservation and travel documentation and you're denied boarding due to overbooking or for operational reasons, and you don't voluntarily give up your seat, you are entitled to:

    compensation
    the right to choose between reimbursement, re-routing or rebooking at a later stage and
    assistance from the airline
 Delay

If your flight is delayed at departure, you have the right to assistance, to reimbursement and a return flight, depending on the duration of the delay and the distance of the flight.

If you arrived at your final destination with a delay of more than 3 hours, you are entitled to compensation, unless the delay was due to extraordinary circumstances. The airline has to prove this by providing, for example, extracts from logbooks or incident reports. The air carrier should give this evidence to the relevant national enforcement body as well as to the passen