### Indexing
- Clean and extract text
- Segment text into chunks
- Encode these chunks into vectors
- Store vectors in databases

In [1]:
# Imports
import faiss
import numpy as np
import textract
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer and encoder that turn text into vectors.
# Both are created from the same checkpoint name so they stay in sync.
ENCODER_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(ENCODER_CHECKPOINT)
model = AutoModel.from_pretrained(ENCODER_CHECKPOINT)

def encode_text(text):
    """Embed `text` by mean-pooling the encoder's last hidden states.

    Args:
        text: Text to encode. The tokenizer truncates input to at most
            512 tokens, so anything beyond that is ignored.

    Returns:
        A numpy array obtained by averaging `last_hidden_state` over
        dim 1. For a single input string this is expected to be one row of
        `model.config.hidden_size` values, which is the width the FAISS
        index in this notebook is built with.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: disable autograd so no computation graph is recorded
    # (and kept alive) for every chunk that gets encoded.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()




In [3]:
# Extract text from a PDF (sample source)
PDF_PATH = "./data/deep-learning.pdf"
CHUNK_SIZE = 500  # number of characters per chunk

text = textract.process(PDF_PATH, method="pdfminer").decode()

# Segment the text into consecutive, non-overlapping windows of CHUNK_SIZE characters
chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
if not chunks:
    # Fail with a clear message instead of an IndexError further down
    raise ValueError(f"No text could be extracted from {PDF_PATH}")

# Preview the second chunk when there is one; fall back to the first for short documents
sample_chunk = chunks[1] if len(chunks) > 1 else chunks[0]
print('Sample chunk:')
print(sample_chunk)


Sample chunk:
atrices
2.3
. . . . . . . . . . . . . . . . . . . .
2.4
Linear Dependence and Span . . . . . . . . . . . . . . . . . . . .
Norms . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
2.5
Special Kinds of Matrices and Vectors
2.6
. . . . . . . . . . . . . . .
2.7
Eigendecomposition . . . . . . . . . . . . . . . . . . . . . . . . . .
Singular Value Decomposition . . . . . . . . . . . . . . . . . . . .
2.8
The Moore-Penrose Pseudoinverse . . . . . . . . . . . . . . . . . .
2.9
2.10 The 


In [4]:
# Encoding and indexing
# Flat L2-distance index (chosen for simplicity), sized to the encoder's hidden width
dim = model.config.hidden_size
index = faiss.IndexFlatL2(dim)

# Encode the chunks one by one, in order, appending each vector to the index
# so that row ids in the index line up with positions in `chunks`
for vec in map(encode_text, chunks):
    index.add(vec)

# Persist the populated index to disk
faiss.write_index(index, "./data/store-chatgpt.faiss")

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


### Handling retrieval
- Encode the user query
- Compute similarity scores between the query vector and document vectors
- Retrieve the top K similar chunks

In [10]:
# Handling retrieval
def retrieve(query, k=5):
    """Return the chunks closest to `query` together with their L2 distances.

    Args:
        query: Question text; it is embedded with `encode_text`.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A tuple `(matched_chunks, distances)`: a list of entries from the
        notebook-level `chunks` list and the matching slice of the distance
        array returned by FAISS. Fewer than `k` items are returned when the
        index holds fewer than `k` vectors.
    """
    # The index is re-read from disk on every call; in a real system a
    # vector database would take the place of this file.
    index = faiss.read_index('./data/store-chatgpt.faiss')
    query_vec = encode_text(query)
    D, I = index.search(query_vec, k)

    # FAISS pads missing results with label -1. Without this filter,
    # chunks[-1] would silently return the last chunk as a bogus match.
    found = I[0] >= 0
    return [chunks[i] for i in I[0][found]], D[0][found]

# Example: fetch the chunks nearest to a sample question and show them
# together with their distances
query = "What is RAG in AI?"
retrieved_chunks, distances = retrieve(query)
print("Retrieved chunks:")
print(retrieved_chunks, distances)

Retrieved chunks:
['orld. For example, Cyc failed to understand a story\nabout a person named Fred shaving in the morning (\n). Its inference\nengine detected an inconsistency in the story: it knew that people do not have\nelectrical parts, but because Fred was holding an electric razor, it believed the\nentity “FredWhileShaving” contained electrical parts. It therefore asked whether\nFred was still a person while he was shaving.\n\nLinde 1992\n\n,\n\nThe diﬃculties faced by systems relying on hard-coded knowledge suggest\nthat', 'ge and acquiring knowledge can be done via learning,\nwhich has motivated the development of large-scale deep architectures. However,\nthere are diﬀerent kinds of knowledge. Some knowledge can be implicit, sub-\nconscious, and diﬃcult to verbalize—such as how to walk, or how a dog looks\ndiﬀerent from a cat. Other knowledge can be explicit, declarative, and relatively\nstraightforward to put into words—every day commonsense knowledge, like “a cat\nis a kind o

### Generation
- Combine the query and retrieved texts into a coherent prompt
- Generate a response using the language model

In [11]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably where OPEN_AI_KEY, read further down via os.getenv, is defined — confirm).
load_dotenv()

def generate_response_with_chatgpt(query, retrieved_chunks, api_key,
                                   chat_model="gpt-3.5-turbo",
                                   file_path='./data/chat-gpt-response.txt'):
    """Ask the chat model to answer `query` using the retrieved chunks as context.

    The prompt is the question followed by the chunks joined with spaces.
    The generated answer is written to `file_path` (overwriting any previous
    content) and also returned.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of text chunks to supply as context.
        api_key: OpenAI API key passed to the client. It is never printed.
        chat_model: Name of the chat completion model to call.
        file_path: Where the generated text is saved.

    Returns:
        The generated text; an empty string if the model returned no content.
    """
    prompt = f"Question: {query}\nContext: " + " ".join(retrieved_chunks)

    # SECURITY: do not print or log the API key. Notebook outputs are saved
    # with the file and are easily shared or committed.
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=chat_model,
    )

    # `content` may be None; normalise so the write below cannot fail
    generated_text = response.choices[0].message.content or ""

    # Saving the response. Only create a directory when the path has one:
    # os.makedirs('') raises for a bare filename.
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(generated_text)

    return generated_text

# Run the pipeline end to end: read the key from the environment, retrieve
# context for the question, then generate and save the answer
api_key = os.environ.get('OPEN_AI_KEY')
query = "Explain the concept of back propagation in neural networks"
retrieved_chunks, _ = retrieve(query)
response = generate_response_with_chatgpt(
    query,
    retrieved_chunks=retrieved_chunks,
    api_key=api_key,
)

print('Saved response in text file: ./data/chat-gpt-response.txt')

OpenAIKey: 
[REDACTED: an API key was printed here; revoke and rotate it, and clear this output before sharing]
Saved response in text file: ./data/chat-gpt-response.txt
