### Step 1: Document processing 


In [None]:
import textract
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# File the dense TF-IDF matrix is written to with np.save and read back from with np.load
vector_database_path = './data/modular_rag_vector_database.npy'
# Source PDF; used as the default input of load_and_process_document
raw_file_path = './data/deep-learning.pdf'

# Segment text using a sliding window approach
def segment_text(text, window_size=500, step=250):
    """Split ``text`` into overlapping character windows.

    Windows start every ``step`` characters and are ``window_size`` characters
    long. The last window may be shorter, so that the end of the text is always
    covered and a text shorter than one window still yields a single segment.

    Args:
        text: The string to segment.
        window_size: Maximum number of characters per segment (must be > 0).
        step: Distance between consecutive window starts (must be > 0).

    Returns:
        A list of segments in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive")

    segments = []
    for start in range(0, len(text), step):
        segments.append(text[start:start + window_size])
        # Once a window reaches the end of the text we are done: any later
        # window would only be a suffix of this one.
        if start + window_size >= len(text):
            break
    return segments

# Load data, mock storage
def load_and_process_document(file_path=raw_file_path):
    """Extract a document's text, vectorize its segments and persist the vectors.

    The text is extracted with textract (pdfminer backend), decoded as UTF-8,
    cut into segments by ``segment_text`` and fitted with a TF-IDF vectorizer.
    The resulting matrix is saved in dense form to ``vector_database_path``.

    Args:
        file_path: Path of the document to process.

    Returns:
        A ``(vectorizer, segments)`` tuple: the fitted vectorizer and the list
        of text segments it was fitted on.
    """
    raw_bytes = textract.process(file_path, method="pdfminer")
    segments = segment_text(raw_bytes.decode('utf-8'))

    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    dense_vectors = vectorizer.fit_transform(segments).toarray()

    # "Mock storage": a plain .npy file stands in for a vector database.
    np.save(vector_database_path, dense_vectors)
    return vectorizer, segments

### Step 2: Implement routing to optimize retrieval
- First, infer the intent of the query. We work with three classes: 'history', 'forecast' and 'reasoning'
- This involves training a model to infer the classes
- Another approach would be to use an LLM for the classification
- Once we can classify the query, we determine which action to take for each class. In this case, I am appending 
additional keywords to the query for better targeting in retrieval. Other options include switching databases or using fewer resources.
- More complex use cases may call for semantic routing instead of if statements

In [None]:
# Implementing an automated routing system
# First infer intent of query 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

import os

# Where the trained query classifier is stored
model_path = './model/query_classifier_model.pkl'

# Tiny labelled sample: one example query per intent class
data = [
    {"query": "What led to the adoption of CNNs?", "label": "history"},
    {"query": "Will GANs improve image synthesis in the future?", "label": "forecast"},
    {"query": "Why do dropout layers help reduce overfitting?", "label": "reasoning"},
    # Add more data for better accuracy
]

# Prepare dataset
texts = [item['query'] for item in data]
labels = [item['label'] for item in data]

# Split the data
# NOTE: with only one example per class, the held-out sample's label never
# reaches the training set, so the model cannot predict it. Add more data
# before relying on the classifier.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create ML pipeline
pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', LogisticRegression())])

# Train model
pipeline.fit(X_train, y_train)

# Other training stages such as testing, feedback and improvement apply
# Save the model
# Create the target directory first; joblib.dump does not create it and would
# otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

# Function to classify the query
# Return 0: history, 1:forecast, 2:reasoning
def classify_query(query, model):
    """Map a query to the integer code of its predicted intent.

    Args:
        query: The query string to classify.
        model: Object whose ``predict`` accepts a list of strings and returns
            a sequence of labels.

    Returns:
        0 for 'history', 1 for 'forecast', 2 for 'reasoning'.

    Raises:
        KeyError: If the predicted label is not one of the three intents.
    """
    intent_codes = {
        label: code
        for code, label in enumerate(('history', 'forecast', 'reasoning'))
    }
    predictions = model.predict([query])
    return intent_codes[predictions[0]]

def route_query(query, model):
    """Rewrite a query with intent-specific keywords to sharpen retrieval.

    The intent is obtained from ``classify_query`` and a short keyword prefix
    is added to the query depending on it.

    Args:
        query: The original user query.
        model: Classifier passed through to ``classify_query``.

    Returns:
        The query string prefixed with keywords for the detected intent.
    """
    intent = classify_query(query=query, model=model)

    if intent == 0:
        # The actions here are open.
        return f"Historical data for: {query}"
    elif intent == 1:
        return f"Future events {query}"
    else:
        # No "$" before the placeholder: that is JavaScript template syntax
        # and would put a literal "$" into the retrieval query.
        return f"Formulae, infer, calculate {query}"

### Step 3: Search vectors
- We rank the stored segment vectors by cosine similarity to the query and return the indices of the top five matches

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_vectors(query, vectorizer, filename=vector_database_path, k=5):
    """Return the indices of the stored vectors most similar to ``query``.

    Args:
        query: The query string to search for.
        vectorizer: Fitted vectorizer used to transform the query; it must be
            the one that produced the stored vectors.
        filename: Path of the ``.npy`` file holding the stored vectors.
        k: Maximum number of results to return (defaults to 5).

    Returns:
        An array of at most ``k`` row indices into the stored vectors, ordered
        from most to least similar by cosine similarity.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would silently return every row instead of none.
        raise ValueError("k must be a positive integer")

    vectors = np.load(filename)
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, vectors).flatten()
    # argsort is ascending: take the last k entries and reverse them so the
    # best match comes first.
    return scores.argsort()[-k:][::-1]

### Step 4: Generate response

In [None]:
from openai import OpenAI
import os

def generate_response(prompt, api_key, model_name="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat API and save the reply to disk.

    The reply is written to ``./data/modular-rag-chat-gpt-response.txt``
    (overwriting any previous content) and the file location is printed.

    Args:
        prompt: Text sent as a single user message.
        api_key: OpenAI API key used to create the client.
        model_name: Chat model to query (defaults to "gpt-3.5-turbo").

    Returns:
        The generated text; an empty string if the API returned no content.
    """
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model_name,
    )

    # The message content may be None; fall back to an empty string so that
    # writing the file below does not fail with a TypeError.
    generated_text = response.choices[0].message.content or ""

    # Saving the response
    file_path = './data/modular-rag-chat-gpt-response.txt'

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(generated_text)

    print('Response saved in: ')
    print(file_path)
    return generated_text

### Step 5: Putting it together

In [None]:
from dotenv import load_dotenv

# Load environment variables (including the API key) from a local .env file
load_dotenv()

file_path = './data/deep-learning.pdf'
api_key = os.getenv('OPEN_AI_KEY')
query = "Explain the concept of back propagation in neural networks"

# Process queries
model = joblib.load(model_path)
processed_query = route_query(query=query, model=model)
print('ProcessedQuery: ')
print(processed_query)

# Search vectors
# Keep the segments: search_vectors returns row indices, and the prompt needs
# the text those indices point to, not the numbers themselves.
vectorizer, segments = load_and_process_document(file_path=file_path)
top_indices = search_vectors(processed_query, vectorizer=vectorizer)
context = "\n\n".join(segments[index] for index in top_indices)

# Generate response
prompt = f"Question: {processed_query}\nContext: {context}"
response = generate_response(prompt=prompt, api_key=api_key)
print(response)