# JSON + text-based search engine (minsearch) + OpenAI API

In [None]:
#!pip install openai python-dotenv tqdm requests beautifulsoup4
#!pip install --upgrade pinecone

# Import packages and modules

In [None]:
import minsearch
import json
from openai import OpenAI  # OpenAI API client
import pinecone  # Pinecone client library
# print(pinecone.__version__)  # Print Pinecone version (optional)
from pinecone import Pinecone, ServerlessSpec  # For setup and serverless config
import hashlib  # For hashing (e.g., file IDs)
import os  # OS operations and env vars
from datetime import datetime  # Date and time handling
from tqdm import tqdm  # Progress bars (e.g., looping through files)
import requests  # HTTP requests (e.g., for web scraping or API calls)
import re  # Regular expressions (e.g., for pattern matching in strings)
from bs4 import BeautifulSoup  # HTML parsing
from dotenv import load_dotenv  # Load .env file

# Load API Keys

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read both API keys from the environment; each is None when the variable is unset.
openai_key = os.environ.get("OPENAI_API_KEY")
pinecone_key = os.environ.get("PINECONE_API_KEY")

In [None]:
# OpenAI client, authenticated with the key read from the environment.
client = OpenAI(api_key=openai_key)

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_key)

# Load JSON File

In [None]:
# Read the raw course data from rk.json.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the load
# does not depend on the platform's locale default, which can mis-decode or
# reject non-ASCII characters.
with open('rk.json', 'rt', encoding='utf-8') as rk_in:
    docs_raw = json.load(rk_in)
# Bare expression so the notebook displays the loaded structure.
docs_raw

In [None]:
def _as_text(value):
    """Coerce a raw JSON field value into a plain string.

    None becomes "", a list is joined with single spaces, and anything else
    goes through str().
    """
    if value is None:
        return ""
    if isinstance(value, list):
        # Stringify every item so a list holding numbers does not make
        # str.join raise TypeError; None items are skipped, mirroring how a
        # None field becomes "".
        return " ".join(str(item) for item in value if item is not None)
    return str(value)


# Flatten the per-course structure into one list of documents, tagging each
# document with the course it belongs to.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # NOTE: documents are updated in place, so docs_raw sees these changes too.
        doc['course'] = course_dict['course']

        # Normalize the text fields to strings; a missing field becomes "".
        for field in ("question", "text", "section"):
            doc[field] = _as_text(doc.get(field))

        documents.append(doc)


In [None]:
# Display the first flattened document as a quick sanity check of its fields.
documents[0]

# Minsearch

In [None]:
# Create the minsearch index: "question", "text" and "section" are registered
# as text fields and "course" as a keyword field.
# Presumably text fields are matched as free text and keyword fields are used
# for exact-match filtering (search() filters on 'course') — confirm against
# the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [None]:
# Fit the index on the flattened documents so they can be searched.
index.fit(documents)

# Functions

In [None]:
def search(question, course='rechnerkommunikation-preparation-guide', num_results=3):
    """Query the module-level minsearch ``index`` for documents matching a question.

    Args:
        question: Query text passed to ``index.search``.
        course: Value the 'course' keyword field is filtered on. Defaults to
            the course that was previously hard-coded here.
        num_results: Maximum number of results requested from the index.
            Defaults to the previously hard-coded 3.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` iterates it and
        reads the 'section', 'question' and 'text' keys of each item.
    """
    # Boost values handed to the index: 'question' at 3.0 and 'section' at 0.5.
    # 'text' is not listed (presumably it keeps the library's default weight —
    # confirm against the minsearch documentation).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=question,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
def build_prompt(question, search_results):
    """Assemble the LLM prompt from a question and the retrieved documents.

    Args:
        question: The user's question, inserted into the prompt as-is.
        search_results: Iterable of documents; each must provide the
            'section', 'question' and 'text' keys.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # The template is spelled out line by line; note that the "CONTEXT: "
    # line deliberately keeps its trailing space.
    template_lines = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the rk json file.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    )
    template = "\n".join(template_lines)

    # One entry per retrieved document, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=question, context="".join(entries))
    return filled.strip()

In [None]:
def llm(prompt, model='gpt-4o'):
    """Send a prompt to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single 'user' message.
        model: Name of the chat model to use. Defaults to 'gpt-4o', the value
            that was previously hard-coded here.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [None]:
# Sample question that is passed to rag() further down.
question = 'what to do before the semester?'

In [None]:
def rag(question):
    """Answer a question by chaining retrieval, prompt building and the LLM call.

    Args:
        question: The user's question.

    Returns:
        The value returned by ``llm`` for the prompt built from the question
        and the documents returned by ``search``.
    """
    retrieved = search(question)
    return llm(build_prompt(question, retrieved))

In [None]:
# Run the full pipeline on the sample question; the notebook displays the answer.
rag(question)