In [7]:
import numpy as np
import os
os.chdir('d:\\GyanPrakashKushwaha\\TDS\\On-My-Own')
from embed import get_chunks, get_embeddings
from config import GEMINI_API_KEY

In [2]:
discourse_embeddings = np.load('embeddings\discourse_embeddings.npz')
markdown_embeddings = np.load('embeddings\markdown_embeddings.npz')

In [3]:
discourse_embeddings, markdown_embeddings

(NpzFile 'embeddings\\discourse_embeddings.npz' with keys: chunks, embeddings, original_urls,
 NpzFile 'embeddings\\markdown_embeddings.npz' with keys: chunks, embeddings, original_urls)

In [11]:
discourse_embeddings['embeddings'][0][0],'\n',markdown_embeddings['chunks'][0] 

(array([ 0.0306072 , -0.01333968,  0.01606282, ...,  0.01087242,
         0.00144763, -0.00074216], shape=(3072,)),
 '\n',
 array(['Development Tools NOTE : The tools in this module are PRE-REQUISITES for the course. You would have used most of these before. If most of this is new to you, please take this course later. Some tools are fundamental to data science because they are industry standards and widely used by data science professionals. Mastering these tools will align you with current best practices and making you more adaptable in a fast-evolving industry. The tools we cover here are not just popular, they’re the core technology behind most of today’s data science and software development.'],
       dtype='<U7895'))

In [19]:
def find_similar_content(query_embedding, MAX_SIMILAR_TEXT, discourse_data, markdown_data):
    results = []
    
    # Search discourse chunks
    print("Searching discourse chunks for similar content...")
    embeddings = discourse_data['embeddings']
    contents = discourse_data['chunks']
    urls = discourse_data['original_urls']
    
    for i, embedding in enumerate(embeddings):            
            similarity = cosine_similarity(query_embedding, embedding)
            if similarity >= 0.5:
                results.append({
                    "source": "discourse",
                    "url": urls[i] if i < len(urls) else "",
                    "contents": contents[i] if i < len(contents) else "",
                    "similarity": similarity
                })
    
    # Search markdown chunks
    print("Searching markdown chunks for similar contents...")
    embeddings = markdown_data['embeddings']
    contents = markdown_data['chunks']
    urls = markdown_data['original_urls']
    
    for i, embedding in enumerate(embeddings):            
            similarity = cosine_similarity(query_embedding, embedding)
            if similarity >= 0.5:
                results.append({
                    "source": "markdown",
                    "url": urls[i] if i < len(urls) else "",
                    "contents": contents[i] if i < len(contents) else "",
                    "similarity": similarity
                })
    
    # Sort by similarity and return top results
    results.sort(key=lambda x: x["similarity"], reverse=True)
    return results[:MAX_SIMILAR_TEXT]

def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2[0])
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product/(norm_vec1 * norm_vec2)

In [13]:
question = "How can I proceed with TDS course?"
embedding_response = await get_embeddings(question, GEMINI_API_KEY)

In [20]:
relevant_results = find_similar_content(embedding_response, 10, discourse_embeddings, markdown_embeddings)

Searching discourse chunks for similar content...
Searching markdown chunks for similar contents...


In [21]:
relevant_results

[{'source': 'discourse',
  'url': array(['https://discourse.onlinedegree.iitm.ac.in/t/drop-course-window-for-tds/164737'],
        dtype='<U147'),
  'contents': array(['I found that TDS is so much time engaging along with office work and also new comer in coding. Should i drop the course as of now? if Yes, then when will be the drop course window start?'],
        dtype='<U8000'),
  'similarity': np.float64(0.7632075082648546)},
 {'source': 'discourse',
  'url': array(['https://discourse.onlinedegree.iitm.ac.in/t/which-subject-to-choose-in-jan-term/161071'],
        dtype='<U147'),
  'contents': array(['Hii all…i know this may not be the correct platform to ask this question. Actually i have 3 diploma subjects to cover - MLP, JAVA, TDS. So which 2 subject should i choose for this term?Don’t take TDS, this subject should be taken in the end else you might have some problemTDS at the end best. Gives you the best chance of doing well in it. Kind regardsYes you should do tds after mlp only

In [23]:
import httpx
import asyncio

In [24]:
from google.genai import Client, types

In [29]:
async def generate_answer(API_KEY, url, question, relevant_results, max_retries=3):
    context = ""
    for result in relevant_results:
        source_type = "Discourse post" if result["source"] == "discourse" else "Documentation"
        context += f"\n\n{source_type} (URL: {result['url']}):\n{result['contents'][:1500]}"
    
    prompt = f"""Answer the following question based ONLY on the provided context. 
    If you cannot answer the question based on the context, say "I don't have enough information to answer this question."

    Context:
    {context}

    Question: {question}

    Return your response in this exact format:
    1. A comprehensive yet concise answer
    2. A "Sources:" section that lists the URLs and relevant text snippets you used to answer

    Sources must be in this exact format:
    Sources:
    1. URL: [exact_url_1], Text: [brief quote or description]
    2. URL: [exact_url_2], Text: [brief quote or description]

    Make sure the URLs are copied exactly from the context without any changes.
    """
    
    client = Client(api_key=API_KEY)

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        config=types.GenerateContentConfig(
            system_instruction= 'You are a helpful assistant that provides accurate answers based only on the provided context. Always include sources in your response with exact URLs.'),
        contents=prompt
    )
    
    return response.text

In [31]:
ans = await generate_answer(GEMINI_API_KEY, '', question, relevant_results)

In [32]:
ans

'To proceed with the TDS course, consider the following:\n\n1.  **Timing:** It is recommended to take TDS after completing MLP (Machine Learning Programming) or towards the end of your diploma subjects, as this might give you the best chance of doing well. Some students have noted that additional programming skills beyond Python are required.\n2.  **Course Content and Resources:** All course content, including recorded videos, course calendars, and GAs (Graded Assignments), are available on the TDS course site. You should diligently read the course introduction as it contains very important information.\n3.  **Engage with Learning Materials:** While video lectures are the primary content, live sessions are highlighted as the most important resource and are highly recommended. Make sure to practice the Graded Assignments properly.\n\nSources:\n1.  URL: https://discourse.onlinedegree.iitm.ac.in/t/which-subject-to-choose-in-jan-term/161071, Text: "Don’t take TDS, this subject should be ta

In [33]:
import re
def parse_llm_response(response):
    try:
        # First try to split by "Sources:" heading
        parts = response.split("Sources:", 1)
        
        # If that doesn't work, try alternative formats
        if len(parts) == 1:
            # Try other possible headings
            for heading in ["Source:", "References:", "Reference:"]:
                if heading in response:
                    parts = response.split(heading, 1)
                    break
        
        answer = parts[0].strip()
        links = []
        
        if len(parts) > 1:
            sources_text = parts[1].strip()
            source_lines = sources_text.split("\n")
            
            for line in source_lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Remove list markers (1., 2., -, etc.)
                line = re.sub(r'^\d+\.\s*', '', line)
                line = re.sub(r'^-\s*', '', line)
                
                # Extract URL and text using more flexible patterns
                url_match = re.search(r'URL:\s*\[(.*?)\]|url:\s*\[(.*?)\]|\[(http[^\]]+)\]|URL:\s*(http\S+)|url:\s*(http\S+)|(http\S+)', line, re.IGNORECASE)
                text_match = re.search(r'Text:\s*\[(.*?)\]|text:\s*\[(.*?)\]|[""](.*?)[""]|Text:\s*"(.*?)"|text:\s*"(.*?)"', line, re.IGNORECASE)
                
                if url_match:
                    # Find the first non-None group from the regex match
                    url = next((g for g in url_match.groups() if g), "")
                    url = url.strip()
                    
                    # Default text if no match
                    text = "Source reference"
                    
                    # If we found a text match, use it
                    if text_match:
                        # Find the first non-None group from the regex match
                        text_value = next((g for g in text_match.groups() if g), "")
                        if text_value:
                            text = text_value.strip()
                    
                    # Only add if we have a valid URL
                    if url and url.startswith("http"):
                        links.append({"url": url, "text": text})
        
        return {"answer": answer, "links": links}
    except Exception as e:
        error_msg = f"Error parsing LLM response: {e}"
        # Return a basic response structure with the error
        return {
            "answer": "Error parsing the response from the language model.",
            "links": []
        }

In [36]:
print(parse_llm_response(ans))

{'answer': 'To proceed with the TDS course, consider the following:\n\n1.  **Timing:** It is recommended to take TDS after completing MLP (Machine Learning Programming) or towards the end of your diploma subjects, as this might give you the best chance of doing well. Some students have noted that additional programming skills beyond Python are required.\n2.  **Course Content and Resources:** All course content, including recorded videos, course calendars, and GAs (Graded Assignments), are available on the TDS course site. You should diligently read the course introduction as it contains very important information.\n3.  **Engage with Learning Materials:** While video lectures are the primary content, live sessions are highlighted as the most important resource and are highly recommended. Make sure to practice the Graded Assignments properly.', 'links': [{'url': 'https://discourse.onlinedegree.iitm.ac.in/t/which-subject-to-choose-in-jan-term/161071,', 'text': 'Don’t take TDS, this subjec

In [None]:
# {'answer': 'To proceed with the TDS course, consider the following:\n\n1.  **Timing:** It is recommended to take TDS after completing MLP (Machine Learning Programming) or towards the end of your diploma subjects, as this might give you the best chance of doing well. Some students have noted that additional programming skills beyond Python are required.\n2.  **Course Content and Resources:** All course content, including recorded videos, course calendars, and GAs (Graded Assignments), are available on the TDS course site. You should diligently read the course introduction as it contains very important information.\n3.  **Engage with Learning Materials:** While video lectures are the primary content, live sessions are highlighted as the most important resource and are highly recommended. Make sure to practice the Graded Assignments properly.', 'links': [{'url': 'https://discourse.onlinedegree.iitm.ac.in/t/which-subject-to-choose-in-jan-term/161071,', 'text': 'Don’t take TDS, this subject should be taken in the end else you might have some problem TDS at the end best. Gives you the best chance of doing well in it. Kind regards Yes you should do tds after mlp only it really help you alot trust me'}, {'url': 'https://discourse.onlinedegree.iitm.ac.in/t/tds-jan-2025-calender-accesss-and-video-lecture-access-in-portal/163158,', 'text': 'The course content is available at [Screenshot 2025-01-13 at 22.47.14] At Tools in Data Science the course calendar is also available. [Screenshot 2025-01-13 at 22.51.33] Likewise the GAs also are available through the links in the seek portal as well as on the course domain shared above. The course introduction has very important information. Many students who faced problems in Tools for Data Science in previous terms usually never read the intro. Please be diligent to read everything carefully.'}, {'url': 'https://discourse.onlinedegree.iitm.ac.in/t/issues-in-tds-and-replacement-with-another-course/164147,', 'text': 'Additional programming skills are required(in addition to Python) to follow the TDS Course. That means indirectly, one should complete many courses diploma in programming before doing TDS.'}, {'url': 'https://discourse.onlinedegree.iitm.ac.in/t/project-2-tds-solver-discussion-thread/169029,', 'text': 'In one of the orientation session, I heard Andrew sir say this (subject to correction): The video lectures are the primary content. It is not compulsory to attend the live sessions, but it is highly recommended. Here, the most important resource are the live sessions... I didn’t practice the GAs properly. So I failed ROE.'}, {'url': 'https://discourse.onlinedegree.iitm.ac.in/t/tds-module-6/168482,', 'text': 'All modules were released before the 10th of Jan on the TDS course site. https://tds.s-anand.net/#/'}]}

In [None]:
async def process_multimodal_query(question, image_base64):

    try:
        if not image_base64:
            logger.info("No image provided, processing as text-only query")
            return await get_embedding(question)
        
        logger.info("Processing multimodal query with image")
        # Call the GPT-4o Vision API to process the image and question
        url = "https://aipipe.org/openai/v1/chat/completions"
        headers = {
            "Authorization": API_KEY,
            "Content-Type": "application/json"
        }
        
        # Format the image for the API
        image_content = f"data:image/jpeg;base64,{image_base64}"
        
        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Look at this image and tell me what you see related to this question: {question}"},
                        {"type": "image_url", "image_url": {"url": image_content}}
                    ]
                }
            ]
        }