<a href="https://colab.research.google.com/github/NaveenHashira/GAN/blob/main/RAG_RN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -Uq langchain-core langgraph langchain-groq pymongo

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
# importing necessary libraries
import os
from typing import TypedDict, List, Dict, Any, Optional,Annotated
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import StateGraph, START, END
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from langgraph.graph.message import add_messages
from pydantic import BaseModel, Field
from datetime import datetime
from pymongo import MongoClient
import json

In [9]:
# defining the state
class State(TypedDict):
    """Shared state passed between the pipeline steps in this notebook."""

    # MongoDB connection URI.
    server_uri: str
    # The user's natural-language question.
    question: str
    # Metadata extracted from the question (e.g. 'Companies', 'Names', 'Time period').
    metadata: Dict
    # MongoDB query filter built from the metadata.
    filter: Dict
    # Retrieved documents, one JSON string per document.
    context: List[str]
    # Conversation history; merged through the add_messages reducer.
    memory: Annotated[list, add_messages]
    # Final analysis text; generate() returns its result under this key.
    answer: str

In [3]:
# initialize llm
#model = ChatGroq(
    #model_name="llama3-8b-8192",
    #temperature=6,
    #api_key=groq_api_key
#)

In [None]:
# initializing state
def update_state_initial(mongodbServer: str, user_query: Optional[str] = None) -> "State":
    """
    Build a fresh pipeline state.

    Args:
        mongodbServer (str): MongoDB connection URI, stored as state['server_uri'].
        user_query (Optional[str]): Initial question; stored as an empty string
            when omitted so later steps can read state['question'] safely.

    Returns:
        State: A new state dict with every field initialised. The mutable
        fields (metadata, filter, context, memory) are new objects on every
        call, so separate states never share them.
    """
    state = {
        'server_uri': mongodbServer,
        'question': user_query if user_query is not None else '',
        'metadata': {},
        'filter': {},
        'context': [],
        'memory': [],
    }
    return state

In [14]:
# Extracting metadata from the user query
def _parse_metadata_response(metadata_str: str) -> dict:
    """Parse a ``- Key: value, value`` bullet list returned by the model into a dict."""
    metadata = {}
    current_key = None
    for raw_line in metadata_str.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('-') and ':' in line:
            # Split on the first colon only so values may themselves contain ':'.
            key, _, value = line.partition(':')
            key = key.strip('- ').strip()
            values = [part.strip() for part in value.split(',') if part.strip()]
            if len(values) > 1:
                metadata[key] = values
            else:
                # A single value is stored as a bare string, no value as ''.
                metadata[key] = values[0] if values else ''
            current_key = key
        elif current_key:
            # Any other non-empty line is treated as a continuation of the last key.
            if isinstance(metadata[current_key], list):
                metadata[current_key].append(line)
            else:
                metadata[current_key] += f" {line}"
    return metadata


def extract_metadata_update_state(state: "State", user_query: Optional[str] = None) -> "State":
    """
    Update state['question'] with a new user query (if given), extract
    metadata (e.g. companies, names, time periods) from the question using
    the language model, and store it in state['metadata'].

    The state is modified in place and also returned.

    Args:
        state (State): The state object to update.
        user_query (Optional[str]): New query to set as state['question'] (if provided).

    Returns:
        State: The same state object with 'metadata' populated. When the
        question is missing, empty, or only whitespace, 'metadata' is set to
        {} and the model is not called.
    """
    if user_query is not None:
        state['question'] = user_query

    question = state.get('question', '')

    # Nothing to extract from: skip the model call entirely.
    if not question or (isinstance(question, str) and not question.strip()):
        state['metadata'] = {}
        return state

    prompt = f"""
Extract relevant metadata (company names or user names, time periods) from the following questions:

Q: What were the revenue growth rates for Apple and Amazon from 2019 to 2022?
Metadata:
- Companies: Apple, Amazon
- Time period: 2019-2022

Q: How did Naveen and Kavin perform financially in Q1 2023?
Metadata:
- Names: Naveen , Kavin
- Time period: Q1 2023

Q: Which companies led the smartphone market in 2021 and 2022?
Metadata:
- Companies: smartphone market leaders (e.g., Samsung, Apple)
- Time period: 2021-2022

Q: {question}
Metadata:
"""

    # Calling the LLM (`model` is expected to be defined at module level).
    response = model.invoke([HumanMessage(content=prompt)])

    # The reply is assumed to follow the bullet format shown in the examples.
    state['metadata'] = _parse_metadata_response(response.content.strip())

    return state

In [21]:
def _as_value_list(value):
    """Normalise a metadata value (missing, single value, or list/tuple) to a list."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A single value (typically a str) must be wrapped, not iterated:
    # extending a list with 'Apple' would add the characters 'A', 'p', 'p', ...
    return [value]


def generate_metadata_filter(state: "State") -> "State":
    """
    Generates a MongoDB filter from state['metadata'] (with Companies, Names, Time period),
    updates state['filter'] with it, and returns the updated state.

    'Companies' and 'Names' are pooled and matched against both the 'company'
    and 'name' fields. Each metadata value may be a single string or a list of
    strings. A single time period is matched by equality; several time periods
    are matched with '$in'. When both kinds of criteria are present they are
    combined with '$and'; when none are present the filter is {}.

    Args:
        state (State): The state object containing 'metadata' to build the filter from.

    Returns:
        State: The same state object with 'filter' populated.
    """
    metadata = state.get('metadata') or {}

    # Collect all name/company values into one list
    name_company_values = (
        _as_value_list(metadata.get('Companies'))
        + _as_value_list(metadata.get('Names'))
    )
    time_periods = _as_value_list(metadata.get('Time period'))

    clauses = []
    if name_company_values:
        clauses.append({
            '$or': [
                {'company': {'$in': name_company_values}},
                {'name': {'$in': name_company_values}}
            ]
        })
    if len(time_periods) == 1:
        clauses.append({'time_period': time_periods[0]})
    elif time_periods:
        clauses.append({'time_period': {'$in': time_periods}})

    # Only wrap in '$and' when there is more than one clause to intersect.
    if not clauses:
        generated_filter = {}
    elif len(clauses) == 1:
        generated_filter = clauses[0]
    else:
        generated_filter = {'$and': clauses}

    # Update the state with the generated filter
    state['filter'] = generated_filter

    return state

# Demo: build a minimal state holding only 'metadata' and print the filter
# that generate_metadata_filter derives from it.
# NOTE: this assigns the module-level name `state`.
state = {
    'metadata': {
        'Companies': ['CompanyX', 'CompanyY'],
        'Names': ['Alice', 'Bob'],
        'Time period': '2023-Q2'
    }
}
# The function writes state['filter'] and returns the state it was given.
updated_state = generate_metadata_filter(state)
print(updated_state)


{'metadata': {'Companies': ['CompanyX', 'CompanyY'], 'Names': ['Alice', 'Bob'], 'Time period': '2023-Q2'}, 'filter': {'$and': [{'$or': [{'company': {'$in': ['CompanyX', 'CompanyY', 'Alice', 'Bob']}}, {'name': {'$in': ['CompanyX', 'CompanyY', 'Alice', 'Bob']}}]}, {'time_period': '2023-Q2'}]}}


In [12]:
# querying database and retrieving relevant data/info
def query_all_databases_update_context(server_uri: str, state: "State") -> "State":
    """
    Queries all MongoDB databases and collections using state['filter'],
    appends each matching document to state['context'] as a JSON string,
    and returns the updated state.

    Args:
        server_uri (str): MongoDB connection URI.
        state (State): State holding the 'filter' to run. A missing 'filter'
            is treated as {}; a missing or None 'context' is initialised to
            an empty list. Existing context entries are kept.

    Returns:
        State: The same state object with 'context' extended.
    """
    query = state.get('filter', {})

    context = state.get('context')
    if context is None:
        context = state['context'] = []

    client = MongoClient(server_uri)
    try:
        for db_name in client.list_database_names():
            db = client[db_name]
            for coll_name in db.list_collection_names():
                for doc in db[coll_name].find(query):
                    # Use default=str to handle BSON types like ObjectId, datetime, etc.
                    context.append(json.dumps(doc, default=str))
    finally:
        # Release the connection even when a query raises.
        client.close()

    return state


In [8]:
# retrieve from vector store
#def retrieve(state:State):
  #retrieved_docs = vector_store.similarity_search(state["question"])
  #return {"context":retrieved_docs}

In [10]:
# generate answer
def generate(state: dict):
    """
    Ask the language model to analyse the retrieved context for the user's question.

    Reads state["question"] and state["context"], embeds both in the analyst
    prompt, and sends it to the module-level `model` as a single human message.

    Returns:
        dict: {"answer": <text content of the model's reply>}
    """
    user_question = state["question"]
    retrieved_context = state["context"]

    analyst_prompt = f"""
You are an expert data analyst with skills in exploratory data analysis, statistical reasoning, and insight generation across any domain. Your goal is to analyze the provided data context in response to the user's question, delivering clear, actionable insights without assuming domain-specific knowledge.

User Question: {user_question}

Retrieved Context (from MongoDB): {retrieved_context}

Instructions:
1. Understand the data: Summarize key elements, describe data types (numerical, categorical, temporal), structure (tables, lists, key-value pairs), and note any quality issues such as missing values, duplicates, or outliers.

2. Align with the question: Interpret the user's question and break it down into sub-queries if needed (for example, trends, correlations, or comparisons).

3. Perform analysis:
   - Compute relevant statistics or perform comparisons as applicable, focusing strictly on what the question requires.
   - Identify patterns, anomalies, or insights directly relevant to the question.
   - Perform simple aggregations, groupings, or comparisons (e.g., by category or time period) when appropriate.
   - Keep it efficient and avoid over-analysis.

4. Generate insights: Provide 3-5 key findings with explanations, using simple language and quantifying results where possible (e.g., "Sales increased by 20% in Q2").

5. Suggest visualizations: Recommend 1-2 simple visualizations (e.g., bar charts for distributions, line graphs for trends) and briefly describe what each would illustrate. Do not generate the charts.

6. Handle limitations: If the data context is insufficient, unclear, or irrelevant, explain why and suggest what additional data is needed.

7. Output format: Structure your response as:
   - Summary of Data
   - Key Insights (bullet points)
   - Visualization Suggestions
   - Recommendations or Next Steps

Ensure your analysis is objective, evidence-based, and concise. Respond only with the analysis output.
"""
    # Single-turn request: the whole prompt goes in one human message.
    llm_reply = model.invoke([HumanMessage(content=analyst_prompt)])
    return dict(answer=llm_reply.content)
