In [66]:
import getpass
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass()

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")


In [69]:

from langchain.chat_models import init_chat_model

llm = init_chat_model("llama-3.3-70b-versatile", model_provider="groq")

In [77]:
llm.invoke("list 5 universities in california")

AIMessage(content='Here are 5 universities in California:\n\n1. Stanford University\n2. University of California, Berkeley\n3. University of California, Los Angeles (UCLA)\n4. California Institute of Technology (Caltech)\n5. University of Southern California (USC)', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 54, 'prompt_tokens': 41, 'total_tokens': 95, 'completion_time': 0.196363636, 'prompt_time': 0.004613601, 'queue_time': 0.233543086, 'total_time': 0.200977237}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_5f849c5a0b', 'finish_reason': 'stop', 'logprobs': None}, id='run-7b210a4c-ff96-4a05-9c9f-7519c3d5a17d-0', usage_metadata={'input_tokens': 41, 'output_tokens': 54, 'total_tokens': 95})

In [71]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [58]:
#initialize the vector store
from langchain_chroma import Chroma

vector_store = Chroma(embedding_function=embeddings)

In [59]:
# from langchain_community.document_loaders import JSONLoader

# loader = JSONLoader(file_path="uni_with_description.json",
#                     jq_schema=".[]",
#                     text_content=False)

# docs = loader.load()

In [60]:
# docs[:5]

In [72]:
from langchain import hub
from langchain_community.document_loaders import 
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict


loader = JSONLoader(file_path="uni_with_description.json",
                    jq_schema=".[]",
                    text_content=False)

docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 500
)

all_splits = text_splitter.split_documents(docs)

#index chunks
_ = vector_store.add_documents(documents=all_splits)

#prompt template
prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""


In [73]:

#state for application
class State(TypedDict):
    question : str
    context : List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.format(
        question= state["question"], 
        context= docs_content
        )
    response = llm.invoke(messages)
    return {"answer": response.content}


In [74]:
# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [76]:
response = graph.invoke({"question": "list 5 universities in California"})
print(response["answer"])

The University of California, Irvine is the only university mentioned in the context. Since the context mentions that the University of California comprises 10 institutions, we can infer that there are at least 10 universities in the University of California system in California. However, only one university is explicitly mentioned. 

So, based on the provided context, I can only confirm one university in California: 
1. University of California, Irvine 

I don't know the other four universities in California.


In [63]:
state = {"question": "5 universities", "context": [], "answer": ""}
state.update(retrieve(state))
state.update(generate(state))

print(state["answer"])

The context only mentions 1 university: University of Maryland (https://umd.edu/). I don't know the other 4 universities.


In [64]:
state = {"question": "5 universities", "context": [], "answer": ""}
state.update(retrieve(state))
print(state["context"])

[Document(id='271b21a1-0914-4fae-9200-e4fdad62f96e', metadata={'seq_num': 40, 'source': '/home/jivan-acharya/Documents/Internship/Code/RAG-app/uni_with_description.json'}, page_content='& Theology,Sociology,Civil Engineering,Education,Other Health,Veterinary Science,Chemical Engineering,Communication & Media Studies", "Location": "College Park, United States", "Description": null, "uni_url": "https://umd.edu/"}'), Document(id='adc8963e-13e8-4475-a8db-22689f1e1035', metadata={'seq_num': 40, 'source': '/home/jivan-acharya/Documents/Internship/Code/RAG-app/uni_with_description.json'}, page_content='& Theology,Sociology,Civil Engineering,Education,Other Health,Veterinary Science,Chemical Engineering,Communication & Media Studies", "Location": "College Park, United States", "Description": null, "uni_url": "https://umd.edu/"}'), Document(id='a51ad530-6a95-4564-8372-8557481fcdaa', metadata={'seq_num': 40, 'source': '/home/jivan-acharya/Documents/Internship/Code/RAG-app/uni_with_description.js