# Super Simple End-to-End RAG implementation (no Pipeline just "Ragging around")

In [1]:
#Setting API Keys and Env Variables

import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['GROQ_API_KEY'] = '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'

In [2]:
#Setting Groq client

from groq import Groq

GroqClient = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

In [16]:
#Testing the Groq Client

chat_completion = GroqClient.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Who was the first U.S.A. President?",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

The first President of the United States was George Washington. He was inaugurated on April 30, 1789 and served two terms in office until March 4, 1797.


In [None]:
#Exploring Groq API

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are an History Teacher Specialized in U.S. American history" #. You always return your answer in JSON format."
        },
        {
            "role": "user",
            "content": "Who was the first U.S.A. President?",
        }
    ],
    model="llama3-8b-8192",
    #Temperature controls randomness: As the temperature approaches zero, the model will become deterministic and repetitive, Values from 0 to 2 - can be float.
    #If you adjust the temperature to 0.5, the model will generate text that is more predictable and less creative than if you set the temperature to 1.0.
    temperature=1,
    #Max output Tokens
    max_tokens=1024,
    #Top_P controls diversity via nucleus sampling: 0.5 means half of all likelihood-weighted options are considered, Values from 0 to 1 - can be float.
    #If you set top p to 0.9, the model will only consider the most likely words that make up 90% of the probability mass.
    top_p=1,
    stream=False,
    #stream=True,
    #response_format={"type": "json_object"},
    stop=None,
)

#If Stream set to True
#for chunk in chat_completion:
#    print(chunk.choices[0].delta.content or "", end="")

#If Stream set to False and response format not JSON
print(chat_completion.choices[0].message.content)

#If using JSON output, ensure Stream is set to False and System prompt contains the word JSON contextualized such as "You always return your answer in JSON format.".
#print(chat_completion.choices[0].message)

In [4]:
!pip install langchain-groq



In [5]:
#Super Simple End-to-End Implementation

#Importing Libraries

import bs4

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"))

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatGroq(model_name="llama3-8b-8192", temperature=1, top_p=1, max_tokens=1024)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("What is Task Decomposition?")

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.


'Task Decomposition is the process of breaking down a complex problem or task into smaller, more manageable subtasks or steps. This is often done to make the task more feasible and to identify the necessary steps required to achieve the desired outcome.'

In [9]:
#Prompt from LangchainHub

prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='\nAnswer the question based only on the following context: {context}\n\nQuestion: {question}\n'))])

In [8]:
#Prompt Building Example

# Prompt
template = '''
Answer the question based only on the following context: {context}

Question: {question}
'''

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='\nAnswer the question based only on the following context: {context}\n\nQuestion: {question}\n'))])

In [11]:
question = "what's the difference between llamaindex and langchain"

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

11

In [14]:
#Embedding the Question and computing Cosine Similarity

question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

embedding = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2")
query_result = embedding.embed_query(question)
document_result = embedding.embed_query(document)
print(len(query_result))

import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

768
Cosine Similarity: 0.5595268901544017
