In [27]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser, XMLOutputParser
from langchain.output_parsers import YamlOutputParser
from pydantic import BaseModel, Field
from langchain_community.document_loaders import TextLoader, PyPDFLoader, WebBaseLoader, ArxivLoader, WikipediaLoader
#from langchain_unstructured import UnstructuredLoader
import bs4
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter, HTMLHeaderTextSplitter, CharacterTextSplitter

In [2]:
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGCHAIN_PROJECT')
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
os.environ['LANGCHAIN_TRACING_V2'] = os.getenv('LANGCHAIN_TRACING_V2')

##### Invoking the model for generation:

In [3]:
model = ChatGroq(model="gemma2-9b-it")
print(model.invoke("My name is Namrata.").content)

Hello Namrata, it's nice to meet you!

Is there anything I can help you with today?



##### Creating a chain with prompt and model. Invoking the chain for generation:

In [4]:
prompt = ChatPromptTemplate.from_messages([
    ("system","You are an expert AI Engineer. Provide me answer based on the question"),
    ("user","{input}")
])

chain = prompt | model
result = chain.invoke(input={'input':'My name is Namrata.'})
print(result.content)

Hello Namrata!

It's nice to meet you.  

Is there anything specific I can help you with? Perhaps you have a question about AI, or maybe you'd like to explore some creative text generation? 

I'm here to assist in any way I can. 😊  




#### Output Parsers:

##### Adding different parsers to the chain. Invoking the chain (with parsers) for generation:

StrOutputParser:

In [5]:
str_op_parser = StrOutputParser()
chain = prompt | model | str_op_parser
result = chain.invoke(input={'input':'What is Langsmith?'})
print(result)

Langsmith is an open-source platform developed by **Replicate** that focuses on simplifying the process of building and deploying large language models (LLMs). 

Think of it as a user-friendly toolbox specifically designed for working with LLMs. Here's a breakdown of its key features and benefits:

**Key Features:**

* **Streamlined Development:** Langsmith provides a visual interface and easy-to-use tools for fine-tuning and customizing pre-trained LLMs. You don't need to be a deep learning expert to get started.
* **Experiment Tracking:** It offers robust tools for tracking experiments, comparing different model architectures and fine-tuning parameters, and managing your LLM development workflow efficiently.
* **Collaboration:** Langsmith encourages collaboration by allowing teams to share models, datasets, and experiments, fostering a more open and collaborative LLM development ecosystem.
* **Deployment & Scaling:**  It simplifies the process of deploying your fine-tuned LLMs to var

JsonOutputParser:

In [6]:
#OPTION 1: Provide the output format info in the system prompt itself.

prompt_json = ChatPromptTemplate.from_messages([
    ("system","You are an expert AI Engineer. Provide me answer based on the question. Response should be in JSON object only."),
    ("user","{input}")
])
str_op_parser = StrOutputParser()
chain = prompt_json | model | str_op_parser
result = chain.invoke(input={'input':'What is Langsmith?'})
print(result)

```json
{
 "definition": "Langsmith is an open-source framework for building and deploying AI agents.",
 "features": [
  "Agent construction tools",
  "Modular and extensible design",
  "Integration with various AI models (LLMs, etc.)",
  "Focus on task-oriented applications"
 ],
 "purpose": "To simplify the development and deployment of intelligent agents that can perform complex tasks."
}
```



In [7]:
#OPTION 2: Using the JsonOutputParser to decide the output format, instead of mentioning it explicitely in the system prompt. Using PromptTemplate

json_op_parser = JsonOutputParser()
output_format = json_op_parser.get_format_instructions()

json_prompt = PromptTemplate(
    template = "You are an expert AI Engineer. Provide me answer based on the question. \n {format_instruction} \n {input}",
    input_variables=["input"],
    partial_variables={'format_instruction':output_format}

)
print('Prompt:\n ', json_prompt)

chain = json_prompt | model | json_op_parser
result = chain.invoke(input={'input':'What is Langsmith?'})
print('\n',result)

Prompt:
  input_variables=['input'] input_types={} partial_variables={'format_instruction': 'Return a JSON object.'} template='You are an expert AI Engineer. Provide me answer based on the question. \n {format_instruction} \n {input}'

 {'name': 'Langsmith', 'description': 'Langsmith is an open-source platform for developing and deploying large language models (LLMs).', 'features': ['User-friendly interface for building and training LLMs', 'Pre-trained models and datasets for quick prototyping', 'Fine-tuning capabilities for customizing models to specific tasks', 'Model deployment options for integrating LLMs into applications', 'Community-driven development with active support and contributions'], 'benefits': ['Accelerated LLM development cycle', 'Reduced barrier to entry for LLM experimentation', 'Increased accessibility to powerful AI technologies', 'Collaborative environment for sharing and learning', 'Cost-effective solution for LLM development'], 'website': 'https://www.langsmith

In [8]:
#OPTION 3: Using the JsonOutputParser to decide the output format, instead of mentioning it explicitely in the system prompt. Using ChatPromptTemplate

json_op_parser = JsonOutputParser()
output_format = json_op_parser.get_format_instructions()

prompt_json = ChatPromptTemplate.from_messages(
    messages=[
            ("system","You are an expert AI Engineer. Provide me answer based on the question. \n {format_instruction}"),
            ("user","{input}")
        ]
    )

print('Prompt:\n ', prompt_json)

chain = prompt_json | model | json_op_parser
result = chain.invoke(input={'input':'What is Langsmith?','format_instruction':{output_format}})
print(result)

Prompt:
  input_variables=['format_instruction', 'input'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['format_instruction'], input_types={}, partial_variables={}, template='You are an expert AI Engineer. Provide me answer based on the question. \n {format_instruction}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})]
{'name': 'Langsmith', 'description': 'Langsmith is an open-source platform for building and deploying AI applications.', 'features': ['User-friendly interface for creating and managing AI models', 'Support for multiple AI frameworks, including Llama 2', 'Built-in tools for fine-tuning and evaluating models', 'Deployment options for web, mobile, and cloud environments', 'Collaborative features for team development'], 'website': 'https://www.langsmith.com'}


Using Pydantic to standardize the LLM outputs:

In [9]:
#Using StrOutputParser:

class Product(BaseModel):

    product_name :str = Field(description="name of the product")
    price : int = Field(description="tentative price in USD")

str_parser = StrOutputParser(pydantic_object=Product)

product_prompt = ChatPromptTemplate.from_messages([

    ("system", "You are expert Sales person who has vast knowledge on any product and it's description. You are well versed in product names and their prices."),
    ("user", "{input}")
]

)

chain = product_prompt | model | str_parser
result = chain.invoke(input={'input':'Tell me about the latest iPhone.'})
print('\n',result)


 Alright,  let's talk about the latest and greatest iPhone!  You're likely talking about the **iPhone 15 series**, which was just released in September 2023. 

There are four models to choose from:

* **iPhone 15:** This is the base model and it starts at **$799**.  It features a stunning 6.1-inch Super Retina XDR display, the powerful A16 Bionic chip (the same one as the iPhone 14 Pro), and a 48MP main camera. 

* **iPhone 15 Plus:**  For those who want a bigger screen, this model offers a 6.7-inch display and the same A16 Bionic chip as the iPhone 15. It starts at **$899**.

* **iPhone 15 Pro:**  This is where things get really exciting!  The Pro model boasts a 6.1-inch display with a new ProMotion technology for even smoother scrolling and animations.  It's powered by the incredibly fast A17 Pro chip (built on a 3nm process, the most advanced chip in a smartphone) and has a new titanium frame for added durability.  It starts at **$999**.

* **iPhone 15 Pro Max:**  The biggest and m

Using just the pydantic object in the Parser is not enough. We need to provide the formating instructions too in the prompt.

In [10]:
#Using JsonOutputParser and Prompt Template:

class Product(BaseModel):

    product_name :str = Field(description="name of the product")
    price : int = Field(description="tentative price in USD")

str_parser = JsonOutputParser(pydantic_object=Product)
output_format = str_parser.get_format_instructions()

product_prompt = PromptTemplate(
    template = "You are expert sales person who has vast knowledge on any product and it's price. Provide an answer to the question. \n {format_instruction} \n {input}",
    input_variables=["input"],
    partial_variables={'format_instruction':output_format}

)

chain = product_prompt | model | str_parser
result = chain.invoke(input={'input':'Tell me about the latest iPhone.'})
print('\n',result)


 {'product_name': 'iPhone 15 Pro Max', 'price': 1099}


In [11]:
#Using JsonOutputParser and ChatPrompt Template:

class Product(BaseModel):

    product_name :str = Field(description="name of the product")
    price : int = Field(description="tentative price in USD")

str_parser = JsonOutputParser(pydantic_object=Product)
output_format = str_parser.get_format_instructions()

product_prompt = ChatPromptTemplate.from_messages(
    messages=[
            ("system","You are expert sales person who has vast knowledge on any product and it's price. Provide an answer to the question. \n {format_instruction}"),
            ("user","{input}")
        ]
    )

chain = product_prompt | model | str_parser
result = chain.invoke(input={'input':'Tell me about the latest iPhone.', 'format_instruction':output_format})
print('\n',result)


 {'product_name': 'iPhone 15 Pro Max', 'price': 1099}


Using Pydantic we can thus standardize the output of the LLMs. The output will always come using only the information required according to the pydantic class object passed to the parser.

Using Pydantic model on YamlOutputParser:

In [12]:
#Using YamlOutputParser and Prompt Template:

class Product(BaseModel):

    product_name :str = Field(description="name of the product")
    price : int = Field(description="tentative price in USD")

str_parser = YamlOutputParser(pydantic_object=Product)
output_format = str_parser.get_format_instructions()

product_prompt = PromptTemplate(
    template = "You are expert sales person who has vast knowledge on any product and it's price. Provide an answer to the question. \n {format_instruction} \n {input}",
    input_variables=["input"],
    partial_variables={'format_instruction':output_format}

)

chain = product_prompt | model | str_parser
result = chain.invoke(input={'input':'Tell me about the latest iPhone.'})
print('\n',result)


 product_name='iPhone 15 Pro Max' price=1099


#### Data Ingestion Techniques:

Langchain Documentation: https://python.langchain.com/docs/integrations/document_loaders/

TextLoader:

In [13]:
text_loader = TextLoader('speech.txt')
text_docs= text_loader.load() # Load data into Document Objects
print(text_docs)
print(type(text_docs[0]))
print('Metadata: ', text_docs[0].metadata)
print('Total Docs: ', len(text_docs))

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness be

PdPDFLoader:

In [14]:
pdf_loader = PyPDFLoader('foundation_of_llm_tao_zhu.pdf')
pdf_docs = pdf_loader.load()
print(len(pdf_docs))

231


In [15]:
print(pdf_docs[10])

page_content='4 Pre-training
Unsupervised Supervised
Pre-training Training
Unlabeled
Data
Labeled
Data
(a) Unsupervised Pre-training
Supervised Supervised
Pre-training Tuning
Labeled
Data
T ask 1
Labeled
Data
T ask 2
(b) Supervised Pre-training
Self-
Supervised
Supervised
Zero/Few
Shot Learning
Pre-training Tuning
Prompting
Unlabeled
Data
Labeled
Data
(c) Self-supervised Pre-training
Fig. 1.1: Illustration of unsupervised, supervised, and self-super vised pre-training. In unsupervised pre-training, the
pre-training is performed on large-scale unlabeled data. I t can be viewed as a preliminary step to have a good starting
point for the subsequent optimization process, though cons iderable effort is still required to further train the model
with labeled data after pre-training. In supervised pre-tr aining, the underlying assumption is that different (super vised)
learning tasks are related. So we can ﬁrst train the model on o ne task, and transfer the resulting model to another task
with

UnstructedLoader:
Need to research more how to use it in local

In [16]:
# uns_loader = UnstructuredLoader('foundation_of_llm_tao_zhu.pdf')
# pdf_uns_docs = uns_loader.load()
# print(len(pdf_uns_docs))

In [17]:
# print(pdf_uns_docs[10])

WebLoader:

In [18]:
web_loader = WebBaseLoader(web_path="https://www.resumewriter.sg/blog/list-of-headhunters-in-singapore/",
                           bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                           class_=("post-title","post-content","post-header") #See the video to understand this:
                        ))
                        )

web_docs = web_loader.load()
print(len(web_docs))

1


In [19]:
print(web_docs[0])

page_content='' metadata={'source': 'https://www.resumewriter.sg/blog/list-of-headhunters-in-singapore/'}


In [20]:
web_loader = WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post-content","post-header")
                     ))
                     )
web_docs = web_loader.load()
print(len(web_docs))

1


In [21]:
print(web_docs[0])

page_content='

      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:

Planning

Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.
Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.


Memory

Short-t

ArxivLoader:

In [22]:
arx_loader = ArxivLoader(query="2007.14390",doc_content_chars_max=10000000000)
arx_docs = arx_loader.load()
print(len(arx_docs))

1


In [23]:
print(arx_docs[0])

page_content='FLOWER: A FRIENDLY FEDERATED LEARNING FRAMEWORK
Daniel J. Beutel 1 2 Taner Topal 1 2 Akhil Mathur 3 Xinchi Qiu 1 Javier Fernandez-Marques 4 Yan Gao 1
Lorenzo Sani 5 Kwing Hei Li 1 Titouan Parcollet 6 Pedro Porto Buarque de Gusm˜ao 1 Nicholas D. Lane 1
ABSTRACT
Federated Learning (FL) has emerged as a promising technique for edge devices to collaboratively learn a shared
prediction model, while keeping their training data on the device, thereby decoupling the ability to do machine
learning from the need to store the data in the cloud. However, FL is difﬁcult to implement realistically, both
in terms of scale and systems heterogeneity. Although there are a number of research frameworks available to
simulate FL algorithms, they do not support the study of scalable FL workloads on heterogeneous edge devices.
In this paper, we present Flower – a comprehensive FL framework that distinguishes itself from existing platforms
by offering new facilities to execute large-scale FL exp

WikipediaLoader:

In [24]:
wiki_loader = WikipediaLoader(query="Taylor Swift", load_max_docs=10)
wiki_docs = wiki_loader.load()
print(len(wiki_docs))

10


In [25]:
print(wiki_docs[9])

page_content='Folklore is the eighth studio album by the American singer-songwriter Taylor Swift. It was surprise-released on July 24, 2020, by Republic Records. Conceived during quarantine in early 2020, amidst the COVID-19 pandemic, the album explores themes of escapism, nostalgia, and romanticism. Swift recorded her vocals in her Los Angeles home studio and worked virtually with the producers Aaron Dessner and Jack Antonoff, who operated from their studios in the Hudson Valley and New York City.  
Using a set of characters and story arcs to depict fictional narratives, the album departs from the autobiographical songwriting that had characterized Swift's past albums. Experimenting with new musical styles, Folklore consists of mellow ballads driven by piano, strings, and muted percussion; music critics classify the genre as a blend of folk, pop, alternative, electronic, and rock subgenres. The album's title was inspired by the lasting legacy of folktales, and its visual aesthetic ado

#### Data Transformation Techniques:

Chunking using RecursiveCharacterTextSplitter:

In [42]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap = 20)
transformed_pdf = text_splitter.split_documents(pdf_docs)
print('Total Chunks created: ', len(transformed_pdf))

Total Chunks created:  748


In [40]:
print('Total pages in the pdf: ', len(pdf_docs))
print('Original Document: ', pdf_docs[10])

Total pages in the pdf:  231
Original Document:  page_content='4 Pre-training
Unsupervised Supervised
Pre-training Training
Unlabeled
Data
Labeled
Data
(a) Unsupervised Pre-training
Supervised Supervised
Pre-training Tuning
Labeled
Data
T ask 1
Labeled
Data
T ask 2
(b) Supervised Pre-training
Self-
Supervised
Supervised
Zero/Few
Shot Learning
Pre-training Tuning
Prompting
Unlabeled
Data
Labeled
Data
(c) Self-supervised Pre-training
Fig. 1.1: Illustration of unsupervised, supervised, and self-super vised pre-training. In unsupervised pre-training, the
pre-training is performed on large-scale unlabeled data. I t can be viewed as a preliminary step to have a good starting
point for the subsequent optimization process, though cons iderable effort is still required to further train the model
with labeled data after pre-training. In supervised pre-tr aining, the underlying assumption is that different (super vised)
learning tasks are related. So we can ﬁrst train the model on o ne task, and 

In [45]:
print('Content of one chunk: \n')
print(transformed_pdf[10])

Content of one chunk: 

page_content='v
2.3.5 Position Extrapolation and Interpolation . . . . . . . . . . . . . . . . . . 82
2.3.6 Remarks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 92
2.4 Summary . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 94
3 Prompting 96
3.1 General Prompt Design . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97
3.1.1 Basics . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97
3.1.2 In-context Learning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 99
3.1.3 Prompt Engineering Strategies . . . . . . . . . . . . . . . . . . . . . . . 101
3.1.4 More Examples . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 106
3.2 Advanced Prompting Methods . . . . . . . . . . . . . . . . . . . . . . . . . . . 115
3.2.1 Chain of Thought . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 115
3.2.2 Problem Decomposition . . . . . . . . . . . . . . . . . . . . .

In [65]:
char_splitter = CharacterTextSplitter(separator="\n\n",chunk_size=100, chunk_overlap= 20)
transformed_pdf_charSplit = char_splitter.split_documents(pdf_docs)
print('Total number of chunks created: ', len(transformed_pdf_charSplit))
print(len(transformed_pdf_charSplit[0].page_content))

Total number of chunks created:  231
173


In [None]:
loader=TextLoader('speech.txt')
docs=loader.load()
text_splitter=CharacterTextSplitter(separator="\n\n",chunk_size=100,chunk_overlap=20)
split_text_docs = text_splitter.split_documents(docs)
print('Total number of chunks created: ', len(split_text_docs))
print(len(split_text_docs[0].page_content)) # why is the length of the chunk greater than the chunk_size mentioned above i.e. 100 ???? 

Created a chunk of size 470, which is longer than the specified 100
Created a chunk of size 347, which is longer than the specified 100
Created a chunk of size 668, which is longer than the specified 100
Created a chunk of size 982, which is longer than the specified 100
Created a chunk of size 789, which is longer than the specified 100


Total number of chunks created:  7


470
