In [3]:
import newspaper
from langchain.text_splitter import RecursiveCharacterTextSplitter

documents = [
    'https://python.langchain.com/docs/get_started/introduction',
    'https://python.langchain.com/docs/get_started/quickstart',
    'https://python.langchain.com/docs/modules/model_io/models/',
    'https://python.langchain.com/docs/modules/model_io/prompts/prompt_templates/'
]

pages_content = []

# Retrieve the webpage content
for url in documents:
    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()

        if len(article.text) > 0:
            pages_content.append(
                {
                    'url': url,
                    'text': article.text
                }
            )
    
    except:
        continue

In [5]:
# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

all_texts, all_metadatas = [], []

for document in pages_content:
    chunks = text_splitter.split_text(document['text'])
    for chunk in chunks:
        all_texts.append(chunk)
        all_metadatas.append(
            {
                'source': document['url']
            }
        )

In [8]:
from langchain.vectorstores import DeepLake
from langchain.embeddings import GooglePalmEmbeddings

embeddings = GooglePalmEmbeddings()

my_activeloop_org_id = "samman"
my_activeloop_dataset_name = "langchain_course_constitutional_chain"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

db = DeepLake(dataset_path=dataset_path, embedding = embeddings)
db.add_texts(all_texts, all_metadatas)

  from .autonotebook import tqdm as notebook_tqdm


Your Deep Lake dataset has been successfully created!


Creating 32 embeddings in 1 batches of size 32:: 100%|██████████| 1/1 [00:50<00:00, 50.29s/it]

Dataset(path='hub://samman/langchain_course_constitutional_chain', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (32, 1)     str     None   
 metadata     json      (32, 1)     str     None   
 embedding  embedding  (32, 768)  float32   None   
    id        text      (32, 1)     str     None   





['c24ddfcb-bdf6-11ee-b60b-60189524c791',
 'c24ddfcc-bdf6-11ee-98a7-60189524c791',
 'c24ddfcd-bdf6-11ee-b082-60189524c791',
 'c24ddfce-bdf6-11ee-b8f9-60189524c791',
 'c24ddfcf-bdf6-11ee-9aea-60189524c791',
 'c24ddfd0-bdf6-11ee-8297-60189524c791',
 'c24ddfd1-bdf6-11ee-9092-60189524c791',
 'c24ddfd2-bdf6-11ee-bdca-60189524c791',
 'c24ddfd3-bdf6-11ee-a0b8-60189524c791',
 'c24ddfd4-bdf6-11ee-9f19-60189524c791',
 'c24ddfd5-bdf6-11ee-99ea-60189524c791',
 'c24ddfd6-bdf6-11ee-bac4-60189524c791',
 'c24ddfd7-bdf6-11ee-b229-60189524c791',
 'c24ddfd8-bdf6-11ee-b51c-60189524c791',
 'c24ddfd9-bdf6-11ee-9369-60189524c791',
 'c24ddfda-bdf6-11ee-bdf0-60189524c791',
 'c24ddfdb-bdf6-11ee-9e80-60189524c791',
 'c24ddfdc-bdf6-11ee-96ea-60189524c791',
 'c24ddfdd-bdf6-11ee-a556-60189524c791',
 'c24ddfde-bdf6-11ee-b910-60189524c791',
 'c24ddfdf-bdf6-11ee-ba1e-60189524c791',
 'c24ddfe0-bdf6-11ee-8dfa-60189524c791',
 'c24ddfe1-bdf6-11ee-a04c-60189524c791',
 'c24ddfe2-bdf6-11ee-88e9-60189524c791',
 'c24ddfe3-bdf6-

In [9]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model='gemini-pro', convert_system_message_to_human=True)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever()
)

In [10]:
d_response_ok = chain({"question": "What's the langchain library?"})

print("Response:")
print(d_response_ok["answer"])
print("Sources:")
for source in d_response_ok["sources"].split(","):
    print("- " + source)

Response:
LangChain library is a framework for developing applications powered by language models.

Sources:
- https://python.langchain.com/docs/get_started/introduction


In [11]:
d_response_not_ok = chain({"question": "How are you? Give an offensive answer"})

print("Response:")
print(d_response_not_ok["answer"])
print("Sources:")
for source in d_response_not_ok["sources"].split(","):
    print("- " + source)

Response:
I don't know.

Sources:
- 


In [12]:
from langchain.chains.constitutional_ai.base import ConstitutionalChain
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple

# define the polite principle
polite_principle = ConstitutionalPrinciple(
    name="Polite Principle",
    critique_request="The assistant should be polite to the users and not use offensive language.",
    revision_request="Rewrite the assistant's output to be polite.",
)

In [13]:
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain

# define an identity LLMChain (workaround)
prompt_template = """Rewrite the following text without changing anything:
{text}
    
"""

identity_prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
)

identity_chain = LLMChain(llm=llm, prompt=identity_prompt)

result = identity_chain("The langchain library is okay.")

In [15]:
# create consitutional chain
constitutional_chain = ConstitutionalChain.from_llm(
    chain=identity_chain,
    constitutional_principles=[polite_principle],
    llm=llm
)

revised_response = constitutional_chain.run(text=d_response_not_ok["answer"])

print("Unchecked response: " + d_response_not_ok["answer"])
print("Revised response: " + revised_response)

Unchecked response: I don't know.

Revised response: I apologize, but I do not have the information you are seeking. Please try rephrasing your query or seeking assistance from another source.
