In [1]:
import os
import sys

# Make the directory containing the local `credentials` module importable.
# A raw string is used so the Windows backslashes stay literal: '\G' and '\D'
# are not valid escape sequences and newer Python versions warn about them
# when they appear in a regular string literal.
sys.path.insert(1, r'D:\Github\DeepLake-Langchain')
import credentials

# Publish the API keys as environment variables (presumably picked up by the
# OpenAI and Deep Lake clients used in the following cells).
os.environ["OPENAI_API_KEY"] = credentials.openai
os.environ['ACTIVELOOP_TOKEN'] = credentials.active_loop

In [2]:
import requests
from newspaper import Article # https://github.com/codelucas/newspaper
import time

# Browser-like User-Agent sent with every request made through `session`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

# Articles to scrape and index.
article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/"
]

session = requests.Session()
pages_content = [] # where we save the scraped articles

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"Failed to fetch article at {url}")
            continue

        article = Article(url)
        # Reuse the HTML we already fetched instead of letting newspaper
        # download the page a second time (which would also ignore the
        # headers and timeout configured above).
        article.download(input_html=response.text)
        article.parse() # parse HTML to extract the article text
        pages_content.append({ "url": url, "text": article.text })
    except Exception as e:
        # A failure on one article is reported and skipped so that the
        # remaining articles can still be processed.
        print(f"Error occurred while fetching article at {url}: {e}")

In [3]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Location of the Deep Lake dataset: hub://<organization id>/<dataset name>
my_activeloop_org_id = credentials.active_loop_org_id
my_activeloop_dataset_name = "qabot_with_source"
dataset_path = "hub://{}/{}".format(my_activeloop_org_id, my_activeloop_dataset_name)

# Embedding function handed to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Vector store handle used by the later cells.
db = DeepLake(
    embedding_function=embeddings,
    dataset_path=dataset_path,
)



Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/megatron17/qabot_with_source


 

hub://megatron17/qabot_with_source loaded successfully.


 

In [4]:
# We split the article texts into small chunks. While doing so, we keep track of each
# chunk metadata (i.e. the URL where it comes from). Each metadata is a dictionary and
# we need to use the "source" key for the document source so that we can then use the
# RetrievalQAWithSourcesChain class which will automatically retrieve the "source" item
# from the metadata dictionary.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Pair every chunk with a fresh metadata dict whose "source" key holds the URL
# of the article the chunk was cut from, then unzip the pairs into two
# parallel lists.
chunk_records = [
    (piece, { "source": page["url"] })
    for page in pages_content
    for piece in text_splitter.split_text(page["text"])
]
all_texts = [piece for piece, _ in chunk_records]
all_metadatas = [meta for _, meta in chunk_records]

In [5]:
# Store every chunk in the Deep Lake dataset together with its metadata dict;
# the value returned by add_texts is displayed as the cell output.
db.add_texts(texts=all_texts, metadatas=all_metadatas)

Evaluating ingest: 100%|██████████| 1/1 [00:49<00:00
-

Dataset(path='hub://megatron17/qabot_with_source', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (49, 1536)  float32   None   
    ids      text     (49, 1)      str     None   
 metadata    json     (49, 1)      str     None   
   text      text     (49, 1)      str     None   


 

['c70c24d1-23bd-11ee-9113-00d861dd19c7',
 'c70c24d2-23bd-11ee-9870-00d861dd19c7',
 'c70c24d3-23bd-11ee-859c-00d861dd19c7',
 'c70c24d4-23bd-11ee-bf83-00d861dd19c7',
 'c70c24d5-23bd-11ee-9587-00d861dd19c7',
 'c70c24d6-23bd-11ee-816b-00d861dd19c7',
 'c70c24d7-23bd-11ee-bbae-00d861dd19c7',
 'c70c24d8-23bd-11ee-91a3-00d861dd19c7',
 'c70c24d9-23bd-11ee-8cfe-00d861dd19c7',
 'c70c24da-23bd-11ee-8b7b-00d861dd19c7',
 'c70c24db-23bd-11ee-9865-00d861dd19c7',
 'c70c24dc-23bd-11ee-9ecf-00d861dd19c7',
 'c70c24dd-23bd-11ee-b31f-00d861dd19c7',
 'c70c24de-23bd-11ee-9679-00d861dd19c7',
 'c70c24df-23bd-11ee-a8ed-00d861dd19c7',
 'c70c24e0-23bd-11ee-8712-00d861dd19c7',
 'c70c24e1-23bd-11ee-8d34-00d861dd19c7',
 'c70c24e2-23bd-11ee-b311-00d861dd19c7',
 'c70c24e3-23bd-11ee-8ff6-00d861dd19c7',
 'c70c24e4-23bd-11ee-8862-00d861dd19c7',
 'c70c24e5-23bd-11ee-b62c-00d861dd19c7',
 'c70c24e6-23bd-11ee-b042-00d861dd19c7',
 'c70c24e7-23bd-11ee-b424-00d861dd19c7',
 'c70c24e8-23bd-11ee-a6be-00d861dd19c7',
 'c70c24e9-23bd-

In [6]:
# we create a RetrievalQAWithSourcesChain chain, which is very similar to a
# standard retrieval QA chain but it also keeps track of the sources of the
# retrieved documents

from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

# Retriever backed by the Deep Lake vector store created earlier.
retriever = db.as_retriever()

# Completion model with temperature 0.
llm = OpenAI(model_name="text-davinci-003", temperature=0)

# "stuff" chain type, built from the LLM and the retriever above.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [7]:
# We generate a response to a query using the chain. The response object is a dictionary containing
# an "answer" field with the textual answer to the query, and a "sources" field containing a string made
# of the concatenation of the metadata["source"] strings of the retrieved documents.
d_response = chain({"question": "What does Geoffrey Hinton think about recent trends in AI?"})

print("Response:")
print(d_response["answer"])
print("Sources:")
# The sources string may be empty or carry stray whitespace around entries;
# strip each entry and skip blanks so we never print a bare "- " bullet.
sources = [entry.strip() for entry in d_response["sources"].split(", ")]
for source in sources:
    if source:
        print("- " + source)

Response:
 Geoffrey Hinton believes that the rapid development of generative AI products is "racing towards danger" and that false text, images, and videos created by AI could lead to a situation where average people "would not be able to know what is true anymore." He also expressed concerns about the impact of AI on the job market, as machines could eventually replace roles such as paralegals, personal assistants, and translators.

Sources:
- https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/
