In [1]:
from dotenv import load_dotenv

# Load environment variables from the .env file located two directory levels
# above the current working directory; the returned flag is the cell's output.
load_dotenv(dotenv_path='../../.env')

True

# Scraping For News

In [2]:
import requests
from newspaper import Article # https://github.com/codelucas/newspaper
import time

# Browser-like User-Agent header sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

# Articles to scrape; each one becomes an entry in `pages_content`.
article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/"
]

session = requests.Session()
pages_content = [] # where we save the scraped articles: {"url": ..., "text": ...}

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"Failed to fetch article at {url}")
            continue

        article = Article(url)
        # Hand newspaper the HTML we already fetched instead of letting it
        # request the page a second time (that second request would also skip
        # our custom headers, timeout and the polite delay).
        article.download(input_html=response.text)
        article.parse() # parse HTML to extract the article text
        pages_content.append({ "url": url, "text": article.text })
    except Exception as e:
        # Best-effort scraping: report the failure and move on to the next URL.
        print(f"Error occurred while fetching article at {url}: {e}")

#If an error occurs while fetching an article, we catch the exception and print
#an error message. This ensures that even if one article fails to download,
#the rest of the articles can still be processed.

# Embed and Store in DeepLake

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import DeepLake



# Embedding model (run on CPU) that the vector store below uses as its
# embedding function.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "thapabibek1129"
my_activeloop_dataset_name = "langchain_course_qabot_with_source"
# Dataset path has the form hub://<org id>/<dataset name>.
dataset_path = "hub://" + "/".join((my_activeloop_org_id, my_activeloop_dataset_name))

db = DeepLake(embedding_function=embeddings, dataset_path=dataset_path)

Your Deep Lake dataset has been successfully created!


 

# Split

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each scraped article into chunks of up to 1000 characters with a
# 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Parallel lists: all_texts[i] is a chunk, all_metadatas[i] records the URL of
# the article it came from.
all_texts = []
all_metadatas = []
for page in pages_content:
    page_chunks = text_splitter.split_text(page["text"])
    all_texts.extend(page_chunks)
    # One fresh metadata dict per chunk so entries are never shared objects.
    all_metadatas.extend({ "source": page["url"] } for _ in page_chunks)

# Add

In [5]:
# Store every chunk in Deep Lake together with its source metadata; the value
# returned by add_texts is shown as the cell output.
# NOTE(review): re-running this cell against the same dataset presumably
# appends the chunks again — confirm before re-executing.
db.add_texts(texts=all_texts, metadatas=all_metadatas)

Creating 49 embeddings in 1 batches of size 49:: 100%|██████████| 1/1 [00:36<00:00, 36.26s/it]

Dataset(path='hub://thapabibek1129/langchain_course_qabot_with_source', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (49, 1)     str     None   
 metadata     json      (49, 1)     str     None   
 embedding  embedding  (49, 384)  float32   None   
    id        text      (49, 1)     str     None   





['d08968d6-ca56-11ee-9c94-a434d9523559',
 'd08968d7-ca56-11ee-abdd-a434d9523559',
 'd08968d8-ca56-11ee-8cae-a434d9523559',
 'd08968d9-ca56-11ee-9a35-a434d9523559',
 'd08968da-ca56-11ee-8d56-a434d9523559',
 'd08968db-ca56-11ee-a5e2-a434d9523559',
 'd08968dc-ca56-11ee-9d32-a434d9523559',
 'd08968dd-ca56-11ee-8cfd-a434d9523559',
 'd08968de-ca56-11ee-bc24-a434d9523559',
 'd08968df-ca56-11ee-88ea-a434d9523559',
 'd08968e0-ca56-11ee-85c1-a434d9523559',
 'd08968e1-ca56-11ee-aec7-a434d9523559',
 'd08968e2-ca56-11ee-9c61-a434d9523559',
 'd08968e3-ca56-11ee-9dfa-a434d9523559',
 'd08968e4-ca56-11ee-b966-a434d9523559',
 'd08968e5-ca56-11ee-bfab-a434d9523559',
 'd08968e6-ca56-11ee-ad65-a434d9523559',
 'd08968e7-ca56-11ee-9b5f-a434d9523559',
 'd08968e8-ca56-11ee-aed2-a434d9523559',
 'd08968e9-ca56-11ee-b930-a434d9523559',
 'd08968ea-ca56-11ee-b24c-a434d9523559',
 'd08968eb-ca56-11ee-abca-a434d9523559',
 'd08968ec-ca56-11ee-b849-a434d9523559',
 'd08968ed-ca56-11ee-9a96-a434d9523559',
 'd0897cb9-ca56-

In [6]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import HuggingFaceHub

# Generation settings forwarded to the hosted model.
# NOTE(review): "max_length": 64 and "max_new_tokens": 512 look contradictory;
# confirm which one the endpoint honours and drop the other.
generation_kwargs = {'temperature': 0.5, "max_length": 64, "max_new_tokens": 512}

llm = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-Instruct-v0.2',
    model_kwargs=generation_kwargs,
)

# Question-answering chain that retrieves chunks from the Deep Lake store and
# uses the "stuff" chain type to pass them to the LLM.
retriever = db.as_retriever()
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)



# Inference

In [7]:
# Ask the QA chain a question; the result is read below through its "answer"
# key and its "sources" key (a single string listing the cited sources).
d_response = chain({"question": "What does Geoffrey Hinton think about recent trends in AI?"})

print("Response:")
print(d_response["answer"])
print("Sources:")
for source in d_response["sources"].split(", "):
    # The sources string can carry stray newlines/spaces (the URL used to be
    # printed on the line after the "- " bullet), so trim each entry and skip
    # anything that is blank after trimming.
    source = source.strip()
    if source:
        print("- " + source)

Response:
 Geoffrey Hinton, known as the "Godfather of AI," has expressed concerns about the potential dangers of AI, including its ability to generate false text, images, and videos, and its impact on the job market. He resigned from Google to discuss these concerns openly.

Sources:
- 
https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/
