In [2]:
import random # generate random numbers and random selections
import time   # time related function 
import requests # make web request
import hashlib  # get access to hash functions
from bs4 import BeautifulSoup # extract info from web page
from duckduckgo_search import DDGS # web searching just like google search
import chromadb # vector database memory for chatbots helps storing and searching
from chromadb.config import Settings
import spacy # natural language processing library
from sentence_transformers import SentenceTransformer # comparing and storing text
from readability import Document # extracting text from web pages

In [3]:
# Initialize tools
nlp = spacy.load("en_core_web_sm") # lode a pre trained nlp model in "nlp" variable making it a function to process english text and return an analyzed document
model = SentenceTransformer("all-MiniLM-L6-v2") # loads a pre trained transformer model that converts entences to embeddings vectors
# here embedding vectors are sentences converted to neumeric data for easier ml model processings 

In [4]:
# ChromaDB client
client = chromadb.Client(Settings()) # creating a vector database for storing the data
# here client is the connection to vector database
# settings is for customizing storage path, cache, behaviour.
collection = client.get_or_create_collection(name="web_knowledge") # web_knowledge is the table/collection name for our database
# mainly text embedding takes place here

In [5]:
# Memory trackers
visited = set() # keeps record of visited urls and data acquired
hashes = set() # detecting duplicate content

In [6]:
# Starting topics
topics = ["Quantum Physics", "Ancient Civilizations", "Artificial Intelligence"] # topics to initiate searching

In [7]:
# DuckDuckGo search function
def search_web(topic): # function that takes one topic at a time
    print(f"\n🔍 Searching DuckDuckGo for: {topic}") # prints the topic whis is being searched
    urls = []  # empty lsit to store all the urls for respective topic during the search
    try: # try block to catch and handle any error
        with DDGS() as ddgs: # DDGS is a class for web searching and ddgs is an instance which actually performs the searching
            results = ddgs.text(topic, max_results=5) # results is a dictionary that stors the title, contend and url
            # for each topic maximum of 5 url searches would be done
            for result in results: # iteration loop using 'result' int dictionary 'results'
                href = result.get("href") # to get the href url key from 'results'
                if href: # if href exists 
                    urls.append(href) # add the url in the 'urls' lsit 
    except Exception as e: # catches any exception while searching
        print(f"[Search Error]: {e}") # if any exception is found, print search error
    return urls # returns the list of urls

In [8]:
# Scraper using readability
def scrape(url): # function to take one url at a time
    print(f"🕷️ Scraping: {url}") # printing the url that is being scrapped
    try: # to handle any error during scrapping 
        r = requests.get(url, timeout=10) # sending a HTTP GET request to url to get data 
        # 10 seconds for the request to be validated else give up
        doc = Document(r.text) # 'r.text' is to extract the redable content from the page
        # 'Document' is for removing all unwanted portions and focus on the main content 
        soup = BeautifulSoup(doc.summary(), 'html.parser') # text extraction happening here
        paragraphs = soup.find_all('p') # find all <p> paragraph tags to find out the article 
        text = ' '.join([p.get_text() for p in paragraphs[:15]]) # extracting text content for first 15 paragraphs
        # combine all the extracted paragraphs into one
        # limited to 15 paragraphs and is customizable
        return text # returns the ultimate content prepared
    except Exception as e: # catch any exception
        print(f"[Scrape Error]: {e}") # if catches, print error
        return "" # return empty string

In [9]:
# Prevent duplicate content
def is_duplicate(text): # acepts a string 'text'
    text_hash = hashlib.sha256(text.encode()).hexdigest() # sha256 is cryptographic hash function that generates 256 bit hash value for any input
    # 'text.encode()' converts the text to bytes
    # 'a final 256 byte code is generated from the input string'
    if text_hash in hashes: # tracks previously seen content 
        return True # returns true
    hashes.add(text_hash) # if new data not seen previously then new hash is added to 'hashes'
    return False # hence return false

In [10]:
# Extract meaningful keywords
def extract_keywords(text): # function accepts text
    doc = nlp(text) # using the previously created 'nlp' function/pipeline, keywords are extracted
    keywords = list({ent.text for ent in doc.ents if ent.label_ in [
        "PERSON", "ORG", "GPE", "NORP", "EVENT", "WORK_OF_ART", "LANGUAGE", "PRODUCT"
    ]}) # extracts the given keywords and stores it in a set (with no duplicate words)
    # the set is converted into list
    if not keywords: # if the above related keywords not found
        keywords = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) < 4] # filters out the long phrases 
    return keywords[:3] # returns the first 3 keywords (customizable limit)

In [11]:
# Store to ChromaDB
def store_knowledge(content, topic, url): # function gets the content and its respective topic name and url
    embedding = model.encode(content) # converts the text data in neumeric data
    collection.add(
        documents=[content],
        metadatas=[{"topic": topic, "url": url}],
        ids=[str(time.time())],
        embeddings=[embedding.tolist()] # stores the data in database 
    )
    print(f"✅ Stored knowledge from {url}") # prints the conformation message

In [12]:
# Continuous learning loop/ an infinite loop
while True:
    topic = random.choice(topics) # randomly picks up a topic
    urls = search_web(topic) # use duck duck go to find url for related topics

    for url in urls: # iteration loop
        if url in visited: # its checked that the url is visited or not
            continue # hence continue or skip that url
        visited.add(url) # else add the new url in 'urls' list

        content = scrape(url) # here the content from the url is scrapped
        if not content or is_duplicate(content): # if duplicate data found 
            continue # skip

        store_knowledge(content, topic, url) # else store the knowledge in database 

        new_topics = extract_keywords(content) # keywords are extracted from the content and stored in 'new topics'
        if new_topics: # if new topic found
            print(f"🧠 New Topics Discovered: {new_topics}") # print new topic found
            topics.extend(new_topics) # add the topic and extent the topic list 

        time.sleep(5)  # before doing next iteration, waits for 5 seconds to avoid overloading


🔍 Searching DuckDuckGo for: Artificial Intelligence
🕷️ Scraping: https://en.wikipedia.org/wiki/Artificial_intelligence
✅ Stored knowledge from https://en.wikipedia.org/wiki/Artificial_intelligence
🧠 New Topics Discovered: ['Amazon', 'Alexa', 'AI winters.[9][10] Funding']
🕷️ Scraping: https://www.britannica.com/technology/artificial-intelligence
✅ Stored knowledge from https://www.britannica.com/technology/artificial-intelligence
🧠 New Topics Discovered: ['\n\t\t\tFeedback', 'you', 'your feedback']
🕷️ Scraping: https://www.coursera.org/articles/what-is-artificial-intelligence
✅ Stored knowledge from https://www.coursera.org/articles/what-is-artificial-intelligence
🧠 New Topics Discovered: ['Chat GPT', 'DeepLearning', 'AI']
🕷️ Scraping: https://www.ibm.com/think/topics/artificial-intelligence
✅ Stored knowledge from https://www.ibm.com/think/topics/artificial-intelligence
🧠 New Topics Discovered: ['Greece', 'German', 'the "Turing Test']
🕷️ Scraping: https://www.nasa.gov/what-is-artifici

KeyboardInterrupt: 