<a href="https://colab.research.google.com/github/Mofidulhaque/DataMiningLab/blob/main/Project_03_Building_a_Domain_Specific_Search_Engine_with_Crawling_and_Link_Analysis_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

Stopwords are used when building the inverted index. The inverted index will ignore stopwords.

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')
print(STOPWORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data]   Unzipping corpora/stopwords.zip.


Add custom stopwords if you deem it necessary

In [None]:
custom_STOPWORDS = [] # Add your own stopwords here
STOPWORDS.extend(custom_STOPWORDS)

In [None]:
from collections import defaultdict

# Inverted index: word -> set of URLs
inverted_index = defaultdict(set)
url_list = set()

In [None]:
# This dictionary will be used to build the connection between links
web_connection = {'source':[], 'target':[]}

In [None]:
import re

# This function will clean the content of web page in order to build the inverted index.
def clean_and_tokenize(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = text.split()
    return [t for t in tokens if t not in STOPWORDS and len(t) > 1]

In [None]:
from urllib.parse import urljoin, urlparse

# The crawl function has 5 parameters
# url = The url to crawl
# base_domain = the base domain of the url. During crawling, the crawler will ignore links from other domains

def crawl(url, base_domain, visited, visit_limit, limit):
    if limit==0 or len(visited)==visit_limit:
        return

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return
    except requests.RequestException:
        return

    visited.add(url)
    print("-"*(10-limit), end=" ")
    print(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    words = clean_and_tokenize(text)

    for word in words:
        inverted_index[word].add(url)
        url_list.add(url)

    # Recursively follow links
    for tag in soup.find_all('a', href=True):
        link = urljoin(url, tag['href'])
        parsed = urlparse(link)

        # Store external links as connection
        web_connection['source'].append(url)
        web_connection['target'].append(link)

        if parsed.netloc == base_domain and link not in visited:
            crawl(link, base_domain, visited, visit_limit, limit-1)

In [None]:
def crawl_roots(root_urls, max_per_root=2, visit_limit=50):
    for root in root_urls:
        print(f"\nStarting crawl from: {root}")
        domain = urlparse(root).netloc
        visited = set()
        crawl(root, domain, visited, visit_limit, max_per_root)

In [None]:
seed_urls = [
    'https://en.wikipedia.org/wiki/Animal',
    'https://kids.nationalgeographic.com/animals',
    'https://www.britannica.com/animal/animal',
    'https://a-z-animals.com/',
    'https://www.colchesterzoologicalsociety.com/animals-and-habitats/animals/',
    'https://www.montereybayaquarium.org/animals/animals-a-to-z',
    'https://www.mdpi.com/journal/animals',
    'https://animals.sandiegozoo.org/animals',
    'https://www.nationalgeographic.com/animals',
    'https://www.awf.org/wildlife-conservation/all',
    'https://learnenglishkids.britishcouncil.org/category/topics/animals',
    'https://animalsaustralia.org/',

]

crawl_roots(seed_urls, max_per_root=10)


Starting crawl from: https://en.wikipedia.org/wiki/Animal
 Crawled: https://en.wikipedia.org/wiki/Animal
- Crawled: https://en.wikipedia.org/wiki/Animal#bodyContent
-- Crawled: https://en.wikipedia.org/wiki/Main_Page
--- Crawled: https://en.wikipedia.org/wiki/Main_Page#bodyContent
---- Crawled: https://en.wikipedia.org/wiki/Wikipedia:Contents
----- Crawled: https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent
------ Crawled: https://en.wikipedia.org/wiki/Portal:Current_events
------- Crawled: https://en.wikipedia.org/wiki/Portal:Current_events#bodyContent
-------- Crawled: https://en.wikipedia.org/wiki/Special:Random
--------- Crawled: https://en.wikipedia.org/wiki/Special:Random#bodyContent
--------- Crawled: https://en.wikipedia.org/wiki/Wikipedia:About
--------- Crawled: https://en.wikipedia.org/wiki/Wikipedia:Contact_us
--------- Crawled: https://en.wikipedia.org/wiki/Help:Contents
--------- Crawled: https://en.wikipedia.org/wiki/Help:Introduction
--------- Crawled: https:

In [None]:
# print inverted index
print("\nSample inverted index (first 20 words):")
for word in list(inverted_index.keys())[:20]:
    print(f"{word}: {list(inverted_index[word])}")


Sample inverted index (first 20 words):
animal: ['https://animalsaustralia.org/about-us/careers/', 'https://animalsaustralia.org/our-work/rodeos/', 'https://animalsaustralia.org/latest-news', 'https://www.colchesterzoologicalsociety.com/book-your-tickets/tesco-clubcard-vouchers/', 'https://kids.nationalgeographic.com/videos/topic/amazing-animals', 'https://www.mdpi.com/', 'https://www.colchesterzoologicalsociety.com/about-us/careers/', 'https://animalsaustralia.org/our-impact/our-impact-in-2023-24/', 'https://animalsaustralia.org/faqs/#content', 'https://animalsaustralia.org/our-work/#sentiment-2', 'https://www.britannica.com/browse/Bugs-Mollusks-Invertebrates', 'https://kids.nationalgeographic.com/games/funny-fill-in/article/funny-fill-in-on-safari', 'https://kids.nationalgeographic.com/videos/topic/party-animals', 'https://www.britannica.com/procon', 'https://animalsaustralia.org/', 'https://animals.sandiegozoo.org/animals/arthropods#highlighted', 'https://animalsaustralia.org/our-w

In [None]:
# Print first 20 connections

for source, target in list(zip(web_connection['source'], web_connection['target']))[:20]:
    print(f"{source} -> {target}")

https://en.wikipedia.org/wiki/Animal -> https://en.wikipedia.org/wiki/Animal#bodyContent
https://en.wikipedia.org/wiki/Animal#bodyContent -> https://en.wikipedia.org/wiki/Animal#bodyContent
https://en.wikipedia.org/wiki/Animal#bodyContent -> https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page -> https://en.wikipedia.org/wiki/Main_Page#bodyContent
https://en.wikipedia.org/wiki/Main_Page#bodyContent -> https://en.wikipedia.org/wiki/Main_Page#bodyContent
https://en.wikipedia.org/wiki/Main_Page#bodyContent -> https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page#bodyContent -> https://en.wikipedia.org/wiki/Wikipedia:Contents
https://en.wikipedia.org/wiki/Wikipedia:Contents -> https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent
https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent -> https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent
https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent -> https://en.wiki

In [None]:
import networkx as nx

web_graph = nx.DiGraph()
for i  in range(len(web_connection['source'])):
    web_graph.add_edge(web_connection["source"][i], web_connection["target"][i])


In [None]:
len(web_graph.nodes)

28609

In [None]:
pagerank_scores = nx.pagerank(web_graph, alpha=0.85, max_iter=100, tol=1e-6)
print("\nPageRank Scores:", pagerank_scores)




In [None]:
def search_engine(query, index, scores):
    query_terms = query.lower().split()
    results = set()
    for term in query_terms:
        if term in index:
            if not results:
                results = set(index[term])
            else:
                results = results.intersection(index[term])  # Find common websites

    # Sort results based on score
    ranked_results = []
    for website in results:
        if website in scores:
          ranked_results.append((website, scores[website]))
    ranked_results.sort(key=lambda x: x[1], reverse=True)

    return ranked_results

In [None]:
# Query and display results
query = "lion"
print(f"\nSearch Results for '{query}' using PageRank:")
results = search_engine(query, inverted_index, pagerank_scores)

for page, score in results:
    print(f"{page}: ({score})")



Search Results for 'lion' using PageRank:
https://animals.sandiegozoo.org/animals/mammals: (7.905832139781785e-05)
https://animals.sandiegozoo.org/animals: (7.905832139781785e-05)
https://www.colchesterzoologicalsociety.com/news/: (7.1542034454268e-05)
https://www.colchesterzoologicalsociety.com/animals-and-habitats/animals/: (7.1542034454268e-05)
https://kids.nationalgeographic.com/animals/mammals: (6.272742133822128e-05)
https://www.awf.org/living-with-wildlife/supporting-resilient-people: (5.804527880250278e-05)
https://www.awf.org/caring-for-wildlife/applying-conservation-science: (5.804527880250278e-05)
https://www.awf.org/about/history: (5.804527880250278e-05)
https://www.awf.org/caring-for-wildlife/focal-wildlife-species: (5.804527880250278e-05)
https://animalsaustralia.org/our-work/inspiring-stories/: (5.226785740228932e-05)
https://animalsaustralia.org/our-work/emergency-grants-program/: (5.226785740228932e-05)
https://www.britannica.com/browse/Mammals: (4.801379062164006e-05