<a href="https://colab.research.google.com/github/Mehadi4021/CSE426_Data_Mining_and_Warehouse_Lab/blob/main/Project03_Building_a_Domain_Specific_Search_Engine_with_Crawling_and_Link_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

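Stopwords are very common words (such as "the", "is" and "and") that carry little meaning for search. They are filtered out while the inverted index is built, so they never appear as index terms.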

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stopword list. It is kept as a list because a later cell extends it
# in place with custom entries.
STOPWORDS = stopwords.words('english')
print(STOPWORDS)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Add custom stopwords here if you find it necessary.

In [None]:
custom_STOPWORDS = [] # Add your own stopwords here

# Page text is lowercased before tokens are compared against STOPWORDS, so
# custom entries are stored lowercased as well. Entries that are already in the
# list (or repeated in custom_STOPWORDS) are skipped, which keeps this cell
# idempotent: re-running it does not grow STOPWORDS with duplicates.
STOPWORDS.extend(dict.fromkeys(
    entry.lower() for entry in custom_STOPWORDS if entry.lower() not in STOPWORDS
))

In [None]:
from collections import defaultdict

# Inverted index: word -> set of URLs whose page text contains that word.
# defaultdict(set) creates the empty set on first access to a new word.
inverted_index = defaultdict(set)
# URLs of crawled pages that contributed at least one word to the index.
url_list = set()

In [None]:
# Edge list of the link graph, stored as two parallel lists:
# web_connection['source'][i] is the page on which a link was found and
# web_connection['target'][i] is the URL that link points to.
web_connection = {'source':[], 'target':[]}

In [None]:
import re

def clean_and_tokenize(text, stopwords=None):
    """Turn raw page text into the list of terms stored in the inverted index.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is treated as a word separator, and the result is split
    on whitespace. Stopwords and single-character tokens are dropped.

    Args:
        text: Raw text of a page.
        stopwords: Optional iterable of words to drop. When omitted, the
            notebook-level STOPWORDS list is used.

    Returns:
        The remaining tokens as a list, in their original order (duplicates
        are kept).
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # A set makes each membership test constant-time instead of a scan of the
    # whole stopword list for every token.
    stopword_set = set(stopwords)

    # Punctuation is replaced by a space rather than deleted. Deleting it fuses
    # neighbouring words ("well-known" -> "wellknown", "score,update" ->
    # "scoreupdate") and turns contractions into tokens that slip past the
    # stopword filter ("don't" -> "dont"); splitting yields "don" and "t",
    # which the stopword/length filter below can remove.
    normalized = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())

    return [
        token
        for token in normalized.split()
        if len(token) > 1 and token not in stopword_set
    ]

In [None]:
from urllib.parse import urljoin, urlparse

def crawl(url, base_domain, visited, visit_limit, limit, depth=0):
    """Depth-first crawl of one site that fills the index and the link graph.

    The page at `url` is fetched, its text is tokenized into `inverted_index`,
    every HTTP(S) link found on it is appended to `web_connection`, and links
    that stay on `base_domain` are followed recursively.

    Args:
        url: The URL to crawl.
        base_domain: Network location (as returned by urlparse(...).netloc) the
            crawl is restricted to. Links to other domains are recorded as
            edges but never followed.
        visited: Set of URLs already fetched successfully; updated in place.
        visit_limit: Stop crawling once `visited` holds this many URLs.
        limit: Remaining recursion depth; the crawl stops when it reaches 0.
        depth: Current recursion depth. It only controls the indentation of the
            progress output, and callers normally leave it at its default.

    Returns:
        None. Results are written to `visited`, `inverted_index`, `url_list`
        and `web_connection`.
    """
    if limit <= 0 or len(visited) >= visit_limit:
        return

    # NOTE(review): URLs that fail here are not remembered, so a broken link
    # that appears on many pages is requested again each time it is seen.
    try:
        response = requests.get(url, timeout=5)
    except requests.RequestException:
        return
    if response.status_code != 200:
        return

    visited.add(url)
    # One dash per recursion level. Tracking the depth explicitly keeps the
    # indentation correct for any starting `limit`, not only for 10.
    print("-" * depth, end=" ")
    print(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    words = clean_and_tokenize(soup.get_text(separator=' ', strip=True))

    for word in words:
        inverted_index[word].add(url)
    # A page is listed only if it contributed at least one term.
    if words:
        url_list.add(url)

    # Local import: the notebook's import cell only brings in urljoin/urlparse.
    from urllib.parse import urldefrag

    # Record every outgoing link as a graph edge and follow the on-domain ones.
    for tag in soup.find_all('a', href=True):
        # Drop "#fragment" so in-page anchors do not count as distinct pages.
        link = urldefrag(urljoin(url, tag['href'])).url
        parsed = urlparse(link)

        # Skip pseudo-links such as "javascript:void(0)", "mailto:" or "tel:";
        # they are not pages and would only add noise nodes to the link graph.
        if parsed.scheme not in ('http', 'https'):
            continue

        web_connection['source'].append(url)
        web_connection['target'].append(link)

        if parsed.netloc == base_domain and link not in visited:
            crawl(link, base_domain, visited, visit_limit, limit - 1, depth + 1)

In [None]:
def crawl_roots(root_urls, max_per_root=2, visit_limit=50):
    """Run a separate crawl from each seed URL.

    Every seed gets its own empty visited set, and its own network location is
    passed to crawl() as the domain to stay on.

    Args:
        root_urls: Iterable of seed URLs.
        max_per_root: Forwarded to crawl() as its `limit` argument.
        visit_limit: Forwarded to crawl() as its `visit_limit` argument.
    """
    for seed in root_urls:
        print(f"\nStarting crawl from: {seed}")
        crawl(seed, urlparse(seed).netloc, set(), visit_limit, max_per_root)

In [None]:
# Seed pages for the crawl. Each seed is crawled independently, and the crawler
# only follows links that stay on that seed's own domain.
seed_urls = [
    'https://www.mykhel.com/cricket/ban-vs-zim-shadman-islam-shines-with-gritty-100-anchors-bangladesh-to-commanding-lead-on-day-2-in-358519.html',
    'https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim',
    'https://gulfnews.com/sport/cricket-prodigy-vaibhav-suryavanshi-smashes-ipl-record-at-14-wins-hearts-of-legends-and-bollywood-icons-1.500109736',
    'https://www.espncricinfo.com/cricket-news',
    'https://sports.ndtv.com/cricket/news',
    'https://www.hindustantimes.com/cricket',
    'https://www.bbc.com/sport/cricket',
    'https://www.icc-cricket.com/news',
    'https://indianexpress.com/section/sports/cricket/',
    'https://www.news18.com/cricket/',
    'https://www.cricket.com.au/news'
]

# max_per_root is forwarded to crawl() as its recursion limit; visit_limit
# keeps its default of 50 pages per seed.
crawl_roots(seed_urls, max_per_root=10)


Starting crawl from: https://www.mykhel.com/cricket/ban-vs-zim-shadman-islam-shines-with-gritty-100-anchors-bangladesh-to-commanding-lead-on-day-2-in-358519.html

Starting crawl from: https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim
 Crawled: https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim
- Crawled: https://www.cricbuzz.com/
-- Crawled: https://www.cricbuzz.com/cricket-match/live-scores
--- Crawled: https://www.cricbuzz.com/cricket-schedule/upcoming-series/international
---- Crawled: https://www.cricbuzz.com/cricket-scorecard-archives
----- Crawled: https://www.cricbuzz.com/cricket-news
------ Crawled: https://www.cricbuzz.com/cricket-news/editorial/cb-plus
------- Crawled: https://www.cricbuzz.com/cricket-news/latest-news
-------- Crawled: https://www.cricbuzz.com/cricket-news/info/
--------- Crawled: https://www.cricbuzz.com/cricket-news/editorial/spotlight
---

In [None]:
# Preview the inverted index: the first 20 terms with the URLs they occur on.
print("\nSample inverted index (first 20 words):")
for word, urls in list(inverted_index.items())[:20]:
    print(f"{word}: {list(urls)}")


Sample inverted index (first 20 words):
confident: ['https://www.cricbuzz.com/cricket-news/latest-news', 'https://www.cricbuzz.com/cricket-team/zimbabwe/12', 'https://www.cricbuzz.com/', 'https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim', 'https://www.icc-cricket.com/news/team/1', 'https://www.cricbuzz.com/cricket-team/sri-lanka/5', 'https://www.cricbuzz.com/cricket-series/9514/zimbabwe-tour-of-bangladesh-2025', 'https://www.icc-cricket.com/tournaments/champions-trophy-2025', 'https://www.icc-cricket.com/news/team/15', 'https://indianexpress.com/section/world/', 'https://www.icc-cricket.com/tournaments/womens-t20-worldcup/index', 'https://gulfnews.com/gn-reach', 'https://www.icc-cricket.com/news/category/u19-cricket-world-cup', 'https://www.cricbuzz.com/cricket-team/bangladesh/6']
put: ['https://www.cricbuzz.com/cricket-team/zimbabwe/12', 'https://indianexpress.com/section/sports/', 'https://www.bbc.com/news/uk', 'https://ww

In [None]:
# Preview the link graph: the first 20 recorded (source -> target) edges.
first_sources = web_connection['source'][:20]
first_targets = web_connection['target'][:20]
for source, target in zip(first_sources, first_targets):
    print(f"{source} -> {target}")

https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim -> https://plus.google.com/104502282508811467249
https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim -> Javascript:void(0)
https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim -> Javascript:void(0)
https://www.cricbuzz.com/cricket-news/134203/confident-that-we-can-put-bangladesh-under-pressure-dion-ebrahim -> https://www.cricbuzz.com/
https://www.cricbuzz.com/ -> https://plus.google.com/104502282508811467249
https://www.cricbuzz.com/ -> Javascript:void(0)
https://www.cricbuzz.com/ -> Javascript:void(0)
https://www.cricbuzz.com/ -> https://www.cricbuzz.com/
https://www.cricbuzz.com/ -> https://www.cricbuzz.com/cricket-match/live-scores
https://www.cricbuzz.com/cricket-match/live-scores -> https://plus.google.com/104502282508811467249
https://www.cricbuzz.com/cricket

In [None]:
import networkx as nx

# Build the directed link graph: one edge per recorded (source, target) pair.
# A DiGraph keeps a single edge per ordered pair, so repeated links collapse.
web_graph = nx.DiGraph()
for src, dst in zip(web_connection["source"], web_connection["target"]):
    web_graph.add_edge(src, dst)

In [None]:
# Number of distinct URLs (nodes) in the link graph.
len(web_graph.nodes)

7711

In [None]:
# PageRank over the link graph: maps every node (URL) to its score.
# alpha is the damping factor; iteration stops after max_iter rounds or once
# the change drops below tol.
pagerank_scores = nx.pagerank(web_graph, alpha=0.85, max_iter=100, tol=1e-6)

# Show only the highest-ranked pages. The graph has thousands of nodes, so
# printing the whole dict would flood the notebook output.
top_ranked = sorted(pagerank_scores.items(), key=lambda item: item[1], reverse=True)[:10]
print("\nPageRank Scores (top 10):")
for ranked_url, ranked_score in top_ranked:
    print(f"{ranked_url}: {ranked_score}")




In [None]:
def search_engine(query, index, scores):
    """Return the pages that contain every indexed query term, best score first.

    The query is lowercased and split on whitespace. Terms that do not occur in
    `index` are ignored; the remaining terms are combined with AND semantics,
    i.e. a page must be listed under each of them.

    Args:
        query: Free-text query string.
        index: Mapping of term -> collection of URLs containing that term.
        scores: Mapping of URL -> numeric ranking score. Matching pages that
            have no score are left out of the result.

    Returns:
        List of (url, score) tuples sorted by descending score. Pages with
        equal scores are ordered by URL so the result is deterministic.
    """
    # None means "no indexed term seen yet". Using an empty set for that would
    # be ambiguous: once an intersection became empty, the next term would
    # wrongly start a fresh result set instead of keeping the result empty.
    matches = None
    for term in query.lower().split():
        if term not in index:
            continue
        if matches is None:
            matches = set(index[term])
        else:
            matches = matches.intersection(index[term])
        if not matches:
            # No page contains all terms seen so far; further terms cannot
            # bring any page back.
            break

    if not matches:
        return []

    ranked_results = [(page, scores[page]) for page in matches if page in scores]
    # Highest score first; URL as tie-breaker because set order is arbitrary.
    ranked_results.sort(key=lambda pair: (-pair[1], pair[0]))
    return ranked_results

In [None]:
# Query and display results
# The query is matched against the inverted index and the matching pages are
# ranked by their PageRank score (highest first).
query = "Virat Kohli"
print(f"\nSearch Results for '{query}' using PageRank:")
results = search_engine(query, inverted_index, pagerank_scores)

# One line per matching page: URL followed by its score in parentheses.
for page, score in results:
    print(f"{page}: ({score})")


Search Results for 'Virat Kohli' using PageRank:
https://www.icc-cricket.com/tournaments/world-test-championship/: (0.00018118724310883016)
https://www.icc-cricket.com/tournaments/champions-trophy-2025: (0.0001798901412613605)
https://www.icc-cricket.com/tournaments/t20cricketworldcup/: (0.00017982517305346763)
https://www.cricbuzz.com/cricket-news/editorial/interviews: (0.0001785810369616234)
https://www.cricbuzz.com/cricket-news/latest-news: (0.0001785810369616234)
https://www.cricbuzz.com/cricket-series/9237/indian-premier-league-2025: (0.0001785810369616234)
https://www.icc-cricket.com/videos/: (0.00017845837620627267)
https://www.news18.com/cricket/videos/: (0.0001709785633539953)
https://www.news18.com/notifications/: (0.0001709785633539953)
https://www.news18.com/movies/: (0.0001709785633539953)
https://www.news18.com/: (0.0001709785633539953)
https://www.news18.com/cricket/live-score/: (0.0001709785633539953)
https://www.news18.com/cricket/cricket-schedule/: (0.000170978563353