In [1]:
import requests
from bs4 import BeautifulSoup
import re
import difflib
import webbrowser
import time
from urllib.parse import urljoin, urlparse

In [2]:
STANDARD_TAGS = {
    "html", "head", "title", "meta", "link", "style", "script", "body",
    "h1", "h2", "h3", "h4", "h5", "h6", "p", "br", "hr", "a", "img", "table",
    "tr", "td", "th", "ul", "ol", "li", "nav", "header", "footer", "section",
    "article", "aside", "figure", "figcaption", "div", "span", "form", "input",
    "button", "label", "select", "option", "textarea", "iframe", "video", "audio",
    "source", "canvas", "details", "summary", "dialog", "embed", "object"
}

DEPRECATED_TAGS = {"font", "marquee", "center"}

THREAT_PATTERNS = [
    r"eval\(",
    r"document\.write\(",
    r"display\s*:\s*none",
    r"visibility\s*:\s*hidden"
]

OUTDATED_LIBRARIES = {
    "jquery": "3.5.0",
    "bootstrap": "4.5.0"
}

In [3]:
def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def find_private_tags(soup):
    all_tags = {tag.name for tag in soup.find_all()}
    private_tags = all_tags - STANDARD_TAGS
    return private_tags

def clean_html(html, private_tags):
    for tag in private_tags:
        html = re.sub(rf'<{tag}.*?>.*?</{tag}>', '', html, flags=re.DOTALL)
        html = re.sub(rf'<{tag}.*?/>', '', html, flags=re.DOTALL)
    return html

In [4]:
def extract_links(url, soup):
    base_url = "{0.scheme}://{0.netloc}".format(urlparse(url))
    links = {urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)}
    return links

def generate_sitemap(url, links, i):
    sitemap_content = '<?xml version="1.0" encoding="UTF-8"?>\n'
    sitemap_content += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'

    for link in sorted(links):
        sitemap_content += f"  <url><loc>{link}</loc></url>\n"

    sitemap_content += '</urlset>'

    with open(f"sitemap{i}.xml", "w", encoding="utf-8") as file:
        file.write(sitemap_content)

    print(f"Sitemap generated and saved as 'sitemap{i}.xml'. Opening in browser...")
    webbrowser.open("sitemap.xml")

In [5]:
url = "https://webs.iiitd.edu.in/raghava/toxinpred3/"
start_time = time.time()

html_content = fetch_html(url)

soup_original = BeautifulSoup(html_content, 'lxml')
private_tags = find_private_tags(soup_original)

print("Private/Custom Tags Found: ")
if private_tags:
    for tag in private_tags:
        print(f"<{tag}>")
else:
    print("No private tags found.")

Private/Custom Tags Found: 
<strong>


In [6]:
cleaned_html = clean_html(html_content, private_tags)
soup_cleaned = BeautifulSoup(cleaned_html, 'lxml')

print("Original HTML: ")
print(soup_original.prettify())

Original HTML: 
<!DOCTYPE HTML>
<!--
	Telephasic by HTML5 UP
	html5up.net | @ajlkn
	Free for personal and commercial use under the CCA 3.0 license (html5up.net/license)
-->
<html>
 <head>
  <title>
   ToxinPred3.0
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="assets/css/main.css" rel="stylesheet"/>
 </head>
 <body class="homepage is-preload">
  <div id="page-wrapper">
   <!-- Header -->
   <div id="header-wrapper">
    <div class="container">
     <h3 style="color:white;font-weight:bolt; text-align:center;">
      <a href="index.html">
       ToxinPred3.0
      </a>
      - A webserver to predict toxic and non-toxic peptide
     </h3>
    </div>
    <div class="container" id="header">
     <style>
      #nav ul li a:hover {
    background-color: #1c4e71;
}
     </style>
     <!-- Nav -->
     <nav id="nav" style="position:relative; width:105%;">
      <ul>
       <li>
        <a href="i

In [7]:
print("Cleaned HTML: ")
print(soup_cleaned.prettify())

Cleaned HTML: 
<!DOCTYPE HTML>
<!--
	Telephasic by HTML5 UP
	html5up.net | @ajlkn
	Free for personal and commercial use under the CCA 3.0 license (html5up.net/license)
-->
<html>
 <head>
  <title>
   ToxinPred3.0
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="assets/css/main.css" rel="stylesheet"/>
 </head>
 <body class="homepage is-preload">
  <div id="page-wrapper">
   <!-- Header -->
   <div id="header-wrapper">
    <div class="container">
     <h3 style="color:white;font-weight:bolt; text-align:center;">
      <a href="index.html">
       ToxinPred3.0
      </a>
      - A webserver to predict toxic and non-toxic peptide
     </h3>
    </div>
    <div class="container" id="header">
     <style>
      #nav ul li a:hover {
    background-color: #1c4e71;
}
     </style>
     <!-- Nav -->
     <nav id="nav" style="position:relative; width:105%;">
      <ul>
       <li>
        <a href="in

In [8]:
diff = difflib.unified_diff(
    soup_original.prettify().splitlines(),
    soup_cleaned.prettify().splitlines(),
    fromfile='Original HTML',
    tofile='Cleaned HTML',
    lineterm=''
)

print("Differences: ")
print("\n".join(diff) if private_tags else "No differences found.")

Differences: 
--- Original HTML
+++ Cleaned HTML
@@ -36,58 +36,34 @@
       <ul>
        <li>
         <a href="index.html" style="color: white;">
-         <strong>
-          Home
-         </strong>
         </a>
        </li>
        <li>
         <a href="prediction.php">
-         <strong>
-          Prediction
-         </strong>
         </a>
        </li>
        <li>
         <a href="design.php">
-         <strong>
-          Design
-         </strong>
         </a>
        </li>
        <li>
         <a href="protein.php">
-         <strong>
-          Protein Scanning
-         </strong>
         </a>
        </li>
        <li>
         <a href="motif.php">
-         <strong>
-          Motif Scan
-         </strong>
         </a>
        </li>
        <li>
         <a href="blast.php">
-         <strong>
-          BLAST search
-         </strong>
         </a>
        </li>
        <li>
         <a href="download.php">
-         <strong>
-          Download
-         </s

In [9]:
links = extract_links(url, soup_original)

if links:
    generate_sitemap(url, links, 1)
else:
    print("No valid links found for sitemap.")

print(f"Execution Time: {time.time() - start_time:.2f} seconds.")

Sitemap generated and saved as 'sitemap1.xml'. Opening in browser...
Execution Time: 0.91 seconds.


In [10]:
start_time = time.time()
html_content = fetch_html(url)

soup_original = BeautifulSoup(html_content, 'html.parser')

private_tags = find_private_tags(soup_original)

print("Private/Custom Tags Found: ")
if private_tags:
    for tag in private_tags:
        print(f"<{tag}>")
else:
    print("No private tags found.")

Private/Custom Tags Found: 
<strong>
<font>
<b>


In [11]:
cleaned_html = clean_html(html_content, private_tags)
soup_cleaned = BeautifulSoup(cleaned_html, 'html.parser')

print("Original HTML: ")
print(soup_original.prettify())

Original HTML: 
<!DOCTYPE HTML>
<!--
	Telephasic by HTML5 UP
	html5up.net | @ajlkn
	Free for personal and commercial use under the CCA 3.0 license (html5up.net/license)
-->
<html>
 <head>
  <title>
   ToxinPred3.0
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="assets/css/main.css" rel="stylesheet"/>
 </head>
 <body class="homepage is-preload">
  <div id="page-wrapper">
   <!-- Header -->
   <div id="header-wrapper">
    <div class="container">
     <h3 style="color:white;font-weight:bolt; text-align:center;">
      <a href="index.html">
       ToxinPred3.0
      </a>
      - A webserver to predict toxic and non-toxic peptide
     </h3>
    </div>
    <div class="container" id="header">
     <style>
      #nav ul li a:hover {
    background-color: #1c4e71;
}
     </style>
     <!-- Nav -->
     <nav id="nav" style="position:relative; width:105%;">
      <ul>
       <li>
        <a href="i

In [12]:
print("Cleaned HTML: ")
print(soup_cleaned.prettify())

Cleaned HTML: 
<!DOCTYPE HTML>
<!--
	Telephasic by HTML5 UP
	html5up.net | @ajlkn
	Free for personal and commercial use under the CCA 3.0 license (html5up.net/license)
-->
<html>
 <head>
  <title>
   ToxinPred3.0
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
  <link href="assets/css/main.css" rel="stylesheet"/>
 </head>
 <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
 <link href="assets/css/main.css" rel="stylesheet"/>
 <body class="homepage is-preload">
  <!-- Header -->
  <div id="footer-wrapper">
   <div class="container" id="footer">
    <ul class="menu" style="text-align:center;">
     <li>
      <a href="https://webs.iiitd.edu.in/raghava/toxinpred2/" style="color:white;" target="_blank">
       ToxinPred2
      </a>
     </li>
     <li>
      <a href="https://webs.iiitd.edu.in/raghava/toxinpred/" style="color:white;" target="_blank">
       ToxinPred
     

In [13]:
diff = difflib.unified_diff(
    soup_original.prettify().splitlines(),
    soup_cleaned.prettify().splitlines(),
    fromfile='Original HTML',
    tofile='Cleaned HTML',
    lineterm=''
)

print("Differences: ")
print("\n".join(diff) if private_tags else "No differences found.")

Differences: 
--- Original HTML
+++ Cleaned HTML
@@ -13,276 +13,8 @@
   <meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
   <link href="assets/css/main.css" rel="stylesheet"/>
  </head>
- <body class="homepage is-preload">
-  <div id="page-wrapper">
-   <!-- Header -->
-   <div id="header-wrapper">
-    <div class="container">
-     <h3 style="color:white;font-weight:bolt; text-align:center;">
-      <a href="index.html">
-       ToxinPred3.0
-      </a>
-      - A webserver to predict toxic and non-toxic peptide
-     </h3>
-    </div>
-    <div class="container" id="header">
-     <style>
-      #nav ul li a:hover {
-    background-color: #1c4e71;
-}
-     </style>
-     <!-- Nav -->
-     <nav id="nav" style="position:relative; width:105%;">
-      <ul>
-       <li>
-        <a href="index.html" style="color: white;">
-         <strong>
-          Home
-         </strong>
-        </a>
-       </li>
-       <li>
-        <a href="prediction.php

In [14]:
links = extract_links(url, soup_original)

if links:
    generate_sitemap(url, links, 2)
else:
    print("No valid links found for sitemap.")

print(f"Execution Time: {time.time() - start_time:.2f} seconds.")

Sitemap generated and saved as 'sitemap2.xml'. Opening in browser...
Execution Time: 0.94 seconds.


In [15]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# Website URL and sitemap file
website_url = "https://webs.iiitd.edu.in/raghava/toxinpred3/"

def fetch_sitemap(sitemap_path):
    try:
        with open(sitemap_path, "r", encoding="utf-8") as file:
            tree = ET.parse(file)
            root = tree.getroot()
            urls = [elem.text for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
            return set(urls)
    except Exception as e:
        print(f"Error reading sitemap: {e}")
        return set()

def fetch_indexed_pages(website_url, tex):
    try:
        response = requests.get(website_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, tex)

        indexed_urls = set()
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.startswith("http"):  # Absolute URL
                indexed_urls.add(href)
            elif href.startswith("/"):  # Relative URL
                indexed_urls.add(website_url.rstrip("/") + href)

        return indexed_urls
    except requests.RequestException as e:
        print(f"Error fetching website index: {e}")
        return set()

In [16]:
sitemap_file = "/content/sitemap1.xml"

sitemap_urls = fetch_sitemap(sitemap_file)
indexed_urls = fetch_indexed_pages(website_url, tex='lxml')

extra_in_sitemap = sitemap_urls - indexed_urls
missing_in_sitemap = indexed_urls - sitemap_urls

print("URLs in Sitemap but NOT found on the website:")
for url in extra_in_sitemap:
    print(f"- {url}")

print("URLs on Website but MISSING in Sitemap:")
for url in missing_in_sitemap:
    print(f"- {url}")

print("Total URLs in Sitemap:", len(sitemap_urls))
print("Total Indexed URLs:", len(indexed_urls))

URLs in Sitemap but NOT found on the website:
- https://webs.iiitd.edu.in/contact.php
- https://webs.iiitd.edu.in/index.html
- https://webs.iiitd.edu.in/motif.php
- https://webs.iiitd.edu.in/protein.php
- https://webs.iiitd.edu.in/help.html
- https://webs.iiitd.edu.in/prediction.php
- https://webs.iiitd.edu.in/team.php
- https://webs.iiitd.edu.in/design.php
- https://webs.iiitd.edu.in/blast.php
- https://webs.iiitd.edu.in
- https://webs.iiitd.edu.in/download.php
URLs on Website but MISSING in Sitemap:
Total URLs in Sitemap: 11
Total Indexed URLs: 0


In [17]:
sitemap_file = "/content/sitemap2.xml"

sitemap_urls = fetch_sitemap(sitemap_file)
indexed_urls = fetch_indexed_pages(website_url, tex='html.parser')

extra_in_sitemap = sitemap_urls - indexed_urls
missing_in_sitemap = indexed_urls - sitemap_urls

print("URLs in Sitemap but NOT found on the website:")
for url in extra_in_sitemap:
    print(f"- {url}")

print("URLs on Website but MISSING in Sitemap:")
for url in missing_in_sitemap:
    print(f"- {url}")

print("Total URLs in Sitemap:", len(sitemap_urls))
print("Total Indexed URLs:", len(indexed_urls))

URLs in Sitemap but NOT found on the website:
- https://webs.iiitd.edu.in/contact.php
- https://webs.iiitd.edu.in/raghava/toxinpred3/algo.html
- https://webs.iiitd.edu.in/protein.php
- https://webs.iiitd.edu.in/index.html
- https://webs.iiitd.edu.in/raghava/toxinpred3/contact.php
- https://webs.iiitd.edu.in/help.html
- https://webs.iiitd.edu.in/prediction.php
- https://webs.iiitd.edu.in
- https://webs.iiitd.edu.in/team.php
- https://webs.iiitd.edu.in/raghava/toxinpred2
- https://webs.iiitd.edu.in/raghava/toxinpred3/download.php
- https://webs.iiitd.edu.in/raghava/toxinpred
- https://webs.iiitd.edu.in/design.php
- https://webs.iiitd.edu.in/blast.php
- https://webs.iiitd.edu.in/motif.php
- https://webs.iiitd.edu.in/raghava/toxinpred3/help.html
- https://webs.iiitd.edu.in/download.php
- https://webs.iiitd.edu.in/raghava/toxinpred3/team.php
URLs on Website but MISSING in Sitemap:
- https://webs.iiitd.edu.in/raghava/toxinpred3/raghava/toxinpred3/help.html
- https://webs.iiitd.edu.in/raghava

In [18]:
sitemap1_path = "/content/sitemap1.xml"
sitemap2_path = "/content/sitemap2.xml"

sitemap1_urls = fetch_sitemap(sitemap1_path)
sitemap2_urls = fetch_sitemap(sitemap2_path)

indexed_urls1 = fetch_indexed_pages(website_url, tex='lxml')
indexed_urls2 = fetch_indexed_pages(website_url, tex='html.parser')

extra_in_sitemap1 = sitemap1_urls - indexed_urls1
extra_in_sitemap2 = sitemap2_urls - indexed_urls2
missing_in_sitemap1 = indexed_urls1 - sitemap1_urls
missing_in_sitemap2 = indexed_urls2 - sitemap2_urls

sitemap1_coverage = len(sitemap1_urls & indexed_urls1) / max(len(indexed_urls1), 1) * 100
sitemap2_coverage = len(sitemap2_urls & indexed_urls2) / max(len(indexed_urls2), 1) * 100

print("Sitemap Accuracy:")
print(f"Sitemap1 Coverage: {sitemap1_coverage:.2f}%")
print(f"Sitemap2 Coverage: {sitemap2_coverage:.2f}%")

print("Total URLs in Sitemap1:", len(sitemap1_urls))

print("Total URLs in Sitemap2:", len(sitemap2_urls))
print("Total Indexed URLs:", len(indexed_urls2))

Sitemap Accuracy:
Sitemap1 Coverage: 0.00%
Sitemap2 Coverage: 56.25%
Total URLs in Sitemap1: 11
Total URLs in Sitemap2: 27
Total Indexed URLs: 16


Fetch Sitemap: 
Using xml.etree.ElementTree extracts all <loc> tags (which contain URLs) from the sitemap.
Returns a set of URLs found in the sitemap.
Handles errors by printing an error message and returning an empty set if the file cannot be read.

Crawl Website:
Uses a breadth-first search approach to keep track of visited pages.
Iterates until max_pages are visited or to_visit is empty.
Fetches each URL using requests.get() with a user-agent header to avoid bot detection.
Returns a set of visited URLs.

In [21]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

website_url = "https://webs.iiitd.edu.in/raghava/toxinpred3/"
sitemap_file = "/content/sitemap2.xml"

def fetch_sitemap(sitemap_path):
    try:
        with open(sitemap_path, "r", encoding="utf-8") as file:
            tree = ET.parse(file)
            root = tree.getroot()
            urls = {elem.text.strip() for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")}
            return urls
    except Exception as e:
        print(f"Error reading sitemap: {e}")
        return set()

def crawl_website(start_url, max_pages=100):
    visited = set()
    to_visit = {start_url}

    while to_visit and len(visited) < max_pages:
        url = to_visit.pop()
        if url in visited:
            continue
        try:
            response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            visited.add(url)
            for link in soup.find_all("a", href=True):
                href = link["href"]
                full_url = urljoin(start_url, href)
                if full_url.startswith(start_url) and full_url not in visited:
                    to_visit.add(full_url)

        except requests.RequestException:
            continue
    return visited

In [23]:
sitemap_urls = fetch_sitemap(sitemap_file)
website_urls = crawl_website(website_url)

missing_in_sitemap = website_urls - sitemap_urls
extra_in_sitemap = sitemap_urls - website_urls

i = 1
print("URLs on Website but MISSING in Sitemap:")
for url in missing_in_sitemap:
    print(f"{i} - {url}")
    i += 1

i += 1
print("URLs in Sitemap but NOT found on Website:")
for url in extra_in_sitemap:
    print(f"{i} - {url}")
    i += 1

print("Total URLs in Sitemap:", len(sitemap_urls))
print("Total Crawled URLs on Website:", len(website_urls))

URLs on Website but MISSING in Sitemap:
1 - https://webs.iiitd.edu.in/raghava/toxinpred3/help.html#pred
2 - https://webs.iiitd.edu.in/raghava/toxinpred3/download/test_pos.csv
3 - https://webs.iiitd.edu.in/raghava/toxinpred3/download/train_pos.csv
4 - https://webs.iiitd.edu.in/raghava/toxinpred3/protein.php
5 - https://webs.iiitd.edu.in/raghava/toxinpred3/
6 - https://webs.iiitd.edu.in/raghava/toxinpred3/motif.php
7 - https://webs.iiitd.edu.in/raghava/toxinpred3/index.html
8 - https://webs.iiitd.edu.in/raghava/toxinpred3/download/test_neg.csv
9 - https://webs.iiitd.edu.in/raghava/toxinpred3/blast.php
10 - https://webs.iiitd.edu.in/raghava/toxinpred3/design.php
11 - https://webs.iiitd.edu.in/raghava/toxinpred3/help.html#blast
12 - https://webs.iiitd.edu.in/raghava/toxinpred3/download/train_neg.csv
13 - https://webs.iiitd.edu.in/raghava/toxinpred3/prediction.php
14 - https://webs.iiitd.edu.in/raghava/toxinpred3/download/toxinpred3.Tar.Gz
15 - https://webs.iiitd.edu.in/raghava/toxinpred3/h