In [1]:
import requests
from bs4 import BeautifulSoup
import re
import difflib
import webbrowser
import time
from urllib.parse import urljoin, urlparse

In [2]:
STANDARD_TAGS = {
    "html", "head", "title", "meta", "link", "style", "script", "body",
    "h1", "h2", "h3", "h4", "h5", "h6", "p", "br", "hr", "a", "img", "table",
    "tr", "td", "th", "ul", "ol", "li", "nav", "header", "footer", "section",
    "article", "aside", "figure", "figcaption", "div", "span", "form", "input",
    "button", "label", "select", "option", "textarea", "iframe", "video", "audio",
    "source", "canvas", "details", "summary", "dialog", "embed", "object"
}

DEPRECATED_TAGS = {"font", "marquee", "center"}

THREAT_PATTERNS = [
    r"eval\(",
    r"document\.write\(",
    r"display\s*:\s*none",
    r"visibility\s*:\s*hidden"
]

OUTDATED_LIBRARIES = {
    "jquery": "3.5.0",
    "bootstrap": "4.5.0"
}

In [3]:
def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def find_private_tags(soup):
    all_tags = {tag.name for tag in soup.find_all()}
    private_tags = all_tags - STANDARD_TAGS
    return private_tags

def clean_html(html, private_tags):
    for tag in private_tags:
        html = re.sub(rf'<{tag}.*?>.*?</{tag}>', '', html, flags=re.DOTALL)
        html = re.sub(rf'<{tag}.*?/>', '', html, flags=re.DOTALL)
    return html

Extract Links: Extracts all hyperlinks
Genereate SiteMap: Generates an XML sitemap from the given links set

In [4]:
def extract_links(url, soup):
    base_url = "{0.scheme}://{0.netloc}".format(urlparse(url))
    links = {urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)}
    return links

def generate_sitemap(url, links, i):
    sitemap_content = '<?xml version="1.0" encoding="UTF-8"?>\n'
    sitemap_content += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'

    for link in sorted(links):
        sitemap_content += f"  <url><loc>{link}</loc></url>\n"

    sitemap_content += '</urlset>'

    with open(f"sitemap{i}.xml", "w", encoding="utf-8") as file:
        file.write(sitemap_content)

    print(f"Sitemap generated and saved as 'sitemap{i}.xml'. Opening in browser...")
    webbrowser.open("sitemap.xml")

Check Seo Rules: checks title, meta description, H1 tags, image attributes, Mobile Friendly Viewport

In [5]:
def check_seo_rules(soup, url):
    print("SEO RULES CHECKER: ")

    title = soup.title.string if soup.title else None
    print(f"Title: {title[:60] if title else 'Missing'}")

    meta_desc = soup.find("meta", attrs={"name": "description"})
    desc_content = meta_desc["content"] if meta_desc else "Missing"
    print(f"Meta Description: {desc_content[:160]}")

    h1_tag = soup.find("h1")
    print(f"H1 Tag: {h1_tag.text.strip() if h1_tag else 'Missing'}")

    robots_url = urljoin(url, "/robots.txt")
    robots_response = requests.get(robots_url)
    print(f"robots.txt Found: {robots_response.status_code == 200}")

    images = soup.find_all("img")
    missing_alt = sum(1 for img in images if not img.get("alt"))
    print(f"Images without alt attributes: {missing_alt}")

    viewport = soup.find("meta", attrs={"name": "viewport"})
    print(f"Mobile-Friendly (Viewport): {'Yes' if viewport else 'Missing'}")

Check Threats: Threat Pattern Checking, Deprecated HTML Tags Checking, Outdated JavaScript Libraries Detection

In [6]:
def check_threats(soup, html):
    print("SECURITY & THREAT CHECKER: ")

    for pattern in THREAT_PATTERNS:
        if re.search(pattern, html, re.IGNORECASE):
            print(f"Threat Found: {pattern}")

    found_deprecated = {tag for tag in soup.find_all() if tag.name in DEPRECATED_TAGS}
    if found_deprecated:
        print(f"Deprecated Tags Found: {', '.join(f'<{tag.name}>' for tag in found_deprecated)}")

    scripts = soup.find_all("script", src=True)
    for script in scripts:
        src = script["src"]
        for lib, min_version in OUTDATED_LIBRARIES.items():
            if lib in src and re.search(r"\d+\.\d+\.\d+", src):
                version = re.search(r"(\d+\.\d+\.\d+)", src).group(1)
                if version < min_version:
                    print(f"Outdated {lib} version detected: {version} (Upgrade to {min_version})")


In [7]:
url = "https://webs.iiitd.edu.in/raghava/toxinpred/"
start_time = time.time()

html_content = fetch_html(url)

soup_original = BeautifulSoup(html_content, 'lxml')
private_tags = find_private_tags(soup_original)

print("Private/Custom Tags Found: ")
if private_tags:
    for tag in private_tags:
        print(f"<{tag}>")
else:
    print("No private tags found.")

Private/Custom Tags Found: 
<marquee>
<b>
<dt>
<font>
<i>


In [8]:
cleaned_html = clean_html(html_content, private_tags)
soup_cleaned = BeautifulSoup(cleaned_html, 'lxml')

print("Original HTML: ")
print(soup_original.prettify())

Original HTML: 
<html lang="en">
 <head>
  <title>
   ToxinPred
  </title>
  <meta content="Designing and Prediction of toxic peptides" name="description"/>
  <meta content="toxic, designing, prediction, peptides, imtech, New Delhi" name="keywords"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
  <link href="style/style.css" rel="stylesheet" type="text/css"/>
  <style type="text/css">
   .thumbnail span{ /*CSS for enlarged image*/
position: absolute;
background-color: white;
padding: 5px;
bottom: -200px; /*added later*/
left: -100px;
border: 1px dashed gray;
visibility: hidden;
color: black;
text-decoration: none;
}
.thumbnail span img{ /*CSS for enlarged image*/
border-width: 0;
padding: 2px;
}
.thumbnail:hover span{ /*CSS for enlarged image on hover*/
visibility: visible;
top: 01;
/*bottom:100;*/
left: 370px; /*position where enlarged image should offset horizontally *

In [9]:
print("Cleaned HTML: ")
print(soup_cleaned.prettify())

Cleaned HTML: 
<html lang="en">
 <head>
  <title>
   ToxinPred
  </title>
  <meta content="Designing and Prediction of toxic peptides" name="description"/>
  <meta content="toxic, designing, prediction, peptides, imtech, New Delhi" name="keywords"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
  <link href="style/style.css" rel="stylesheet" type="text/css"/>
  <style type="text/css">
   .thumbnail span{ /*CSS for enlarged image*/
position: absolute;
background-color: white;
padding: 5px;
bottom: -200px; /*added later*/
left: -100px;
border: 1px dashed gray;
visibility: hidden;
color: black;
text-decoration: none;
}
.thumbnail span img{ /*CSS for enlarged image*/
border-width: 0;
padding: 2px;
}
.thumbnail:hover span{ /*CSS for enlarged image on hover*/
visibility: visible;
top: 01;
/*bottom:100;*/
left: 370px; /*position where enlarged image should offset horizontally */

In [10]:
diff = difflib.unified_diff(
    soup_original.prettify().splitlines(),
    soup_cleaned.prettify().splitlines(),
    fromfile='Original HTML',
    tofile='Cleaned HTML',
    lineterm=''
)

print("Differences: ")
print("\n".join(diff) if private_tags else "No differences found.")

Differences: 
--- Original HTML
+++ Cleaned HTML
@@ -33,216 +33,21 @@
   </style>
  </head>
  <body>
-  <div id="main">
-   <div id="header">
-    <div id="logo">
-     <div id="logo_text">
-      <h1>
-       <font size="10">
-        ToxinPred
-       </font>
-      </h1>
-      <h2>
-       <font size="5">
-        Designing and prediction of toxic peptides
-       </font>
-      </h2>
-     </div>
-    </div>
-    <div id="menubar">
-     <ul id="menu">
-      <li>
-       <a href="index.html">
-        Home
-       </a>
-      </li>
-      <li>
-       <a href="design.php">
-        Design Peptide
-       </a>
-      </li>
-      <li>
-       <a href="multi_submit.php">
-        Batch Submission
-       </a>
-      </li>
-      <li>
-       <a href="protein.php">
-        Protein Scanning
-       </a>
-      </li>
-      <li>
-       <a href="motif_scan.php">
-        Motif Scan
-       </a>
-      </li>
-      <li>
-       <a href="motif.php">
-        Motif List
-       </a>
-  

In [11]:
check_seo_rules(soup_original, url)
check_threats(soup_original, html_content)

SEO RULES CHECKER: 
Title: ToxinPred
Meta Description: Designing and Prediction of toxic peptides
H1 Tag: ToxinPred
robots.txt Found: True
Images without alt attributes: 0
Mobile-Friendly (Viewport): Yes
SECURITY & THREAT CHECKER: 
Threat Found: visibility\s*:\s*hidden
Deprecated Tags Found: <marquee>, <font>, <font>, <font>, <font>, <font>, <font>, <font>


In [12]:
links = extract_links(url, soup_original)

if links:
    generate_sitemap(url, links, 1)
else:
    print("No valid links found for sitemap.")

print(f"Execution Time: {time.time() - start_time:.2f} seconds.")

Sitemap generated and saved as 'sitemap1.xml'. Opening in browser...
Execution Time: 0.95 seconds.


In [13]:
start_time = time.time()
html_content = fetch_html(url)

soup_original = BeautifulSoup(html_content, 'html.parser')

private_tags = find_private_tags(soup_original)

print("Private/Custom Tags Found: ")
if private_tags:
    for tag in private_tags:
        print(f"<{tag}>")
else:
    print("No private tags found.")

Private/Custom Tags Found: 
<marquee>
<b>
<dt>
<font>
<i>


In [14]:
cleaned_html = clean_html(html_content, private_tags)
soup_cleaned = BeautifulSoup(cleaned_html, 'html.parser')

print("Original HTML: ")
print(soup_original.prettify())

Original HTML: 
<html lang="en">
 <head>
  <title>
   ToxinPred
  </title>
  <meta content="Designing and Prediction of toxic peptides" name="description"/>
  <meta content="toxic, designing, prediction, peptides, imtech, New Delhi" name="keywords"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
  <link href="style/style.css" rel="stylesheet" type="text/css"/>
  <style type="text/css">
   .thumbnail span{ /*CSS for enlarged image*/
position: absolute;
background-color: white;
padding: 5px;
bottom: -200px; /*added later*/
left: -100px;
border: 1px dashed gray;
visibility: hidden;
color: black;
text-decoration: none;
}
.thumbnail span img{ /*CSS for enlarged image*/
border-width: 0;
padding: 2px;
}
.thumbnail:hover span{ /*CSS for enlarged image on hover*/
visibility: visible;
top: 01;
/*bottom:100;*/
left: 370px; /*position where enlarged image should offset horizontally *

In [15]:
print("Cleaned HTML: ")
print(soup_cleaned.prettify())

Cleaned HTML: 
<html lang="en">
 <head>
  <title>
   ToxinPred
  </title>
  <meta content="Designing and Prediction of toxic peptides" name="description"/>
  <meta content="toxic, designing, prediction, peptides, imtech, New Delhi" name="keywords"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
  <link href="style/style.css" rel="stylesheet" type="text/css"/>
  <style type="text/css">
   .thumbnail span{ /*CSS for enlarged image*/
position: absolute;
background-color: white;
padding: 5px;
bottom: -200px; /*added later*/
left: -100px;
border: 1px dashed gray;
visibility: hidden;
color: black;
text-decoration: none;
}
.thumbnail span img{ /*CSS for enlarged image*/
border-width: 0;
padding: 2px;
}
.thumbnail:hover span{ /*CSS for enlarged image on hover*/
visibility: visible;
top: 01;
/*bottom:100;*/
left: 370px; /*position where enlarged image should offset horizontally */

In [16]:
diff = difflib.unified_diff(
    soup_original.prettify().splitlines(),
    soup_cleaned.prettify().splitlines(),
    fromfile='Original HTML',
    tofile='Cleaned HTML',
    lineterm=''
)

print("Differences: ")
print("\n".join(diff) if private_tags else "No differences found.")

Differences: 
--- Original HTML
+++ Cleaned HTML
@@ -32,219 +32,19 @@
 }
   </style>
  </head>
- <body>
-  <div id="main">
-   <div id="header">
-    <div id="logo">
-     <div id="logo_text">
-      <h1>
-       <font size="10">
-        ToxinPred
-       </font>
-      </h1>
-      <h2>
-       <font size="5">
-        Designing and prediction of toxic peptides
-       </font>
-      </h2>
-     </div>
-    </div>
-    <div id="menubar">
-     <ul id="menu">
-      <li>
-       <a href="index.html">
-        Home
-       </a>
-      </li>
-      <li>
-       <a href="design.php">
-        Design Peptide
-       </a>
-      </li>
-      <li>
-       <a href="multi_submit.php">
-        Batch Submission
-       </a>
-      </li>
-      <li>
-       <a href="protein.php">
-        Protein Scanning
-       </a>
-      </li>
-      <li>
-       <a href="motif_scan.php">
-        Motif Scan
-       </a>
-      </li>
-      <li>
-       <a href="motif.php">
-        Motif List
-       </a>


In [17]:
check_seo_rules(soup_original, url)
check_threats(soup_original, html_content)

SEO RULES CHECKER: 
Title: ToxinPred
Meta Description: Designing and Prediction of toxic peptides
H1 Tag: ToxinPred
robots.txt Found: True
Images without alt attributes: 0
Mobile-Friendly (Viewport): Yes
SECURITY & THREAT CHECKER: 
Threat Found: visibility\s*:\s*hidden
Deprecated Tags Found: <marquee>, <font>, <font>, <font>, <font>, <font>, <font>, <font>


In [18]:
links = extract_links(url, soup_original)

if links:
    generate_sitemap(url, links, 2)
else:
    print("No valid links found for sitemap.")

print(f"Execution Time: {time.time() - start_time:.2f} seconds.")

Sitemap generated and saved as 'sitemap2.xml'. Opening in browser...
Execution Time: 0.99 seconds.
