In [1]:
import tensorflow
import numpy as np 
from bs4 import BeautifulSoup

In [2]:
class FeatureExtraction():
    """Turn an HTML document into a 15-slot numeric feature vector.

    Each ``feature_extraction_*`` method fills its own slots of
    ``feature_vector`` and re-initialises them first, so calling a method
    more than once gives the same result as calling it once.

    Slot layout:

    * 0 - meta ``application-name`` (+1 per tag with non-empty content)
    * 1 - meta ``keywords`` (+1)
    * 2 - meta ``author`` (+1)
    * 3 - meta ``description`` (+2 per tag, -2 when the page has none)
    * 4 - meta ``description`` no longer than ``MAX_DESCRIPTION_LENGTH`` (+1)
    * 5 - meta ``generator`` (+1)
    * 6 - meta ``viewport`` (+1)
    * 7 - meta ``date`` (+1)
    * 8 - meta ``lang`` (-1 per tag)
    * 9 - ``<title>`` inside ``<head>``
    * 10 - keywords found in the title text
    * 11 - ``<html>`` carries a ``lang`` attribute
    * 12 - ``<head>`` present
    * 13 - ``<body>`` present
    * 14 - ``<footer>`` present

    Args:
        html_document: Raw HTML markup as a string.
        keywords: Iterable of keyword strings looked up in the title.
    """

    # Longest meta description (in characters) that still earns slot 4.
    MAX_DESCRIPTION_LENGTH = 160
    # Keyword matches in the title beyond this many are penalised.
    MAX_TITLE_KEYWORDS = 3
    # Meta names that simply add 1 to their slot when present with content.
    _META_NAME_SLOTS = {
        "application-name": 0,
        "keywords": 1,
        "author": 2,
        "generator": 5,
        "viewport": 6,
        "date": 7,
    }

    def __init__(self, html_document, keywords):
        self.html_document = html_document
        self.keywords = keywords

        soup = BeautifulSoup(html_document, "html.parser")
        self.soup_object = soup

        self.feature_vector = np.zeros(15)

    def __repr__(self):
        """Return a two-line summary: whether HTML was supplied, and the keywords."""
        # __repr__ has to return a string; printing and returning None makes
        # repr(obj) raise TypeError.
        return (
            f"HTML document: {bool(self.html_document)}\n"
            f"Keywords: {self.keywords}"
        )

    def feature_extraction_html_tag(self):
        """
        HTML tag features:
        lang   (slot 11)
        head   (slot 12)
        body   (slot 13)
        footer (slot 14)

        Each slot is set to 1 when the feature is present and 0 otherwise.
        """
        # find() returns a single tag or None; find_all() returns a list,
        # which has no ``attrs``.
        html_tag = self.soup_object.find("html")
        # NOTE(review): slot 11 is read as "<html> has a non-empty lang
        # attribute", following the docstring - confirm this is the intent.
        has_lang = html_tag is not None and bool(html_tag.attrs.get("lang"))
        self.feature_vector[11] = 1 if has_lang else 0

        for slot, tag_name in ((12, "head"), (13, "body"), (14, "footer")):
            is_present = self.soup_object.find(tag_name) is not None
            self.feature_vector[slot] = 1 if is_present else 0

    def feature_extraction_head_tag(self):
        """
        Head tag features:
        Has Title tag                   (slot 9)
        Title tag must contain keywords (slot 10)

        Slot 9 starts at 1, loses the child index of every ``<title>`` found
        in ``<head>``, and loses 3 when there is no title at all.
        Slot 10 starts at 0; the first ``MAX_TITLE_KEYWORDS`` keywords that
        occur in the title add 1 each, every further one subtracts 1.
        """
        self.feature_vector[9] = 1
        self.feature_vector[10] = 0

        head_tag = self.soup_object.find("head")
        title_tag_content = ""
        found_title = False

        # ``children`` is an iterator (no len(), no indexing), so enumerate it.
        # A document without <head> is treated as having no title.
        children = head_tag.children if head_tag is not None else ()
        for i, child in enumerate(children):
            # Text nodes between tags are children too; only tags named
            # "title" count.
            if getattr(child, "name", None) == "title":
                # NOTE(review): the index counts whitespace text nodes as
                # well, so a title far down in <head> can score lower than a
                # missing title - confirm the intended penalty.
                self.feature_vector[9] -= i
                title_tag_content = child.text
                found_title = True

        kw_count = 0
        for kw in self.keywords or ():
            # An empty keyword is a substring of every string; skip it.
            if not kw or kw not in title_tag_content:
                continue
            kw_count += 1
            # NOTE(review): assumes the cut-off is meant to discourage
            # keyword stuffing - confirm.
            if kw_count > self.MAX_TITLE_KEYWORDS:
                self.feature_vector[10] -= 1
            else:
                self.feature_vector[10] += 1

        if not found_title:
            self.feature_vector[9] -= 3

    def feature_extraction_meta_tag(self):
        """
        Meta tag features:
        Application name,
        Keywords,
        Author,
        Description,
        Description tag length (<= 160)
        Generator,
        Viewport,
        Date,
        Lang (slot 8, decremented)

        Only ``<meta>`` tags that have both ``name`` and a non-empty
        ``content`` are scored. Slots 0-8 are reset before scoring.
        """
        self.feature_vector[0:9] = 0

        # Must live outside the loop: it describes the whole page, and it has
        # to exist even when the page has no <meta> tag at all.
        has_description = False

        # get features from meta tag
        for meta_tag in self.soup_object.find_all("meta"):
            attributes = meta_tag.attrs
            if "name" not in attributes or "content" not in attributes:
                continue

            # check for null content
            content = attributes["content"]
            if content == "":
                continue

            name = attributes["name"]
            if name == "description":
                self.feature_vector[3] += 2
                has_description = True
                if len(content) <= self.MAX_DESCRIPTION_LENGTH:
                    self.feature_vector[4] += 1
            elif name == "lang":
                self.feature_vector[8] -= 1
            elif name in self._META_NAME_SLOTS:
                self.feature_vector[self._META_NAME_SLOTS[name]] += 1

        if not has_description:
            self.feature_vector[3] -= 2

    def feature_extraction_misc(self):
        """
        Misc features:
        Heading tags,
        Alt text images,
        Anchor text,

        Not implemented yet; calling it leaves ``feature_vector`` untouched.
        """
        pass

    def get_feature_vector(self):
        """Return the feature vector scaled to unit Euclidean length.

        Returns:
            A new array; ``feature_vector`` itself is not modified. An
            all-zero vector is returned unchanged (as a copy) instead of
            being divided by zero.
        """
        norm = np.linalg.norm(self.feature_vector)
        if norm == 0:
            return self.feature_vector.copy()
        return self.feature_vector / norm


In [3]:
# Sample page used to try out the feature extractor. Its <head> holds a
# charset meta, a viewport meta, a title and a stylesheet link; there is no
# description meta tag.
html_text = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sample HTML Page</title>
    <link rel="stylesheet" href="styles.css">
</head>
<body>

    <header>
        <h1>Welcome to My Website</h1>
        <nav>
            <ul>
                <li><a href="#home">Home</a></li>
                <li><a href="#about">About</a></li>
                <li><a href="#contact">Contact</a></li>
            </ul>
        </nav>
    </header>

    <section id="home">
        <h2>Home Section</h2>
        <p>This is the home section of the page.</p>
    </section>

    <section id="about">
        <h2>About Section</h2>
        <p>This is the about section of the page.</p>
        <img src="profile.jpg" alt="Profile Image">
    </section>

    <section id="contact">
        <h2>Contact Section</h2>
        <form action="/submit" method="post">
            <label for="name">Name:</label>
            <input type="text" id="name" name="name" required>
            <br>
            <label for="email">Email:</label>
            <input type="email" id="email" name="email" required>
            <br>
            <input type="submit" value="Submit">
        </form>
    </section>

    <footer>
        <p>&copy; 2022 My Website. All rights reserved.</p>
    </footer>

</body>
</html>
"""

In [47]:
# Build an extractor for the sample page; the keyword list is empty here.
feature_extractor = FeatureExtraction(html_text, [])

In [4]:
# Parse the sample page directly so individual tags can be inspected by hand.
soup = BeautifulSoup(html_text, "html.parser")

In [7]:
# Fetch the <head> element and show it as the cell result.
head_tag = soup.find("head")

head_tag

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Sample HTML Page</title>
<link href="styles.css" rel="stylesheet"/>
</head>

In [36]:
# Text of every child of <head>; the newline text nodes between tags are
# children too, so they show up as "\n" entries.
text_list = [str(child.text) for child in head_tag.children]

In [37]:
# Show the collected child texts.
print(text_list)

['\n', '', '\n', '', '\n', 'Sample HTML Page', '\n', '', '\n']


In [41]:
# Inspect the attribute dict of the <html> element.
soup.find("html").attrs

{'lang': 'en'}

In [42]:
# Look up a tag the sample page does not contain.
a = soup.find("strong")

In [43]:
# Show what find() gave back for the missing tag.
print(a)

None
