In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Embedding, Attention, GlobalAveragePooling2D, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from helpers import FeatureExtraction
import numpy as np 

In [None]:
# "Bad" HTML sample: this is the document that is tokenized, padded and fed to the
# model as its input sequence.
# Compared with good_html_text, its <head> carries only the charset and viewport
# <meta> tags, and the "#connect" navigation link has no link text.
bad_html_text = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sample HTML Page</title>
    <link rel="stylesheet" href="styles.css">
</head>
<body>
    <header>
        <h1>Welcome to My Website</h1>
        <nav>
            <ul>
                <li><a href="#home">Home</a></li>
                <li><a href="#about">About</a></li>
                <li><a href="#contact">Contact</a></li>
                <li><a href="#connect"></a></li>
            </ul>
        </nav>
    </header>
    <section id="home">
        <h2>Home Section</h2>
        <p>This is the home section of the page.</p>
    </section>
    <section id="about">
        <h2>About Section</h2>
        <p>This is the about section of the page.</p>
        <img src="profile.jpg" alt="Profile Image">
    </section>
    <section id="contact">
        <h2>Contact Section</h2>
        <form action="/submit" method="post">
            <label for="name">Name:</label>
            <input type="text" id="name" name="name" required>
            <br>
            <label for="email">Email:</label>
            <input type="email" id="email" name="email" required>
            <br>
            <input type="submit" value="Submit">
        </form>
    </section>
    <footer>
        <p>&copy; 2022 My Website. All rights reserved.</p>
    </footer>
</body>
</html>
"""

In [None]:
# "Good" HTML sample: the target document the model is trained to reproduce.
# It extends the bad sample with author/description/date/application-name/keywords
# <meta> tags and gives the "#connect" navigation link visible text.
# The description, date and keywords tags were malformed (missing '>', misplaced
# quote, keywords split across several quoted fragments) and are now well-formed.
# NOTE(review): the date value "2024-1:00-08:00" is kept verbatim but does not look
# like a complete date — confirm the intended value.
good_html_text = """
<!DOCTYPE html>
<html lang="en">
<head>
    <title>Sample HTML Page</title>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="author" content="Le Tuan Hy">
    <meta name="description" content="Sample page for LSTM and RNN training">
    <meta name="date" content="2024-1:00-08:00">
    <meta name="application-name" content="Neural Network Ninjas">
    <meta name="keywords" content="Sample, AI, Training, LSTM">
    <link rel="stylesheet" href="styles.css">
</head>
<body>
    <header>
        <h1>Welcome to My Website</h1>
        <nav>
            <ul>
                <li><a href="#home">Home</a></li>
                <li><a href="#about">About</a></li>
                <li><a href="#contact">Contact</a></li>
                <li><a href="#connect">Connect to others</a></li>
            </ul>
        </nav>
    </header>
    <section id="home">
        <h2>Home Section</h2>
        <p>This is the home section of the page.</p>
    </section>
    <section id="about">
        <h2>About Section</h2>
        <p>This is the about section of the page.</p>
        <img src="profile.jpg" alt="Profile Image">
    </section>
    <section id="contact">
        <h2>Contact Section</h2>
        <form action="/submit" method="post">
            <label for="name">Name:</label>
            <input type="text" id="name" name="name" required>
            <br>
            <label for="email">Email:</label>
            <input type="email" id="email" name="email" required>
            <br>
            <input type="submit" value="Submit">
        </form>
    </section>
    <footer>
        <p>&copy; 2022 My Website. All rights reserved.</p>
    </footer>
</body>
</html>
"""

### TOKENIZE

In [None]:
# Filter out only newlines so that HTML punctuation (<, >, ", /, =) stays inside
# the tokens; every other Tokenizer option is left at its default.
tokenizer = Tokenizer(filters='\n')

In [None]:
# Build one shared vocabulary from both samples (bad first, then good).
tokenizer.fit_on_texts([bad_html_text, good_html_text])

In [None]:
# Encode both documents with the shared vocabulary; each result is a flat list
# of token ids for one document.
bad_html_sequences, good_html_sequences = tokenizer.texts_to_sequences(
    [bad_html_text, good_html_text])

In [None]:
# Show the encoded token ids of each sample, bad first.
for html_sequences in (bad_html_sequences, good_html_sequences):
    print(html_sequences)

In [None]:
# Inspect how often each token occurred across the texts the tokenizer was fitted on.
print(tokenizer.word_counts)

In [None]:
# Number of distinct tokens in the fitted vocabulary.
# Keras assigns word indices starting at 1 (0 is never given to a word and is what
# padding uses), so anything indexed by token id needs VOCAB_SIZE + 1 slots.
VOCAB_SIZE = len(tokenizer.word_index)
print(VOCAB_SIZE)

### SEQUENCE PADDING

In [None]:
# Padding target: the length of the longer of the two token sequences.
MAX_LENGTH = max(map(len, (bad_html_sequences, good_html_sequences)))
print(MAX_LENGTH)

In [None]:
# Pad (and, if ever needed, truncate) both sequences at the end so they are exactly
# MAX_LENGTH long. Each sequence is wrapped in a list, so each result is a batch
# holding a single row.
pad_options = dict(maxlen=MAX_LENGTH, truncating="post", padding="post")
bad_html_sequences_padded = pad_sequences([bad_html_sequences], **pad_options)
good_html_sequences_padded = pad_sequences([good_html_sequences], **pad_options)

In [None]:
# Show the padded arrays, bad first.
for padded_sequences in (bad_html_sequences_padded, good_html_sequences_padded):
    print(padded_sequences)

### FEATURE EXTRACTION

In [None]:
# Keywords passed to both extractors.
keywords = ["Sample", "AI", "Training", "LSTM"]

# One extractor per sample, constructed in bad-then-good order.
bad_html_feature_extractor, good_html_feature_extractor = [
    FeatureExtraction(html_text, keywords=keywords)
    for html_text in (bad_html_text, good_html_text)
]

In [None]:
# Start feature extraction on both samples (bad first). The resulting vectors are
# read back afterwards via get_feature_vector().
bad_html_feature_extractor.start()
good_html_feature_extractor.start()

In [None]:
# Fetch each extractor's feature vector (bad first), then print both for inspection.
bad_html_feature_vector, good_html_feature_vector = [
    extractor.get_feature_vector()
    for extractor in (bad_html_feature_extractor, good_html_feature_extractor)
]

for feature_vector in (bad_html_feature_vector, good_html_feature_vector):
    print(feature_vector)

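### FEATURE CONCATENATION

The extracted feature vector is concatenated with the LSTM output inside the model defined below (see the `Concatenate` layer).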

### MODEL CREATION

In [None]:
# Build and train the model: for every token position of the bad HTML it predicts
# the token id found at the same position of the good HTML.
max_sequence_length = MAX_LENGTH

# Define the model architecture
embedding_dim = 100
lstm_units = MAX_LENGTH
# Length of the handcrafted feature vector.
# NOTE(review): assumes get_feature_vector() yields 18 values — confirm in helpers.
feature_dim = 18

# Input layer for bad HTML code and features
bad_input = Input(shape=(max_sequence_length,))
# Token ids range from 0 (padding) up to VOCAB_SIZE inclusive, so the embedding
# table needs VOCAB_SIZE + 1 rows; input_dim=VOCAB_SIZE leaves the largest id
# out of range.
bad_embedding = Embedding(input_dim=VOCAB_SIZE + 1,
                          output_dim=embedding_dim)(bad_input)
bad_lstm = LSTM(lstm_units, return_sequences=True)(
    bad_embedding)  # Return sequences for attention mechanism

# Input layer for features. The shape has to be a tuple: `(18)` is just the int 18.
feature_input = Input(shape=(feature_dim,))

# Repeat the feature vector at every time step:
# (batch, features) -> (batch, max_sequence_length, features)
tiled_feature_vector = tf.tile(tf.expand_dims(
    feature_input, axis=1), [1, max_sequence_length, 1])

# Concatenate the LSTM output and features
concatenated_inputs = Concatenate(
    axis=-1)([bad_lstm, tiled_feature_vector])

# Attention mechanism (self-attention: the same tensor is used as query and value)
attention_output = Attention()([concatenated_inputs, concatenated_inputs])

# Output layer: per-position distribution over token ids 0..VOCAB_SIZE
output = Dense(VOCAB_SIZE + 1, activation='softmax')(attention_output)

# Create the model
model = Model(inputs=[bad_input, feature_input], outputs=output)

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare training data (replace with your own data)
X_train_bad = np.array(bad_html_sequences_padded)
X_train_feature = np.array([bad_html_feature_vector])
# pad_sequences already returned a (1, MAX_LENGTH) batch; wrapping it in another
# list would add a spurious leading axis that does not line up with the
# (batch, MAX_LENGTH, classes) model output.
y_train = np.array(good_html_sequences_padded)

# Train the model
model.fit([X_train_bad, X_train_feature], y_train,
          epochs=20)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()