In [1]:
# Importing necessary libraries
import numpy as np
import urllib.parse
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense

# Previous pre-processing unit 
def convert_url(raw_url):
    # Tokenize the URL
    tokenized_url = urllib.parse.quote(raw_url)

    # Standardize the URL
    standardized_url = urllib.parse.urlsplit(tokenized_url).geturl()

    # Truncate or pad the URL
    max_url_length = 200  # Maximum length of the padded URL

    if len(standardized_url) > max_url_length:
        # Truncate the URL if it is longer than the maximum length
        truncated_url = standardized_url[-max_url_length:]
        padded_url = [ord(char) for char in truncated_url]
    else:
        # Pad the URL with zeros if it is shorter than the maximum length
        padded_url = [0] * (max_url_length - len(standardized_url)) + [ord(char) for char in standardized_url]

    return padded_url

# Example usage
raw_url = "https://example.com/path?param1=value1&param2=value2"
padded_url = convert_url(raw_url)

print(padded_url)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104, 116, 116, 112, 115, 37, 51, 65, 47, 47, 101, 120, 97, 109, 112, 108, 101, 46, 99, 111, 109, 47, 112, 97, 116, 104, 37, 51, 70, 112, 97, 114, 97, 109, 49, 37, 51, 68, 118, 97, 108, 117, 101, 49, 37, 50, 54, 112, 97, 114, 97, 109, 50, 37, 51, 68, 118, 97, 108, 117, 101, 50]


In [2]:
# Embedding and flattening

model = Sequential([
    
])
padded_url = np.array(padded_url)

'''
Parameters and their meaning
Input_dim - number of distinct characters
Output_dim -  the dimensionality of the embedding vectors
Input length - The size of tokenized URL
'''

inp_dim = 256
emb_dim = 32
seq_len = 200


embedding_layer = Embedding(input_dim = inp_dim, output_dim = emb_dim, input_length = seq_len)

# Printing out the results of embedding layer
embedded_urls = embedding_layer(padded_url)

# Convert the embedded URLs tensor to a NumPy array
embedded_urls_array = embedded_urls.numpy()

# Print the embedded URLs array
for i, url in enumerate(embedded_urls_array):
    print(f"Embedded URL {i+1}:")
    print(url)
    print("\n")
print(embedded_urls.shape)


# Reshape the embedded URLs
flattened_urls = np.reshape(embedded_urls, (seq_len * emb_dim, -1))
# Print the shape of the flattened URLs
print("Shape of flattened URLs:", flattened_urls.shape)
print(flattened_urls)
# Hence, we get rid of bias in terms of numbers
# Since we've checked their working, we add it to our model


Embedded URL 1:
[ 0.02731843 -0.03987877  0.00499691  0.04310935 -0.03833193 -0.04076358
 -0.03595383 -0.0030483   0.03295065  0.02185084 -0.00078671  0.03128355
 -0.03473247 -0.04376375 -0.03491961  0.04842858 -0.01799716  0.00164417
 -0.03126593  0.0352643  -0.04464358  0.01294397  0.00923046 -0.03411702
 -0.00229772 -0.03312014 -0.0473436   0.03216504  0.02354756 -0.00807798
  0.02798145  0.01893303]


Embedded URL 2:
[ 0.02731843 -0.03987877  0.00499691  0.04310935 -0.03833193 -0.04076358
 -0.03595383 -0.0030483   0.03295065  0.02185084 -0.00078671  0.03128355
 -0.03473247 -0.04376375 -0.03491961  0.04842858 -0.01799716  0.00164417
 -0.03126593  0.0352643  -0.04464358  0.01294397  0.00923046 -0.03411702
 -0.00229772 -0.03312014 -0.0473436   0.03216504  0.02354756 -0.00807798
  0.02798145  0.01893303]


Embedded URL 3:
[ 0.02731843 -0.03987877  0.00499691  0.04310935 -0.03833193 -0.04076358
 -0.03595383 -0.0030483   0.03295065  0.02185084 -0.00078671  0.03128355
 -0.03473247 -0.0437