In [2]:
import json
import os
import string
from html import unescape

from bertopic import BERTopic
import pandas as pd
import lxml.html
from sentence_transformers.models.tokenizer import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
import plotly as plt
import plotly.express as px

In [3]:
def preprocess_text(text, stop_words=None):
    """Strip punctuation, digits and stop words from ``text``.

    Args:
        text: Raw string to clean.
        stop_words: Optional collection of lower-case words to discard.
            When omitted, the module-level ``ENGLISH_STOP_WORDS`` is used.

    Returns:
        str: The surviving words joined by single spaces (empty string when
        nothing survives).
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Clean characters *before* tokenising: removing digits after the
    # stop-word pass used to leave stray/doubled spaces where numeric tokens
    # had been, and let words such as "the2" slip past the stop-word filter.
    kept_chars = [
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    ]
    words = ''.join(kept_chars).split()
    return ' '.join(word for word in words if word.lower() not in stop_words)

In [4]:
def json_extract(file):
    """Read a JSON file and flatten its ``'class'`` entries into one string.

    Args:
        file: Path of the JSON file to read. Its top-level object must have
            a ``'class'`` key holding an iterable of strings.

    Returns:
        dict: ``{'url': file, 'text': text}`` where ``text`` is every entry of
        ``data['class']`` followed by a single space (so a non-empty result
        ends with a trailing space).

    Raises:
        KeyError: If the JSON object has no ``'class'`` key.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which differs between machines.
    with open(file, encoding='utf-8') as json_file:
        data = json.load(json_file)
    # Single join instead of repeated string concatenation in a loop.
    text = ''.join(p + ' ' for p in data['class'])
    return {'url': file, 'text': text}

In [5]:
def from_list_to_df(liste_path):
    """Build a DataFrame with one row per JSON file.

    Args:
        liste_path: Iterable of paths to JSON files; each is passed to
            ``json_extract``.

    Returns:
        pandas.DataFrame: Columns ``id`` (1-based position of the file in
        ``liste_path``), ``source_file`` (the ``'url'`` value returned by
        ``json_extract``) and ``text`` (its ``'text'`` value). An empty input
        yields an empty frame that still carries these three columns.
    """
    columns = ['id', 'source_file', 'text']
    records = []
    # `json_path` (not `json`) so the loop variable does not shadow the module.
    for n, json_path in enumerate(liste_path, start=1):
        extracted = json_extract(json_path)
        records.append({
            'id': n,
            'source_file': extracted['url'],
            'text': extracted['text'],
        })
    # Build the frame once: concatenating inside the loop copies every
    # previous row on each iteration (quadratic in the number of files).
    return pd.DataFrame(records, columns=columns)
    

In [6]:
# Directory holding the JSON files to load.
path_directory = '../../html_extraction/dictionnaire'
# os.listdir() returns entries in arbitrary order; sort them so that the ids
# assigned by from_list_to_df are stable across runs and machines.
liste_path = [
    os.path.join(path_directory, filename)
    for filename in sorted(os.listdir(path_directory))
    if filename != '.DS_Store'  # skip the macOS Finder metadata file
]
df = from_list_to_df(liste_path)
df

Unnamed: 0,id,source_file,text
0,1,../../html_extraction/dictionnaire/attribute_d...,center
1,2,../../html_extraction/dictionnaire/attribute_d...,home archive post-type-archive post-type-archi...
2,3,../../html_extraction/dictionnaire/attribute_d...,js yes-js js_active home page-template page-te...
3,4,../../html_extraction/dictionnaire/attribute_d...,js-site-favicon logged-in env-production page-...
4,5,../../html_extraction/dictionnaire/attribute_d...,js-site-favicon logged-in env-production page-...
5,6,../../html_extraction/dictionnaire/attribute_d...,js-focus-visible yoast-schema-graph page-templ...
6,7,../../html_extraction/dictionnaire/attribute_d...,js flexbox canvas canvastext webgl no-touch g...
7,8,../../html_extraction/dictionnaire/attribute_d...,ppws_modal ppws_modal_overlay ppws_modal_box p...
8,9,../../html_extraction/dictionnaire/attribute_d...,product-template-default single single-product...
9,10,../../html_extraction/dictionnaire/attribute_d...,js-site-favicon logged-in env-production page-...


In [7]:
# Step 1 - Collect the documents and their ids
# (no embedding model is passed here, so BERTopic embeds the documents itself
# inside fit_transform).
docs = df['text'].tolist()
# NOTE: `id` shadows the builtin, but the name is kept because a later cell
# passes it to topic_model.visualize_documents.
id = df['id'].astype(str).tolist()

# Step 2 - Reduce dimensionality
# UMAP is stochastic: pin random_state so topics and coordinates are
# reproducible after "Restart & Run All".
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Create topic model
topic_model = BERTopic(
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
)
topics, probs = topic_model.fit_transform(docs)

In [8]:
# Display the per-document table produced by the fitted model (assigned topic,
# topic name and representation, probability, ...).
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,center,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
1,home archive post-type-archive post-type-archi...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
2,js yes-js js_active home page-template page-te...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
3,js-site-favicon logged-in env-production page-...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,True
4,js-site-favicon logged-in env-production page-...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,True
5,js-focus-visible yoast-schema-graph page-templ...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
6,js flexbox canvas canvastext webgl no-touch g...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
7,ppws_modal ppws_modal_overlay ppws_modal_box p...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
8,product-template-default single single-product...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,False
9,js-site-favicon logged-in env-production page-...,-1,-1_button_formcontrol_container_menuitemhome,"[button, formcontrol, container, menuitemhome,...",[js-site-favicon logged-in env-production page...,button - formcontrol - container - menuitemhom...,0.0,True


In [9]:
# Expected here: a dataframe holding each document id with its x/y coordinates.
# NOTE(review): upstream BERTopic's visualize_documents returns a plotly
# Figure, while this cell uses the result as a DataFrame with an 'id' column
# -- confirm which BERTopic build this notebook runs against.
results = topic_model.visualize_documents(id)
# Remove, in place, every row whose id is missing.
results.drop(index=results.loc[results['id'].isna()].index, inplace=True)
results

Unnamed: 0,id,x,y
0,4,3.750287,6.772539
1,7,4.746617,7.116956
2,3,3.148042,7.313091
3,10,5.33364,7.872265
4,1,3.164693,6.360981
5,9,5.609956,7.312529
6,6,4.450284,7.857515
7,5,3.978364,7.552486
8,8,5.189946,6.635677
9,2,2.807961,6.794755


In [17]:

# Colour every point by its document id. Cast to str so Plotly Express always
# treats the ids as discrete categories (one trace / legend entry per id);
# the assignment is idempotent, so the cell can be re-run safely.
results['color'] = results['id'].astype(str)
chosen_ids = results['color'].unique()

# Set1 only has 9 colours, so cycling through it gives two documents the same
# colour as soon as there are 10 of them. Fall back to the 24-colour palette
# when Set1 is too small (colours still cycle beyond 24 documents).
palette = px.colors.qualitative.Set1
if len(chosen_ids) > len(palette):
    palette = px.colors.qualitative.Dark24
color_map = {doc_id: palette[i % len(palette)] for i, doc_id in enumerate(chosen_ids)}

# Plot the scatter plot using Plotly Express, one colour per document
fig = px.scatter(results, x='x', y='y', color='color', color_discrete_map=color_map,
                 title='<b>Html class structure similarity between organ trafficking suspected websites</b>', hover_data=['id'])
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),  # Set text color to black
    xaxis=dict(title_text='X', color='black'),  # Set X axis label and color
    yaxis=dict(title_text='Y', color='black'),  # Set Y axis label and color
)

# Replace the ids in the legend by the corresponding file names.
# The id -> file name mapping must come from `df`, where both columns live on
# the same row: `results` is NOT in the same row order as `df`, so zipping
# results['id'] with the file names of `df` attaches the wrong name to each id.
filename_list = [os.path.basename(file_path) for file_path in df['source_file']]
legend_labels = {str(doc_id): filename for doc_id, filename in zip(df['id'], filename_list)}
# Trace names are strings; keep the raw id when no file name is known for it.
fig.for_each_trace(lambda t: t.update(name=legend_labels.get(t.name, t.name)))

# Show the plot
fig.show()

# Save the plot to an HTML file (create the target directory on a fresh checkout)
output_path = '../results/graphs/html_class/scatter_plot6.html'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.write_html(output_path)
