In [None]:
import os

import seaborn as sns
import itertools
import re
import networkx as nx
import matplotlib.pyplot as plt
from certifi import contents
from webcolors import names

# Custom five-colour palette for the notebook: registered as the seaborn
# default, and its first colour is reused for the graph nodes in the plot cell.
palette = sns.color_palette(["#2176AB", "#F97662", "#FFBF00", "#50C878", "#B284BE"])
sns.set_palette(palette)


In [None]:
def read_html_files(folder_path, limit=None):
    """Read the ``.html`` files found directly inside ``folder_path``.

    Files are visited in sorted name order so that the result (and the subset
    selected by ``limit``) is the same on every run and every platform.

    Args:
        folder_path: Directory to scan (not recursive).
        limit: Maximum number of HTML files to read, or ``None`` for all.
            Only ``.html`` files count towards the limit; other directory
            entries are ignored. A limit of zero or less yields an empty dict.

    Returns:
        dict mapping file name (including the ``.html`` extension) to the
        file's text content.

    Raises:
        OSError: if the folder or one of the files cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    html_files = {}  # {file name: content}
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for file in sorted(os.listdir(folder_path)):
        # Compare against the number of HTML files actually collected, not the
        # position in the directory listing, so non-HTML entries do not eat
        # into the limit.
        if limit is not None and len(html_files) >= limit:
            break
        if not file.endswith(".html"):
            continue
        # Explicit encoding: the platform default differs between systems.
        with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
            html_files[file] = f.read()

    return html_files



def extract_text_from_html(html_content):
    """Return ``html_content`` with every ``<...>`` tag deleted.

    Only the tag markup itself is removed; whatever sits between the tags is
    returned untouched.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', html_content)

def jaccard_similarity(set1, set2):
    """Jaccard index of two sets: |set1 ∩ set2| / |set1 ∪ set2|.

    Returns 0 when both sets are empty (the union has no elements).
    """
    shared = len(set1.intersection(set2))
    total = len(set1.union(set2))
    if total == 0:
        return 0
    return shared / total

def create_shingles(text, k=5):
    """Return the set of all length-``k`` substrings (shingles) of ``text``.

    A text shorter than ``k`` produces an empty set.
    """
    shingles = set()
    window_count = len(text) - k + 1
    for start in range(window_count):
        shingles.add(text[start:start + k])
    return shingles

def build_similarity_graph(html_documents, k=5):
    """Build a complete weighted graph of pairwise document similarity.

    Each document becomes one node, named after its key with a trailing
    ".html" extension removed. Every unordered pair of documents is joined by
    an edge whose ``weight`` attribute is the Jaccard similarity of the two
    documents' k-shingle sets, computed on the tag-stripped text.

    Args:
        html_documents: Mapping of document name -> raw HTML string.
        k: Shingle length in characters.

    Returns:
        networkx.Graph with one node per document and one weighted edge per
        pair of documents.
    """
    suffix = ".html"

    def node_name(key):
        # Remove the extension as a suffix. str.rstrip('.html') would instead
        # strip any trailing run of the characters '.', 'h', 't', 'm', 'l'
        # (turning "math.html" into "ma").
        name = str(key)
        return name[:-len(suffix)] if name.endswith(suffix) else name

    G = nx.Graph()

    # Shingle every document once, keyed by the node name used in the graph so
    # that nodes and edge endpoints always refer to the same identifiers.
    # Keys that differ only by the ".html" suffix map to the same node; the
    # later one wins.
    shingles_by_node = {}
    for key, value in html_documents.items():
        text = extract_text_from_html(value)
        shingles_by_node[node_name(key)] = create_shingles(text, k)

    # Add the nodes explicitly so a single document still yields a node.
    G.add_nodes_from(shingles_by_node)

    # One edge per unordered pair, weighted by Jaccard similarity.
    for first, second in itertools.combinations(shingles_by_node, 2):
        sim = jaccard_similarity(shingles_by_node[first], shingles_by_node[second])
        G.add_edge(first, second, weight=sim)

    return G



In [None]:
html_folder = "../../sources"  # Folder containing the HTML files (relative path)
html_files = read_html_files(html_folder)  # Read the HTML files as {file name: content}

# Build the graph whose vertices are the HTML documents, using 5-character shingles
G = build_similarity_graph(html_files, k=5)

# Export the graph in GEXF format; the target directory is not created here.
nx.write_gexf(G, "../../dataset/graphs/similarity_graph.gexf")

In [None]:
# Plot the similarity graph with a force-directed (spring) layout.

plt.figure(figsize=(10, 8))
# spring_layout starts from random positions; a fixed seed keeps the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(G, seed=42)
# The colour is passed as a one-row list: a bare RGB tuple is ambiguous when
# the graph has exactly three nodes (it would be read as per-node values to
# colormap rather than as a single colour).
nx.draw(G, pos, with_labels=True, node_size=20, node_color=[palette[0]], font_size=10)
plt.title("Similarity Graph")
plt.show()

