In [53]:
import os
import string
from html import unescape
from bertopic import BERTopic
import pandas as pd
import lxml.html
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers.models.tokenizer import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
import plotly as plt
import plotly.express as px

In [54]:

def preprocess_text(text):
    # Remove HTML entities
    text = unescape(text)
    STOP_WORDS = {'onion', 'tor', 'network', 'sites', 'links', 'var', 'px', 'webhelvetica', 'mainnav', 'file', 'browser', 'deep', 'web', 'minwidth', ' px', 'px ', ' px ' , 'li', 'solid' }
    # Remove characters related to HTML structure
    html_related_chars = ['<', '>', '/', '=', '"', "'", 'px', 'sw', ' sw ', 'px ', ' px '
                          'sw ', ' sw',]
    text = ''.join([char for char in text if char not in html_related_chars])

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS])
    
    # Remove words related to onion not relevant for analysis
    text = ' '.join([word for word in text.split() if word.lower() not in STOP_WORDS])
    # Remove numbers
    text = ''.join([char for char in text if not char.isdigit()])
    return str(text)

In [55]:
def html_dict(path):
    text_content = lxml.html.parse(path).getroot().text_content()
    text_content = preprocess_text(text_content)
    id = path.split('/')[-1]
    liste_id_text = [id, text_content] # liste contenant l'id et le texte clean
    return liste_id_text

In [56]:
def from_list_to_df(liste_path):
    df = pd.DataFrame()
    n = 0
    for html in liste_path:  # new, top, hot, etc. limit:réglé à None collecte un maximum de résultats possible
        liste_url_content = html_dict(html)
        n+=1
        row = {
            'id': n,
            'url': liste_url_content[0],  # Collecte de l'id
            'text': liste_url_content[1],  # contenu texte clean du html
        }
        # Ajout des résultats au dataframe
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    return df

In [57]:
liste_path = [] # liste des chemins vers les fichiers html
for i in range(2):
    if i == 0:
        path_directory = '../../ClearWeb/results/code_client_serveur/client_code'
    else:
        path_directory = '../../Darknet/results/results_url_filter/client_codes'


    for filename in os.listdir(path_directory): # parcours du dossier
        if filename != '.DS_Store':
            liste_path.append(path_directory + '/' + filename)
    df = from_list_to_df(liste_path)
df = df[df['id'] != 7]
# desired_ids = [5, 6, 8, 9, 10, 11]
# df_proches = df[df['id'].isin(desired_ids)]
# df_proches.to_excel('../BERT_ressources/sites_darkweb_proches/DW_proches.xlsx')
df

Unnamed: 0,id,url,text
0,1,CodeClient_organcity.html,documentdocumentElementclassName documentdocum...
1,2,CodeClient_groodcity.html,documentdocumentElementclassName documentdocum...
2,3,CodeClient_organ_city.html,documentdocumentElementclassName documentdocum...
3,4,CodeClient_activescienceparts.html,documentseraphaccelusbpbdocumentcreateElements...
4,5,http___bmgunsszmlrt3wvghzthp45nbzftx7udgrlkder...,Buy Guns Online GunS Sale Black Market Guns Fi...
...,...,...,...
84,85,http___stormer5v52vjsw66jmds7ndeecudq444woadhz...,Science Finds Plastic People’s Lungs – Daily S...
85,86,http___5l4azvzhtzzxybvrotdmdh6q2ot47sdncz4uyne...,Buy furosemide lasix frusenex mg tabs worldwi...
86,87,http___quetre.g4c3eya4clenolymqbpgwz3q3tawoxw5...,form ANS QuetreSkip main content QuetreWhat fo...
87,88,http___dc6rcegxudbr6ldfe54yfd35wimbnez3yceaqm7...,c Main page functionwduwreadyQwbindReadyQfunct...


In [58]:
# Step 1 - Get embeddings
docs = df['text'].tolist()
id = df['id'].astype(str).tolist()
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=8, n_components=2, min_dist=0.0, metric='cosine',random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# tfid transformer for bm25 weighting to reduce the impact of small dataset
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
# Step 4 - Create topic model
topic_model = BERTopic(
    embedding_model='all-mpnet-base-v2',
    umap_model=umap_model,                    
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
)
topics, probs = topic_model.fit_transform(docs)
# new_topics = topic_model.reduce_outliers(docs, topics, strategy='embeddings')
# topic_model.update_topics(docs, topics=new_topics)

In [59]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,documentdocumentElementclassName documentdocum...,0,0_px_ul_important_human,"[px, ul, important, human, productthumb, rgba,...",[Decreased libido PsychonautWiki documentdocum...,px - ul - important - human - productthumb - r...,1.000000,False
1,documentdocumentElementclassName documentdocum...,0,0_px_ul_important_human,"[px, ul, important, human, productthumb, rgba,...",[Decreased libido PsychonautWiki documentdocum...,px - ul - important - human - productthumb - r...,1.000000,False
2,documentdocumentElementclassName documentdocum...,0,0_px_ul_important_human,"[px, ul, important, human, productthumb, rgba,...",[Decreased libido PsychonautWiki documentdocum...,px - ul - important - human - productthumb - r...,1.000000,False
3,documentseraphaccelusbpbdocumentcreateElements...,0,0_px_ul_important_human,"[px, ul, important, human, productthumb, rgba,...",[Decreased libido PsychonautWiki documentdocum...,px - ul - important - human - productthumb - r...,1.000000,False
4,Buy Guns Online GunS Sale Black Market Guns Fi...,3,3_gift_market_exchange_economy,"[gift, market, exchange, economy, free, guns, ...",[black market guns Searches footerbottomleftpo...,gift - market - exchange - economy - free - gu...,0.654108,False
...,...,...,...,...,...,...,...,...
83,Science Finds Plastic People’s Lungs – Daily S...,-1,-1_px_ebebeb_poison_important,"[px, ebebeb, poison, important, guns, ul, subm...",[poison person Poison person traces windowdata...,px - ebebeb - poison - important - guns - ul -...,0.000000,True
84,Buy furosemide lasix frusenex mg tabs worldwi...,6,6_subcat_categorybtnchecked_block_display,"[subcat, categorybtnchecked, block, display, i...",[Buy propranolol inderal migabet mg tabs worl...,subcat - categorybtnchecked - block - display ...,1.000000,True
85,form ANS QuetreSkip main content QuetreWhat fo...,-1,-1_px_ebebeb_poison_important,"[px, ebebeb, poison, important, guns, ul, subm...",[poison person Poison person traces windowdata...,px - ebebeb - poison - important - guns - ul -...,0.000000,False
86,c Main page functionwduwreadyQwbindReadyQfunct...,1,1_update_days_div_link,"[update, days, div, link, writes, new, account...",[glag gay little anime girls expanded display ...,update - days - div - link - writes - new - ac...,1.000000,True


In [60]:
results = topic_model.visualize_documents(id) # dataframe with the id and the coordinates
results.drop(results[results['x'].isnull()].index, inplace = True)
results.drop(results[results['id'].isnull()].index, inplace = True)
results

Unnamed: 0,id,x,y
0,35,-0.006216,-0.029080
1,64,5.442639,1.352883
2,23,-2.198564,-0.565112
3,14,-3.982191,-0.291734
4,87,7.780960,-1.822537
...,...,...,...
90,80,8.271223,-1.239780
91,86,7.687356,-1.893236
92,44,1.917973,0.489287
93,15,-3.873661,-0.394492


In [61]:
# Choose a list of IDs to highlight
chosen_ids = ['1','2','3','4']

# Create a new column for color based on the chosen IDs
results['Identified'] = results['id'].apply(lambda x: 'Yes' if x in chosen_ids else 'Unknown')

# Define color map for discrete colors
color_map = {'Yes': 'orange', 'Unknown': 'darkblue'}

# Plot the scatter plot using Plotly Express with discrete colors
fig1 = px.scatter(results, x='x', y='y', color='Identified', color_discrete_map=color_map,
                 title='<b>Mapping of identified clearweb websites and unknown darkweb websites </b>', hover_data=['id'])
fig1.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),  # Set text color to black
    xaxis=dict(title_text='component 1', color='black', showgrid = False),  # Set X axis label and color
    yaxis=dict(title_text='component 2', color='black', showgrid = False),  # Set Y axis label and color
)
# Show the plot
fig1.show()
fig1.write_html('../results/graphs/with_root/scatterplot_identified5.html')

In [62]:
# Create a new column for marker size based on the chosen IDs
results['Topic'] = topics
results['Topic'] = results['Topic'].apply(lambda x: str(x))

# Get unique topics
unique_topics = sorted(set(topics))

# Define color map with specific colors for unique topics
color_map = {str(topic): px.colors.qualitative.Set1[i % len(px.colors.qualitative.Set1)] for i, topic in enumerate(unique_topics)}

#Plot the scatter plot using Plotly Express with topic colors and marker symbols
fig2 = px.scatter(results, x='x', y='y', color="Topic", 
                 color_discrete_map=color_map, 
                 title='<b>Mapping of clearweb and darkweb websites with topics</b>',
                 hover_data=['id', 'Topic'])

# Customize the layout
fig2.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),  # Set text color to black
    xaxis=dict(title_text='component 1', color='black', showgrid=False),  # Set X axis label and color
    yaxis=dict(title_text='component 2', color='black', showgrid=False),  # Set Y axis label and color
)

# Show the plot
fig2.show()
fig2.write_html('../results/graphs/with_root/scatterplot_topics5.html')

In [63]:
fig = topic_model.visualize_barchart()
fig.show()
fig.write_html('../results/graphs/with_root/barchart5.html')

In [64]:
topic_model.visualize_hierarchy()

In [65]:
topic_model.visualize_heatmap()