In [53]:
import os
import string
from html import unescape
from bertopic import BERTopic
import pandas as pd
import lxml.html
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers.models.tokenizer import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
import plotly as plt
import plotly.express as px
from sqlalchemy import create_engine


In [54]:

def preprocess_text(text):
    # Remove HTML entities
    text = unescape(text)
    STOP_WORDS = {'onion', 'tor', 'network', 'sites', 'links', 'var', 'px', 'webhelvetica', 'mainnav', 'file', 'browser', 'deep', 'web', 'minwidth', ' px', 'px ', ' px ' , 'li', 'solid' }
    # Remove characters related to HTML structure
    html_related_chars = ['<', '>', '/', '=', '"', "'", 'px', 'sw', ' sw ', 'px ', ' px '
                          'sw ', ' sw',]
    text = ''.join([char for char in text if char not in html_related_chars])

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS])
    
    # Remove words related to onion not relevant for analysis
    text = ' '.join([word for word in text.split() if word.lower() not in STOP_WORDS])
    # Remove numbers
    text = ''.join([char for char in text if not char.isdigit()])
    return str(text)

In [55]:
def html_dict(path):
    text_content = lxml.html.parse(path).getroot()
    text_content = text_content.cssselect('body')[0].text_content()
    text_content = preprocess_text(text_content)
    id = path.split('/')[-1]
    liste_id_text = [id, text_content] # liste contenant l'id et le texte clean
    return liste_id_text

In [56]:
def from_list_to_df(liste_path):
    df = pd.DataFrame()
    n = 0
    for html in liste_path:  # parcours de la liste des chemins vers les fichiers html
        liste_url_content = html_dict(html)
        n+=1
        row = {
            'id': n,
            'url': liste_url_content[0],  # Collecte de l'id
            'text': liste_url_content[1],  # contenu texte clean du html
        }
        # Ajout des résultats au dataframe
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    return df

In [57]:
liste_path = [] # liste des chemins vers les fichiers html
#df_listeDW = pd.DataFrame()
for i in range(2):
    if i == 0:
        path_directory = '../../ClearWeb/results/code_client_serveur/client_code'
    else:
        path_directory = '../../Darknet/results/results_url_filter/html_keyword'


    for filename in os.listdir(path_directory): # parcours du dossier
        if filename != '.DS_Store':
            liste_path.append(path_directory + '/' + filename)
    df = from_list_to_df(liste_path)
df = df[df['id'] != 7]
# Extraction of the 6 closest websites to the identified organ selling plateforms
# desired_ids = [5, 6, 8, 9, 10, 11]
# df_proches = df[df['id'].isin(desired_ids)]
# df_proches.to_excel('../BERT_ressources/sites_darkweb_proches/DW_proches.xlsx')
df


Unnamed: 0,id,url,text
0,1,CodeClient_organcity.html,Skip content Menu AllHuman HeartHuman KidneyHu...
1,2,CodeClient_groodcity.html,Sorry Javascript Disabled page meant appear en...
2,3,CodeClient_organ_city.html,Skip content Menu Menu Search Cart products ca...
3,4,CodeClient_activescienceparts.html,function seraphaccelcpsldRevcalcSizesaforvar b...
4,5,http___bmgunsszmlrt3wvghzthp45nbzftx7udgrlkder...,Skip navigation Skip content Black Market Guns...
...,...,...,...
68,69,http___photodakj4vrljvu55bf6s7acpdbzvtvxpkd65g...,permmedia PhotoDark search Home Explore Type t...
69,70,http___stormer5v52vjsw66jmds7ndeecudq444woadhz...,Featured Stories World Society Insight Feature...
70,71,http___5l4azvzhtzzxybvrotdmdh6q2ot47sdncz4uyne...,Home View Cart Search Sellers Policy Tickets ...
71,72,http___dc6rcegxudbr6ldfe54yfd35wimbnez3yceaqm7...,c Main page functionwduwreadyQwbindReadyQfunct...


In [58]:
# Step 1 - Get embeddings
docs = df['text'].tolist()
id = df['id'].astype(str).tolist()
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=8, n_components=2, min_dist=0.0, metric='cosine',random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
# Step 4 - Create topic model
topic_model = BERTopic(
    embedding_model='all-mpnet-base-v2',
    umap_model=umap_model,                    
    hdbscan_model=hdbscan_model, 
    ctfidf_model=ctfidf_model,
)
topics, probs = topic_model.fit_transform(docs)


In [59]:
topic_model.get_document_info(docs)
topic_model.get_topic_info().head(7).set_index('Topic')[
   ['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,51,0_human_px_new_gift,"[human, px, new, gift, sale, buy, people, onli..."
1,12,1_px_cards_services_site,"[px, cards, services, site, internet, created,..."
2,9,2_sw_said_belarusian_iranian,"[sw, said, belarusian, iranian, journalists, b..."


In [60]:
results = topic_model.visualize_documents(id) # dataframe with the id and the coordinates
results.drop(results[results['x'].isnull()].index, inplace = True)
results.drop(results[results['id'].isnull()].index, inplace = True)
results

Unnamed: 0,id,x,y
1,38,5.956974,6.863474
2,25,4.400595,8.867860
3,42,6.487746,5.870554
4,44,6.604545,5.761190
5,2,0.289068,8.106169
...,...,...,...
70,34,5.503561,7.762140
71,65,8.961190,2.701587
72,24,4.255194,9.047235
73,27,4.518327,8.713304


In [61]:
# Choose a list of IDs to highlight
chosen_ids = ['1', '2', '3', '4']

# Create a new column for marker size based on the chosen IDs
results['Topic'] = topics
results['Topic'] = results['Topic'].apply(lambda x: str(x))

# Get unique topics
unique_topics = sorted(set(topics))

# Define color map with specific colors for unique topics
color_map = {str(topic): px.colors.qualitative.Set1[i % len(px.colors.qualitative.Set1)] for i, topic in enumerate(unique_topics)}

#Plot the scatter plot using Plotly Express with topic colors and marker symbols
fig1 = px.scatter(results, x='x', y='y', color="Topic", 
                 color_discrete_map=color_map, 
                 title='<b>Mapping of clearweb and darkweb websites with topics</b>',
                 hover_data=['id', 'Topic'])

# Customize the layout
fig1.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),  # Set text color to black
    xaxis=dict(title_text='component 1', color='black', showgrid=False),  # Set X axis label and color
    yaxis=dict(title_text='component 2', color='black', showgrid=False),  # Set Y axis label and color
)

# Show the plot
fig1.show()
fig1.write_html('../results/graphs/with_body/scatterplot_topics5.html')

In [62]:


# Create a new column for marker symbol based on the chosen IDs
results['Identified organ selling plateform'] = results['id'].apply(lambda x: 'Yes' if x in chosen_ids else 'Unknown')
fig2 = px.scatter(results, x='x', y='y', color="Identified organ selling plateform", 
                 color_discrete_map={'Yes': 'orange', 'Unknown': 'blue'}, 
                 title='<b>Mapping of identified clearweb websites and unknown darkweb websites</b>',
                 hover_data=['id', 'Topic'])
# Customize the layout
fig2.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),  # Set text color to black
    xaxis=dict(title_text='component 1', color='black', showgrid=False),  # Set X axis label and color
    yaxis=dict(title_text='component 2', color='black',showgrid=False),  # Set Y axis label and color
)

fig2.show()
fig2.write_html('../results/graphs/with_body/scatterplot_identified5.html')

In [63]:
fig = topic_model.visualize_barchart()
fig.show()
fig.write_html('../results/graphs/with_body/barchart5.html')

In [64]:
topic_model.visualize_hierarchy()

In [65]:
topic_model.visualize_heatmap()