<html>
  <head>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.1.0/css/bootstrap.min.css">
    <style>
        body {
            background-color: #f0f4f8;  /* Apply the background colour to the whole page body */
        }
        pre {
            align-items: center;
            justify-content: center;
            display: flex;
            background: #ffffff;  /* Background colour for the <pre> element */
        }
    </style>
  </head>
  <body>
    <!-- Page content -->
  </body>
</html>

In [1]:
from RLWK.rlwk import Rlwk
from RLWK.utils.top_terms_utils import get_top_terms_count
from RLWK.utils.data_utils import load_benchmark, create_tfidf, calculate_nmi_ari_scores
from RLWK.visualization.viz_utils import plot_summary, plot_cluster_summaries
from RLWK.io.pubmed_io import parse_pubmed_api

from IPython.display import display, HTML, clear_output, Image
from ipywidgets import HTML, VBox, HBox, Label, widgets, Layout

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import random

In [2]:
## Header logo
# Load the logo file through IPython's Image helper, then hand its raw bytes
# to an ipywidgets Image so the logo can be placed inside a widget container.
image1 = Image('static/cluster_insight.png')
img1 = widgets.Image(value=image1.data, width='auto', height='auto')

# Centre the logo in a 250px-high flex row. As the last expression of the
# cell, this HBox is what gets rendered.
HBox(
    [img1],
    layout=dict(display='flex', height='250px', justify_content='center'),
)

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xda\x00\x00\x01\xb2\x08\x06\x00\x…

In [3]:
## Default parameters
MAX_DOC = 5000        # upper bound of the "Number of documents" input
N_CLUST = 3           # initial value of the cluster-count slider
N_DOC = 1500          # initial value of the "Number of documents" input
MAX_TOP_TERMS = 50    # upper bound of the "Number of top terms" slider
N_COUNT = 100
N_WEIGHT = 10

# Datasets information.
# Maps each benchmark name (as offered in the dataset dropdown) to its expected
# number of clusters and the human-readable topic labels shown to the user.
# Labels carry no leading/trailing whitespace so that the comma-joined topic
# list renders cleanly.
dataset_clusters_info = {
    'BBC News': {
        'num_clusters': 5,
        'clusters': ['Politics', 'Technology', 'Business', 'Entertainment', 'Sports'],
    },
    '20 Newsgroups': {
        'num_clusters': 20,
        'clusters': [
            'alt.atheism',
            'comp.graphics',
            'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware',
            'comp.sys.mac.hardware',
            'comp.windows.x',
            'misc.forsale',
            'rec.autos',
            'rec.motorcycles',
            'rec.sport.baseball',
            'rec.sport.hockey',
            'sci.crypt',
            'sci.electronics',
            'sci.med',
            'sci.space',
            'soc.religion.christian',
            'talk.politics.guns',
            'talk.politics.mideast',
            'talk.politics.misc',
            'talk.religion.misc',
        ],
    },
    'Pubmed 10': {
        'num_clusters': 10,
        'clusters': [
            "Gout",
            "Chickenpox",
            "Raynaud Disease",
            "Jaundice",
            "Hepatitis A",
            "Hay Fever",
            "Kidney Calculi",
            "Age-related Macular Degeneration (AMD)",
            "Migraine",
            "Otitis",
        ],
    },
    'Pubmed 5': {
        'num_clusters': 5,
        'clusters': [
            "Hay Fever",
            "Kidney Calculi",
            "Age-related Macular Degeneration (AMD)",
            "Hepatitis A",
            "Otitis",
        ],
    },
    'Classic 4': {
        'num_clusters': 4,
        'clusters': [
            'CISI (Information Retrieval Abstracts)',
            'CRAN (Aerodynamics Reports)',
            'MED (Medical Abstracts)',
            'CACM (Communications of the ACM)',
        ],
    },
}

In [4]:
%%html
<!-- Custom box styles; widgets opt in through add_class("<class name>"). -->
<style>
/* Card with a pronounced drop shadow (applied to the parameters panel). */
.box_style{
    box-shadow: 0px 0px 0px 0 rgb(0 0 0 / 0%), 0 3px 7px 0 rgb(0 0 0 / 24%);
    border-radius: 10px;
}
/* Lighter card used for the inner data / clustering boxes. */
.mini_box_style{
    box-shadow: 0px 0px 0px 0 rgb(0 0 0 / 0%), 0 1px 6px 0 rgb(0 0 0 / 24%);
    border-radius: 10px;
}
/* Rounded corners for buttons and text inputs. */
.button_style{
    border-radius: 5px;
}
/* White card; not referenced by the cells visible here (presumably used by
   the summary views rendered elsewhere - confirm). */
.summary_box_style {
    box-shadow: 0px 0px 0px 0 rgb(0 0 0 / 0%), 0 3px 7px 0 rgb(0 0 0 / 24%);
    border-radius: 5px;
    background-color: white;
    /* border: 2px solid #fdf2e9; */
}
</style>

In [5]:
## I. Initialize panels

# A. Main output panel: a column-direction flex container with centred content
out = widgets.Output(
    layout=dict(
        display='flex',
        flex_flow='column',
        align_items='center',
        justify_content='center',
    )
)

# B. Cluster-count slider, declared up front so that it can be updated
#    whenever the selected dataset changes
nc = widgets.IntSlider(value=N_CLUST, min=2, max=20)

In [6]:
## II. Fill main panel with Data Parameters

# A.1 Layout shared by the data and clustering boxes
box_layout = {'margin':'10px 10px 10px 10px', 'padding':'18px'}

# A.2 PubMed query
query_label = Label(value='PubMed Query :')
query = widgets.Text(value='covid', disabled=False).add_class('button_style')
pm_query = VBox([query_label, query])

# A.3 Number of documents.
# An integer input is used because a document count cannot be fractional;
# the value is later passed through int() before querying PubMed.
label = Label(value='Number of documents :')
n_doc = widgets.BoundedIntText(
    min=100,
    max=MAX_DOC,
    step=100,
    disabled=False)
n_doc.value = N_DOC
num_doc = VBox([label, n_doc])

# A.4 Confirm button
cc_confirm = widgets.Button(
            description='Start clustering',
            disabled=False,
            button_style='info',
            layout={'margin':'0 0 10px 0'}
        )

# A.5 Data-source switch
dc = widgets.ToggleButtons(
    options=['Benchmark', 'Pubmed API'],
    description='',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltips=['Use data stored locally', 'Fetch data from Pubmed API and store them locally']
)
data_choice = VBox([dc], layout={'align_items':'center'})

# A.6 Local data choice.
# Options are taken from dataset_clusters_info so the dropdown and the
# per-dataset information cannot drift apart.
dataset_choice = widgets.Dropdown(
    options=list(dataset_clusters_info),
    value='BBC News',
    description='Dataset: ',
)

# -- Dynamic changes of dataset information --
# Output area that receives the "expected number of clusters / topics" text
cluster_info_output = widgets.Output()
def update_cluster_info(change):
    """Sync the cluster slider and the info text with the selected dataset.

    Parameters
    ----------
    change : dict-like
        Must provide the newly selected dataset name under the ``'new'`` key
        (a traitlets change notification, or a plain ``{'new': name}`` dict).

    Side effects
    ------------
    * Sets ``nc.value`` to the dataset's expected number of clusters.
    * Clears ``cluster_info_output`` and, for a known dataset, displays the
      expected cluster count and the topic list in it.

    A dataset name missing from ``dataset_clusters_info`` only clears the info
    area; the slider is left untouched instead of raising ``TypeError``.
    """
    dataset = change['new']
    info = dataset_clusters_info.get(dataset)

    # Guard before indexing: .get() returns None for an unknown dataset.
    if info:
        nc.value = info['num_clusters']

    with cluster_info_output:
        cluster_info_output.clear_output()
        if info:
            topics = ', '.join(info['clusters'])
            # HTML is called with a `layout` argument, so this must be the
            # ipywidgets HTML widget rather than IPython.display.HTML.
            display(HTML(
                f"The expected number of clusters is <strong>{info['num_clusters']}</strong>."
                f"<br/>Topics are: {topics}.",
                layout={'margin': '15px 0 0 0'},
            ))
# Refresh the dataset information on every dropdown change, and once right
# away so the info area is populated for the initial selection.
dataset_choice.observe(update_cluster_info, names='value')
update_cluster_info(dict(new=dataset_choice.value))

# Panel for locally stored data: dataset dropdown + cluster information
choice_1 = VBox(
    [
        VBox(
            [
                VBox([dataset_choice], layout=dict(align_items='center')),
                cluster_info_output,
            ],
            layout=box_layout,
        ).add_class("mini_box_style")
    ],
    layout=dict(align_items='center'),
)

# Panel for the PubMed API: query text + number of documents
choice_2 = VBox(
    [
        VBox(
            [
                pm_query,
                VBox([num_doc]),
            ],
            layout=box_layout,
        ).add_class("mini_box_style")
    ],
    layout=dict(align_items='center'),
)

# -> Final data container: 400px-wide output area hosting one of the two panels
dynamic_container = widgets.Output(layout=dict(align_items='center', width='400px'))
def update_dynamic_container(change):
    """Show the data panel matching the selected data source.

    ``change['new']`` holds the selected toggle value: ``'Benchmark'`` shows
    ``choice_1``; any other value shows ``choice_2``. The previous content of
    ``dynamic_container`` is cleared first, and the dataset information is
    refreshed afterwards for the currently selected dataset.
    """
    selected_panel = choice_1 if change['new'] == 'Benchmark' else choice_2
    with dynamic_container:
        clear_output()
        display(selected_panel)
    update_cluster_info({'new': dataset_choice.value})
# Swap the displayed data panel whenever the Benchmark / Pubmed API toggle
# changes, and render the panel once now for the current selection.
dc.observe(update_dynamic_container, names='value')
update_dynamic_container({'new': dc.value})


In [7]:
## II.B Clustering parameters

# B.1 Number of clusters (the slider itself is created in the panel-initialisation cell)
nc_label = Label(value='Number of clusters :')
nc.style.handle_color = 'lightblue'
num_clust = VBox([nc_label, nc])

# B.2 Number of top terms
ntt_label = Label(value='Number of top terms :')
ntt = widgets.IntSlider(min=5, max=MAX_TOP_TERMS, value=15)
ntt.style.handle_color = 'lightblue'
num_top_terms = VBox([ntt_label, ntt])

# C. Group data & clustering boxes side by side
params_box = HBox(
    [
        # data box
        VBox(
            [
                HTML('<h5>Data</h5>'),
                data_choice,
                dynamic_container,
            ],
            layout={'align_items':'center'},
        ),
        # clustering parameters box
        VBox(
            [
                HTML('<h5>Clustering</h5>'),
                VBox(
                    [
                        num_clust,
                        num_top_terms,
                    ],
                    layout=box_layout,
                ).add_class("mini_box_style"),
            ],
            layout={'align_items':'center'},
        ),
    ],
    layout={'justify_content':'center', 'grid_gap':'80px', 'margin':'0px 0 30px 0'},
)

# D. Final parameters box: title, both parameter groups, and the start button
params = widgets.VBox(
    [
        widgets.HTML('<h3>Parameters</h3>'),
        params_box,
        cc_confirm
    ],
    layout=widgets.Layout(
        align_items='center',
        padding='20px 50px 20px 50px',
        margin='20px 50px 30px 50px',
        # NOTE(review): `background_color` does not appear among the documented
        # ipywidgets Layout attributes - confirm it has any effect; a CSS class
        # may be needed to actually get a white background.
        background_color='white',
        width='auto'
    )
)

# E. Center params box horizontally across the full width
centered_box = widgets.HBox(
    [params],
    layout=widgets.Layout(
        display='flex',
        justify_content='center',
        width='100%'
    )
)

In [8]:
## III. Define action functions

def _set_controls_disabled(button, disabled):
    """Enable or disable the start button and both clustering sliders together."""
    button.disabled = disabled
    nc.disabled = disabled
    ntt.disabled = disabled


def _error_banner(message):
    """Return a centred, bold, red HTML widget showing ``message`` (HTML-escaped)."""
    from html import escape  # local import: keeps this cell self-contained

    return HTML(f"""
    <div style='text-align: center;'>
        <b style='color: red;'>{escape(message)}</b>
    </div>
    """)


def on_clust_button_clicked(button):
    """Run the clustering pipeline and append its results to the main panel.

    Steps:
      1. Disable the button and the two sliders while the job runs.
      2. Load the input matrix and vocabulary, either from the selected
         benchmark or by querying the PubMed API and building a TF-IDF matrix.
      3. Fit an ``Rlwk`` model with the number of clusters read from ``nc``.
      4. Display the global summary and the per-cluster summaries inside ``out``.

    NMI/ARI scores are computed only in Benchmark mode, where labels ``y`` are
    returned by ``load_benchmark``; in PubMed mode ``0, 0`` is passed instead.

    Errors never propagate to the caller: a PubMed failure shows a generic
    message, any other failure shows the (HTML-escaped) exception text. The
    controls are re-enabled on every exit path by the ``finally`` clause.

    Parameters
    ----------
    button : ipywidgets.Button
        The button that triggered the callback.
    """
    _set_controls_disabled(button, True)
    k = nc.value
    with out:
        try:
            # Load data based on selected option (Benchmark or PubMed API)
            if dc.value == 'Benchmark':
                mat, y, _k_original, terms = load_benchmark(dataset_choice.value)
            else:
                try:
                    # Attempt to parse data from PubMed API
                    _res, clean_data = parse_pubmed_api(query.value, int(n_doc.value))
                    mat, terms = create_tfidf(clean_data)
                except Exception:
                    display(_error_banner(
                        "An unexpected error occurred while fetching data from PubMed API. "
                        "Please try again later."
                    ))
                    return  # the `finally` below re-enables the controls

            # Initialize and fit the model
            rlwk = Rlwk(k, lambda_val=10, alpha=10, beta=10, chi2=1, scale=0, init='s')
            rlwk.fit(mat, verbose=0)

            # NOTE(review): nothing is written into `out2` within this function;
            # it is kept so the rendered widget tree stays unchanged.
            out2 = widgets.Output(layout={'display': 'flex',
                                          'align_items': 'center',
                                          'justify_content': 'center'})
            display(out2)

            # Plot summary
            if dc.value == 'Benchmark':
                nmi_score, ari_score = calculate_nmi_ari_scores(y, rlwk.labels_)
                result_summary = plot_summary(mat, rlwk, terms, nmi_score, ari_score, k, ntt.value)
            else:
                result_summary = plot_summary(mat, rlwk, terms, 0, 0, k, ntt.value)

            display(result_summary)
            display(plot_cluster_summaries(mat, rlwk, terms, k, ntt.value))

        except Exception as e:
            # Escape the exception text: it may contain characters such as '<'
            # or '&' that would otherwise be interpreted as markup.
            display(_error_banner(f"An unexpected error occurred: {e}"))

        finally:
            _set_controls_disabled(button, False)

# Run the clustering callback whenever the 'Start clustering' button is clicked
cc_confirm.on_click(on_clust_button_clicked)

In [9]:
## IV. Add css styles to widgets and display the main panel

# Attach the custom CSS classes to the start button and the parameters card
cc_confirm.add_class("button_style")
params.add_class("box_style")

# Render the parameters UI inside the main output panel ...
with out:
    display(centered_box)

# ... then show that panel as this cell's output
display(out)

Output(layout=Layout(align_items='center', display='flex', flex_flow='column', justify_content='center'))