In [1]:
import dash
print(dash.__version__)
import dash_bootstrap_components as dbc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
print(dcc.__version__)
import dash_table
import warnings
warnings.filterwarnings("ignore")

1.20.0
1.16.0


In [2]:
from Bio import Entrez
from Bio import Medline
import re
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import matplotlib.cm as cm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from coclust.coclustering import CoclustMod
from coclust.clustering import SphericalKmeans
from coclust.coclustering import CoclustSpecMod
from scipy.sparse import coo_matrix
from coclust.visualization import plot_cluster_sizes, plot_cluster_top_terms
from coclust.visualization import (plot_cluster_top_terms,get_term_graph, plot_convergence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing

In [3]:
def second_preprocessing(df_corpus):
    """Drop the articles that have no abstract.

    Rows whose ``'Abstract'`` value is the ``"null"`` placeholder are
    removed. The previous implementation never advanced its row counter
    inside the loop, so it always tried to drop label 0 (and raised a
    ``KeyError`` as soon as a second placeholder was met).

    Parameters
    ----------
    df_corpus : pandas.DataFrame
        Corpus with an ``'Abstract'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the placeholder rows; the surviving rows keep
        their original index labels and the input frame is not modified.
    """
    return df_corpus[df_corpus['Abstract'] != "null"]


In [4]:

def preprocessing(text):
    """Normalise a raw text into a space-separated string of lemmas.

    Parameters
    ----------
    text : str
        Raw text (an article abstract in this notebook).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Replace everything that is not an ASCII letter (punctuation, digits) by a space.
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lower-case so that stop-word matching is case-insensitive.
    text = text.lower()
    # Tokenise: one word per list entry.
    tokens = word_tokenize(text)
    # Remove English stop words; a set gives constant-time membership tests
    # instead of scanning the whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatise every token as a verb, e.g. "investigated" -> "investigate".
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word=w, pos='v') for w in tokens]
    # Keep only the tokens longer than two characters.
    tokens = [tok for tok in tokens if len(tok) > 2]
    # Back to a single string.
    return ' '.join(tokens)
def search(term, rmax):
    """Search PubMed for ``term`` and return the parsed ESearch result.

    Parameters
    ----------
    term : str
        Search keyword(s).
    rmax : int
        Maximum number of articles to retrieve (``retmax``).

    Returns
    -------
    The structure produced by ``Entrez.read``; callers in this notebook
    read its ``'IdList'`` entry.
    """
    # Do not overwrite an address the user already configured with an empty
    # string; fall back to the same placeholder that fetch_details uses.
    if not Entrez.email:
        Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',       # database
                            sort='relevance',  # most relevant first
                            retmax=rmax,       # number of articles
                            retmode='xml',
                            term=term)         # keyword
    try:
        results = Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()
    return results

def fetch_details(id_list):
    """Fetch the full PubMed records for a list of article ids.

    Parameters
    ----------
    id_list : list of str
        PubMed ids.

    Returns
    -------
    The structure produced by ``Entrez.read`` (callers read its
    ``'PubmedArticle'`` entry). When ``id_list`` is empty no request is
    sent and a plain dict with empty article lists is returned instead.
    """
    if not id_list:
        # Nothing to fetch: skip the request made with an empty id.
        return {'PubmedArticle': [], 'PubmedBookArticle': []}

    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()
    return results

    
def parse_authors(id_list):
    """Return the distinct author names of the given PubMed record(s).

    Parameters
    ----------
    id_list : str or list of str
        PubMed id(s), passed as-is to ``Entrez.efetch``.

    Returns
    -------
    list of str
        Values of the Medline ``AU`` field in first-seen order, without
        duplicates. A record without an ``AU`` field contributes ``'?'``.
    """
    handle = Entrez.efetch(db='pubmed', id=id_list, rettype='medline', retmode='text')
    authors_list = []
    try:
        # Medline.parse is consumed inside the try block, i.e. before the
        # handle is closed.
        for record in Medline.parse(handle):
            for author in record.get('AU', ['?']):
                if author not in authors_list:
                    authors_list.append(author)
    finally:
        handle.close()
    return authors_list

def article_date(source):
    """Format the first entry of an ``ArticleDate`` list as ``Year-Month-Day``.

    Parameters
    ----------
    source : sequence of mappings
        Each mapping is expected to hold ``'Year'``, ``'Month'`` and
        ``'Day'`` strings.

    Returns
    -------
    str or float
        The joined date, or ``np.nan`` when the sequence is empty or
        missing, or when one of the three keys is absent.
    """
    try:
        first = source[0]
        return first['Year'] + '-' + first['Month'] + '-' + first['Day']
    except (IndexError, KeyError, TypeError):
        # Only the "no usable date" cases are silenced; anything else
        # (e.g. KeyboardInterrupt) now propagates instead of being swallowed.
        return np.nan
    
def recup(num,terme):
    """Build the corpus DataFrame for a PubMed query.

    Parameters
    ----------
    num : int
        Maximum number of articles to retrieve.
    terme : str
        Search keyword(s).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Article ID'``, ``'Titre'``, ``'Auteurs'``, ``'Date'`` and
        ``'Abstract'`` (pre-processed text). Articles without an abstract
        are removed by ``second_preprocessing``.
    """
    results = search(terme, num)
    papers = fetch_details(results['IdList'])

    id_list = []
    title_list = []
    abst_list = []
    date_list = []
    for i, paper in enumerate(papers['PubmedArticle']):
        citation = paper['MedlineCitation']
        article = citation['Article']

        # Take the id from the record itself so that ids, titles and
        # abstracts stay aligned even if the fetch returns fewer
        # 'PubmedArticle' entries than ids were requested.
        id_list.append(str(citation['PMID']))

        # Title
        title = article['ArticleTitle']
        print("{}) {}".format(i+1, title))
        title_list.append(title)

        # Abstract: join the sections instead of stringifying the list,
        # which would leak list/element repr text into the abstract.
        try:
            sections = article['Abstract']['AbstractText']
        except KeyError:
            # "null" is the placeholder that second_preprocessing filters out.
            abst_list.append("null")
        else:
            abst_list.append(preprocessing(' '.join(str(part) for part in sections)))

        # Date (np.nan when the record carries no ArticleDate entry)
        date_list.append(article_date(article.get('ArticleDate')))

    # Authors: one Medline request per article, joined into a single string
    # up front (avoids pandas chained assignment on the finished frame).
    auth_list = [', '.join(parse_authors(pmid)) for pmid in id_list]

    dict_corpus = {'Article ID' : id_list, 'Titre' : title_list, 'Auteurs': auth_list, 'Date': date_list, 'Abstract' : abst_list}
    df_corpus = pd.DataFrame(dict_corpus)
    df_corpus = second_preprocessing(df_corpus)
    return df_corpus


# Corpus creation

In [5]:
# Working corpus for the rest of the notebook: up to 10 PubMed articles on "cancer".
df = recup(num=10, terme='cancer')

NotXMLError: Failed to parse the XML data (XML declaration not found). Please make sure that the input data are in XML format.

# Vectorization and analysis

In [None]:
def vectorisation_text(text,mindf,maxdf):
    """Compute the TF-IDF matrix of a collection of documents.

    Parameters
    ----------
    text : iterable of str
        The documents.
    mindf, maxdf : float or int
        Forwarded to ``TfidfVectorizer`` as ``min_df`` / ``max_df``.

    Returns
    -------
    X : sparse matrix
        Document-term TF-IDF matrix.
    features : list of str
        Term associated with each column of ``X``.
    """
    vectorizer = TfidfVectorizer(min_df=mindf,
                                 max_df=maxdf,
                                 max_features=None,
                                 stop_words='english')
    # A single fit_transform is enough; the vectorizer used to be fitted twice.
    X = vectorizer.fit_transform(text)
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer get_feature_names_out() when the installed version provides it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()
    return X, features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: despite its name, `text` is the TF-IDF matrix returned by
# vectorisation_text; `features` holds the matching terms.
text, features = vectorisation_text(df['Abstract'], mindf=0.05, maxdf=0.9)
print(type(text))

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 6
def lda(number_of_topics,text_lda):
    """Fit a Latent Dirichlet Allocation model.

    Parameters
    ----------
    number_of_topics : int
        Number of topics (``n_components``).
    text_lda : matrix
        Document-term matrix to fit the model on.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.
    """
    model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
    # The document-topic matrix used to be computed here with transform()
    # and then discarded; that extra pass over the data has been removed.
    model.fit(text_lda)
    return model

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the strongest words of every topic of a fitted model.

    For each row of ``model.components_`` two columns are produced:
    ``"Topic <k> words"`` (the terms) and ``"Topic <k> weights"`` (their
    weights formatted with one decimal), both ordered from the largest
    weight down and limited to ``no_top_words`` entries.
    """
    columns = {}
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the highest-weighted terms, largest first.
        ranked = topic.argsort()[:-no_top_words - 1:-1]
        words = ['{}'.format(feature_names[term]) for term in ranked]
        weights = ['{:.1f}'.format(topic[term]) for term in ranked]
        columns["Topic %d words" % (topic_idx)] = words
        columns["Topic %d weights" % (topic_idx)] = weights
    return pd.DataFrame(columns)

In [None]:
# Fit a 6-topic LDA model on the TF-IDF matrix.
model = lda(number_of_topics=6, text_lda=text)

In [None]:
# Number of words listed per topic in the summary table.
nwords = 5
topics = display_topics(model, feature_names=features, no_top_words=nwords)

# K-Means

In [None]:
#Elbow method : 
from sklearn.cluster import MiniBatchKMeans
def find_optimal_clusters(data, max_k):
    """Collect the k-means inertia for k = 2, 4, ... up to ``max_k``.

    Returns the range of tested ``k`` values together with the list of
    matching ``inertia_`` values (the data for an elbow plot).
    """
    iters = range(2, max_k+1, 2)
    sse = [
        MiniBatchKMeans(n_clusters=k, init_size=1024, batch_size=2048, random_state=0)
        .fit(data)
        .inertia_
        for k in iters
    ]
    return iters, sse


# Elbow-curve data: tested cluster counts (x) and the matching inertia (y).
x, y = (list(series) for series in find_optimal_clusters(text, 10))
print(x)
print(y)

In [None]:
import plotly.graph_objs as go
# Elbow plot: inertia (y) against the number of clusters (x).
elbow_trace = go.Scatter(x=x, y=y)
elbow_layout = go.Layout(title="Elbow", height=400, width=500)
elbow = go.Figure(data=[elbow_trace], layout=elbow_layout)
elbow.show()

In [None]:
def kmeans(text_kmeans,number):
    """Cluster the rows of ``text_kmeans`` into ``number`` groups.

    Uses ``MiniBatchKMeans`` with a fixed ``random_state`` and returns the
    cluster label predicted for each row.
    """
    estimator = MiniBatchKMeans(
        n_clusters=number,
        init_size=1024,
        batch_size=2048,
        random_state=20,
    )
    return estimator.fit_predict(text_kmeans)

In [None]:
def plot_tsne_pca(data, labels):
    """Plot a PCA and a t-SNE projection of a random sample of ``data``.

    Parameters
    ----------
    data : scipy sparse matrix
        Must support row indexing and ``.toarray()``.
    labels : array-like of int
        Cluster label of every row of ``data``; drives the point colours.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with the PCA plot on the left and the t-SNE plot on the right.
    """
    labels = np.asarray(labels)
    # With a single cluster the largest label is 0; clamp the divisor so the
    # colour normalisation below does not compute 0/0.
    max_label = max(labels.max(), 1)
    # 3000 rows drawn with replacement, so small corpora are oversampled.
    max_items = np.random.choice(range(data.shape[0]), size=3000, replace=True)

    # Densify once and reuse it for both projections. toarray() yields a
    # plain ndarray, whereas todense() yields np.matrix, which recent
    # scikit-learn releases reject.
    sample = data[max_items, :].toarray()
    pca = PCA(n_components=2).fit_transform(sample)
    tsne = TSNE().fit_transform(PCA().fit_transform(sample))

    # Only 300 of the sampled points are drawn.
    idx = np.random.choice(range(pca.shape[0]), size=300, replace=False)
    label_subset = labels[max_items]
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')
    return f

In [None]:
from io import BytesIO
import base64
def fig_to_uri(in_fig, close_all=True, **save_args):
    """Encode a figure as a base64 PNG data URI.

    :param in_fig: object exposing ``savefig`` (and ``clf`` when
        ``close_all`` is true), e.g. a matplotlib figure
    :param close_all: when true, clear ``in_fig`` and close every pyplot
        figure once the image has been written
    :param save_args: extra keyword arguments forwarded to ``savefig``
    :return: a ``data:image/png;base64,...`` string
    """
    buffer = BytesIO()
    in_fig.savefig(buffer, format='png', **save_args)
    if close_all:
        in_fig.clf()
        plt.close('all')
    # getvalue() returns the whole buffer regardless of the stream position.
    raw_png = buffer.getvalue()
    encoded = base64.b64encode(raw_png).decode("ascii").replace("\n", "")
    return "data:image/png;base64,{}".format(encoded)

In [None]:
# Find the optimal number of clusters using modularity (the highest value wins)
from coclust.visualization import plot_max_modularities
from coclust.evaluation.internal import best_modularity_partition

# Candidate numbers of co-clusters: 2 to 9.
clusters_range = range(2, 10)
# NOTE: this rebinds `model`, which held the LDA model fitted earlier.
model, modularities = best_modularity_partition(
    text,
    clusters_range,
    n_rand_init=1,
)
# plot the modularities over the range of cluster numbers


In [None]:
from coclust.coclustering import CoclustMod
from coclust.clustering import SphericalKmeans
from coclust.coclustering import CoclustSpecMod
from scipy.sparse import coo_matrix
from coclust.visualization import plot_cluster_sizes, plot_cluster_top_terms
from coclust.visualization import (plot_cluster_top_terms,get_term_graph, plot_convergence)
# Co-clustering by modularity with a fixed number of co-clusters.
n_cluster = 8
model_mod = CoclustMod(
    n_clusters=n_cluster,
    random_state=0,
)
model_mod.fit(text)

In [None]:
def plot_cluster_top_terms(in_data, all_terms, nb_top_terms, model):
    """Plot the top terms of each co-cluster as horizontal bar charts.

    This definition shadows the function of the same name imported from
    ``coclust.visualization``; unlike that one it returns the figure so it
    can be embedded in the Dash page.

    Parameters
    ----------
    in_data : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
    all_terms : list of string
        list of all terms from the original data set
    nb_top_terms : int
        number of top terms to be displayed per cluster
    model : co-clustering model
        must expose ``n_clusters``, ``get_indices`` and ``get_submatrix``

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure (one subplot per cluster), or ``None`` when
        ``all_terms`` is ``None``.
    """
    if all_terms is None:
        # `logger` was never defined in this notebook, so this branch used
        # to raise NameError; use the standard logging module instead.
        import logging
        logging.getLogger(__name__).warning(
            "Term labels cannot be found. Use input argument "
            "'term_labels_filepath' in function "
            "'load_doc_term_data' if term labels are available.")
        return

    x_label = "number of occurences"
    fig = plt.figure(figsize=(15, 15))
    fig.suptitle("      Top %d terms" % nb_top_terms, size=15)
    number_of_subplots = model.n_clusters
    term_labels = np.array(all_terms)

    for cluster_idx in range(number_of_subplots):
        # Row/column indices belonging to this cluster (rows are not needed).
        _, col_indices = model.get_indices(cluster_idx)
        # Submatrix of the cluster, summed per term (column).
        cluster = model.get_submatrix(in_data, cluster_idx)
        term_counts = np.asarray(cluster.sum(0)).flatten()
        # Term names of this cluster, and the indices of its top terms.
        cluster_terms = term_labels[col_indices]
        max_indices = term_counts.argsort()[::-1][:nb_top_terms]

        # A cluster may hold fewer terms than requested: size the bar
        # positions from what is actually available.
        pos = np.arange(len(max_indices))

        # plt.subplot() returns a single Axes (unpacking it into two names
        # raised TypeError); create the Axes on the figure explicitly.
        ax1 = fig.add_subplot(number_of_subplots, 1, cluster_idx + 1)
        ax1.barh(pos, term_counts[max_indices][::-1])
        ax1.set_title("Cluster %d (%d terms)" % (cluster_idx + 1, len(col_indices)), size=11)

        # barh centres each bar on its position by default, so the labels
        # go at `pos` itself.
        ax1.set_yticks(pos)
        ax1.set_yticklabels(cluster_terms[max_indices][::-1], size=9.5)
        ax1.set_xlabel(x_label, size=9)
        ax1.margins(y=0.05)
        # Booleans instead of the legacy 'on'/'off' strings, which are
        # non-empty strings and therefore all truthy.
        ax1.tick_params(axis='both', which='both', bottom=True, top=False,
                        right=False, left=False)

    # Tight layout often produces nice results
    # but requires the title to be spaced accordingly
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
    return fig


In [None]:
# Dash application: Bootstrap theme, callbacks are not fired on page load.
app = dash.Dash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    prevent_initial_callbacks=True,
)


In [None]:
# Search form shown in the navbar: article count, keyword and submit button.
search_bar = dbc.Row(
    children=[
        dbc.Col(
            children=dbc.Input(
                id="num_max",
                type="number",
                placeholder="nombre d'articles",
                className="mr-3",
            ),
            className="col-4",
        ),
        dbc.Col(
            children=dbc.Input(
                id ="word",
                type="search",
                placeholder="Covid-19",
                className="mr-3",
            ),
            className="col-4",
        ),
        dbc.Col(
            children=dbc.Button(
                "recuperer",
                id="submitt_search",
                color="success",
                className="ml-2",
            ),
            width="auto",
        ),
    ],
    no_gutters=True,
    className="mr-auto flex-nowrap mt-3 mt-md-0",
    align="center",
    id="navbar_out",
)

# Sidebar navigation between the four pages.
# The entries are NavLink components: `active="exact"` (highlight the link
# whose href equals the current pathname) is a NavLink option, whereas
# Button.active is a plain boolean, so the previous Buttons all received a
# truthy "active" value.
sidebar = html.Div(
    id = "links",
    className="sidebare",
    children =[
        dbc.Nav(
            [
                dbc.NavLink("details fetching", href="/page-1", active="exact"),
                dbc.NavLink("K-means", href="/page-2", active="exact"),
                dbc.NavLink("LDA", href="/page-3", active="exact"),
                dbc.NavLink("Co-clustering", href="/page-4", active="exact")
            ],
            vertical=True,
            pills=True,
        ),
    ]
)
# Table listing the fetched corpus (one row per article), exportable as CSV.
graphe = dash_table.DataTable(
    id='table',
    columns=[{"name": column, "id": column} for column in df.columns],
    data=df.to_dict('records'),
    export_format="csv",
    style_cell={'whiteSpace': 'normal', 'height': 'auto'},
    style_table={
        'width': 950,
        'overflowY': 'auto',
        'overflowX': 'auto',
        'height': 540,
    },
)
# Empty row, currently a placeholder below the corpus table.
information_gen = dbc.Row(className="row")



In [None]:
# Content area of page 1: the corpus table followed by the placeholder row.
visualisation = html.Div(
    children=[graphe, information_gen],
    className="content",
)

In [None]:
# Top navigation bar: brand link, toggler for small screens and the
# collapsible search form.
navbar = dbc.Navbar(
    children=[
        html.A(
            # Row and Col control the vertical alignment of the brand.
            children=dbc.Row(
                children=[
                    dbc.Col(dbc.NavbarBrand("Corpus médical", className="ml-2")),
                ],
                align="center",
                no_gutters=True,
            ),
            href="https://plot.ly",
        ),
        dbc.NavbarToggler(id="navbar-toggler"),
        dbc.Collapse(children=search_bar, id="navbar-collapse", navbar=True),
    ],
    color="dark",
    dark=True,
)


In [None]:
# Page 1 ("details fetching"): the corpus table.
page_1_layout = html.Div(children=[visualisation], className="CONTENT_STYLE")

In [None]:
# Page 2 (K-means): elbow curve on the left; cluster-count slider and the
# resulting PCA / t-SNE image on the right.
page_2_layout =html.Div(
        dbc.Row([
        dbc.Col(dcc.Graph(figure = elbow),className="col-4"),
        dbc.Col([
            html.P("choose the number of clusters "),

            dcc.Slider(
            id='box_size',
            min=1,
            max=10,
            value=4,
            step=1,
            # `marks` must be a {value: label} dict (a list was passed
            # before); label every selectable value from min to max.
            marks={k: str(k) for k in range(1, 11)},
        ),
         html.Div([html.Img(id = 'cur_plot', src = '')],
                 id='plot_div')
        ],className="col-8")


    ],
    className="mr-auto flex-nowrap mt-3 mt-md-0",
),
    id="layout2"
)
        

In [None]:
# Page 4 (co-clustering): cluster-count slider and the top-terms image.
page_4_layout =html.Div(
        dbc.Row([
        # Component ids must be unique within a layout: this wrapper used
        # to share the id 'plot_div' with the one in the right column.
        dbc.Col(html.Div([html.Img(id = 'cur_plotco', src = '')],
                 id='plot_div_co'),className="col-4"),
        dbc.Col([
            html.P("choose the number of clusters "),

            dcc.Slider(
            id='box_sizecoc',
            min=1,
            max=10,
            value=4,
            step=1,
            # `marks` must be a {value: label} dict (a list was passed
            # before); label every selectable value from min to max.
            marks={k: str(k) for k in range(1, 11)},
        ),
         html.Div([html.Img(id = 'cur_plotcoc', src = '')],
                 id='plot_div')
        ],className="col-8")


    ],
    className="mr-auto flex-nowrap mt-3 mt-md-0",
),
    id="layout4"
)
        

In [None]:
# Page 3 (LDA): slider and the table of top words per topic.
# The ids used here are specific to this page. The slider used to reuse
# 'box_sizecoc' (input of the co-clustering callback) and the table used to
# reuse 'table' (output of the corpus-fetch callback), so those callbacks
# were wired to this page's components.
page_3_layout =html.Div(

    id="layout3",
    className="content",
    children =[
     dbc.Col([
                html.P("choose the number of clusters "),

            dcc.Slider(
            id='box_sizelda',
            min=1,
            max=10,
            value=4,
            step=1,
            # `marks` must be a {value: label} dict (a list was passed
            # before); label every selectable value from min to max.
            marks={k: str(k) for k in range(1, 11)},
        ),

     ]),
    dash_table.DataTable(
    id='topics_table',
    style_cell={
        'whiteSpace': 'normal',
        'height': 'auto',
    },
    style_table={
            'width': 950,
            'overflowY': 'auto',
            'overflowX': 'auto',
            'height': 540},
    columns=[{"name": i, "id": i} for i in topics.columns],
    data=topics.to_dict('records'),
    export_format="csv",

)]
)

In [None]:
# Root layout: store, URL tracker, navbar, sidebar and the container that
# display_page fills according to the current pathname.
app.layout = html.Div(
    children=[
        dcc.Store(id='storage'),
        dcc.Location(id="url"),
        navbar,
        sidebar,
        html.Div(id ="page-content"),
    ]
)

In [None]:
@app.callback(
    Output('cur_plot', 'src'),
    [Input('box_size', 'value')],
)
def update_graph( n_val):
    """Redraw the k-means PCA / t-SNE image for the selected cluster count.

    ``n_val`` is the slider value; the returned data URI becomes the
    ``src`` of the ``cur_plot`` image.
    """
    return fig_to_uri(plot_tsne_pca(text, kmeans(text, n_val)))

In [None]:
@app.callback(
    Output(component_id='cur_plotcoc', component_property='src'),
    [Input(component_id = 'box_sizecoc', component_property='value')]
)
def update_graph2( n_val):
    """Redraw the co-clustering top-terms image for the selected cluster count.

    ``n_val`` is the slider value. It used to be ignored in favour of a
    hard-coded 5, so moving the slider never changed the plot. The returned
    data URI becomes the ``src`` of the ``cur_plotcoc`` image.
    """
    model_mod = CoclustMod(n_clusters=n_val, random_state=0)
    model_mod.fit(text)
    fig = plot_cluster_top_terms(text, features, 5, model_mod)
    return fig_to_uri(fig)

In [None]:
# add callback for toggling the collapse on small screens
@app.callback(
    Output("navbar-collapse", "is_open"),
    [Input("navbar-toggler", "n_clicks")],
    [State("navbar-collapse", "is_open")],
)
def toggle_navbar_collapse(n,is_open):
    """Flip the collapse state when the toggler has been clicked.

    Returns ``is_open`` unchanged while ``n`` (the click count) is falsy.
    """
    return (not is_open) if n else is_open

In [None]:
@app.callback(
    Output("table", "data"),
    [Input("submitt_search","n_clicks")],
    [State('num_max', 'value'),
     State('word', 'value'),]
)
def fetch(n_clicks,num_max,word):
    """Fetch a new corpus when the search button is clicked.

    ``num_max`` and ``word`` are the current values of the two search
    inputs. Returns the corpus rows as a list of records for the table.
    Raises ``PreventUpdate`` (the table is left untouched) when the button
    has not been clicked or when either input is empty.
    """
    # Local import: the name is not imported at the top of the notebook.
    from dash.exceptions import PreventUpdate

    if not n_clicks or not num_max or not word:
        # Avoid calling recup with None for the count or the keyword.
        raise PreventUpdate
    df= recup(num_max,word)
    return df.to_dict('records')

In [None]:
@app.callback(
    Output('page-content', 'children'),
    [Input('url', 'pathname')],
)
def display_page(pathname):
    """Return the layout matching the current URL pathname.

    Unknown paths yield the plain string ``"404 not found"``.
    """
    if pathname == '/page-1':
        return page_1_layout
    if pathname == '/page-2':
        return page_2_layout
    if pathname == '/page-3':
        return page_3_layout
    if pathname == '/page-4':
        return page_4_layout
    # A dedicated "URL not found" page could be returned here instead.
    return "404 not found"

In [None]:
if __name__ == "__main__":
    # Several callbacks target components that only exist inside the page
    # layouts returned by display_page, not in the initial app.layout;
    # this flag is set before the server starts.
    app.config.suppress_callback_exceptions = True
    # Start the Dash server with its default settings.
    app.run_server()