## I - Imports

In [1]:
import ast
import copy
import math
import os
import re

import ipywidgets as widgets
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from bqplot import LinearScale, ColorScale, OrdinalColorScale, OrdinalScale
from ipydatagrid import DataGrid, TextRenderer, BarRenderer, Expr, VegaExpr,CellRenderer
from ipyfilechooser import FileChooser
from IPython.display import HTML
from nltk.corpus import stopwords
from plotly.subplots import make_subplots

In [3]:
from extraction import * 
from calculs import *

%load_ext autoreload
%autoreload 2

#### Notes perso : Les principales variables :
- **annotations** *(dict)* : contient les différents textes de l'annotation et leurs infos (occurrences, stems et places), ainsi que les entités et leurs catégories attribuées.
- **data** *(list)* : contient les mêmes informations contenues dans **annotations**, mais sous forme de liste. 
- **leven** *(dict)* : contient les stems de texte ayant été regroupés avec d'autres via la méthode de Levenshtein.
- **ent_cat** *(dict)* : contient les noms des entités et leurs catégories associées.
- **cat_info** (df) : contient quelques infos sur chaque catégorie courante après catégorisation.
- **metrics_locations** (dict) : contient le calcul des emplacements (VP,FP,FN) depuis les catégories, pour chaque entité. 

## II -  Widgets 

In [4]:
# Placeholder globals: they let the interface be built before any dataset has
# been loaded, extracted and normalized. initiate_extraction() rebinds them.
path = None
ent_cat = {'entity1': ['category?']}
df = pd.DataFrame({
    "entity": ['entity1'],
    "category": ['category?'],
    "text": ['text1'],
    "occurrences": [1],
    "stems": [['stem1']],
    "places": [[['text1', 1, 5]]],
})
df_tf_results = pd.DataFrame({
    'entity': ['entity1'],
    'word': ['word1'],
    'occurrences': ['occurrences1'],
    'tfidf': ['tfidf1'],
})
ban_words_tfidf = {'entity1': []}

In [5]:
def getEnt(ent_cat):
    """Return the entity names (the keys of ``ent_cat``) as a new list."""
    return list(ent_cat.keys())

def getCat(entity):
    """Return a copy of the category list stored for ``entity`` in the global ``ent_cat``."""
    return [category for category in ent_cat[entity]]

####################
# ENTITY SELECTION #
####################

# Shared UI state, read and rebound by the callbacks below:
#   current_entity   - entity currently selected (starts at the first key of ent_cat)
#   current_tabindex - tab index remembered by update_tabs() while it rebuilds the tabs
#   current_category - category shown by on_visualisation_categorie_change()
current_entity=getEnt(ent_cat)[0]
current_tabindex=0
current_category=''


def create_ban_words_tfidf(ent_cat) :
    """Return a dict mapping every entity of ``ent_cat`` to its own empty list of banned TF-IDF words."""
    return {entity: [] for entity in getEnt(ent_cat)}

def on_selection_change_entity(change):
    """Observer of the entity dropdown: switch the interface to the newly selected entity.

    ``change.new`` becomes ``current_entity``. The tabs are rebuilt, the first
    category of that entity becomes ``current_category``, and then the category
    view, the categorization, the results table and the donut chart are
    refreshed, in that order.
    """
    global tabs
    global current_entity
    global current_category
    current_entity=change.new
    update_tabs()
    current_category=getCat(current_entity)[0]
    # "None" is the sentinel that keeps current_category as it was just set
    on_visualisation_categorie_change("None")
    categorization()
    display_categorization_results()
    creation_donut()

# Entity dropdown placed in the toolbar by create_tabs(); every change of its
# value is handled by on_selection_change_entity.
button_selection_entity = widgets.Dropdown(options = getEnt(ent_cat),value=current_entity)
button_selection_entity.observe(on_selection_change_entity, names='value')
    
def update_button_selection_entity_options(ent_cat):    
    """Replace the options of the entity dropdown with the entities of ``ent_cat`` and return the dropdown."""
    button_selection_entity.options = getEnt(ent_cat)
    return button_selection_entity

def create_button_results():
    """Create the "Results" toggle button; toggling it calls print_results with the change."""
    toggle = widgets.ToggleButton(
        description ="Results",
        button_style='info',
        icon='poll',
        layout=widgets.Layout(width='100px'),
    )
    toggle.observe(print_results, names='value')
    return toggle

#################
# PRINT RESULTS #
#################

output_results=widgets.Output()
def print_results(value):
    """Show the results grid in ``output_results`` when the toggle is on, clear it otherwise.

    The grid is computed before the output area is entered. Returns ``output_results``.
    """
    grid = calculate_dg_results() if value.new==True else None
    with output_results:
        output_results.clear_output()
        if grid is not None:
            display(grid)
    return output_results

# Background colour (green) of the "TP" column; passed to Expr in calculate_dg_results.
# NOTE(review): presumably ipydatagrid transpiles this function's source, so keep the body minimal - confirm.
def background_color_tp(cell):
    return "#90be6d"

# Background colour (red) of the "FP" column; passed to Expr in calculate_dg_results.
def background_color_fp(cell):
    return "#f94144"

# Background colour (yellow) of the "FN" column; passed to Expr in calculate_dg_results.
def background_color_fn(cell):
    return "#f9c74f"
        
def calculate_dg_results():
    """Build the per-entity results grid (TP, FP, FN, precision, coverage).

    For every entity of ``ent_cat`` the categorization is re-run, the location
    metrics are computed, and:
      * TP / FP are the sums of the matching columns of the metrics frame,
      * FN is the number of occurrences of that entity still labelled "category?",
      * precision = TP / (TP + FP) and coverage = TP / (TP + FN), rounded to
        two decimals (0 when the denominator is 0).

    Returns:
        DataGrid: one row per entity, with coloured TP/FP/FN cells and bar
        renderers for precision and coverage.
    """
    results=[]
    for entity in ent_cat:
        # NOTE(review): categorization() already handles every entity; it is kept inside
        # the loop in case calculate_location_metrics alters df/other_categories - confirm.
        categorization()
        metrics_locations = calculate_location_metrics(entity,ent_cat,path,df,other_categories)
        df_metrics = calculate_df_metrics(pd.DataFrame(metrics_locations[entity],columns=["category", "result","text","file", "places"]))
        TP = df_metrics['TP'].sum()
        FP = df_metrics['FP'].sum()
        # Exact comparisons: a substring/regex match on the entity name would also
        # count the rows of any other entity whose name contains this one.
        uncategorized = (df['category'] == "category?") & (df['entity'] == entity)
        FN = df.loc[uncategorized, 'occurrences'].sum()
        precision = round(TP/(TP+FP),2) if TP+FP != 0 else 0
        coverage = round(TP/(TP+FN),2) if TP+FN != 0 else 0
        results.append([entity,TP,FP,FN,precision,coverage])

    df_results = pd.DataFrame(results,columns=["entity","TP","FP","FN","precision","coverage"])
    dg_results = DataGrid(df_results,column_widths={"entity":400,"TP":50,"FP":50,"FN":50,"precision":80,"coverage":80},layout={"height":"280px"})

    renderers = {
        "TP": TextRenderer(horizontal_alignment="center", bold=True, background_color=Expr(background_color_tp)),
        "FP": TextRenderer(horizontal_alignment="center", bold=True, background_color=Expr(background_color_fp)),
        "FN": TextRenderer(horizontal_alignment="center", bold=True, background_color=Expr(background_color_fn)),
        "precision": BarRenderer(horizontal_alignment="center",bar_color=ColorScale(min=0, max=1, scheme="cividis"),bar_value=LinearScale(min=0, max=1)),
        "coverage": BarRenderer(horizontal_alignment="center",bar_color=ColorScale(min=0, max=1, scheme="cividis"),bar_value=LinearScale(min=0, max=1)),
    }
    dg_results.renderers = renderers
    return dg_results

#############
# SAVE JSON #
#############

button_save = widgets.Button(description ="Save", button_style='info',icon='save',layout=widgets.Layout(width='70px'))

def _on_button_save_clicked(b):
    """Click handler of the save button: persist ``ent_cat`` for ``path`` and confirm it."""
    save_progress(path,ent_cat)
    with output_results:
        output_results.clear_output()
        print("Progress saved")
    # back to the neutral style: there are no unsaved changes any more
    button_save.button_style='info'

def create_button_save():
    """Return the shared save button with its click handler attached exactly once.

    The button is a module-level singleton. The handler is therefore a
    module-level function that is unregistered before being registered again,
    so rebuilding the interface never makes one click save several times.
    """
    button_save.on_click(_on_button_save_clicked, remove=True)
    button_save.on_click(_on_button_save_clicked)

    return button_save

In [12]:
######################
# TAB 0 : LOAD FILES #
######################

def initiate_extraction(file_path):
    """Load a dataset from ``file_path`` and (re)initialise the whole interface.

    Steps:
      1. load the documents, extract the annotations and stem them;
      2. rebind the globals ``path``, ``ent_cat``, ``df`` and ``df_tf_results``;
      3. restore previously saved categories, when ``load_progress`` returns any;
      4. refresh the entity dropdown, run the categorization and redraw the
         results table, the donut chart, the tabs and the category view.

    Args:
        file_path: path given to ``load_from_brat``. When it is empty
            (presumably ``None`` while nothing is selected in the file
            chooser - confirm), a message is shown and nothing else happens.
    """
    global path
    global ent_cat
    global df
    global df_tf_results
    global current_entity
    global ban_words_tfidf

    # Nothing selected yet: report it instead of failing inside the loader
    if not file_path:
        with output_t0:
            output_t0.clear_output()
            print("No path selected")
        return

    # 1 - Extraction + Stemming
    docs = load_from_brat(file_path, merge_all_fragments=True) #docs : generator ; doc : dict
    annotations = stemming(extract_annotations(docs,need_translation = False))

    # 2 - Initialization of major variables
    path=file_path
    data,ent_cat = createData(annotations)
    df = pd.DataFrame(data, columns=["entity", "category", "text", "occurrences", "stems", "places"])
    df_tf_results = calculate_tfidf(ent_cat,df)

    with output_t0:
        output_t0.clear_output()
        print(f"Selected Path: {file_path}")
        print("Extraction and Normalisation Done")
        print("Entities : "+str(getEnt(ent_cat)))

    # 3 - Restore saved categories BEFORE any widget is refreshed, so the dropdown,
    #     the banned-word lists and the first rendering all use the restored categories
    progress_ent_cat = load_progress(path)
    if progress_ent_cat:
        ent_cat = progress_ent_cat

    # 4 - Refresh the interface
    ban_words_tfidf = create_ban_words_tfidf(ent_cat)
    current_entity=getEnt(ent_cat)[0]
    update_button_selection_entity_options(ent_cat)

    categorization()
    display_categorization_results()
    creation_donut()
    update_tabs()
    # Drawn last so that the category view reflects the categorization that just ran
    on_visualisation_categorie_change({'new':getCat(current_entity)[0]})
    
    
output_t0 = widgets.Output()
t0 = None
def create_t0():
    """Build tab 0 ("load files"): a Load button, a file chooser and the log output.

    The widget is stored in the global ``t0`` and returned.
    """
    global t0

    chooser = FileChooser()
    load_button = widgets.Button(description='Load', button_style='Primary', icon='upload', layout=widgets.Layout(width='100px'))
    # The chooser's selection is read when the button is clicked, not when it is created
    load_button.on_click(lambda _clicked: initiate_extraction(chooser.selected))

    t0 = widgets.VBox([widgets.HBox([load_button, chooser]), output_t0])
    return t0

In [13]:
#####################################
# TAB I : VISUALISATION DE L'ENTITE #
#####################################

def on_visualisation_categorie_change(change):
    """Redraw the text/occurrences grid of the current entity and category.

    Args:
        change: either the sentinel string "None" (keep ``current_category``)
            or a mapping whose 'new' entry becomes ``current_category``.
    """
    global current_category
    if change!= "None":
        current_category=change['new']
    with output_t1a1:
        output_t1a1.clear_output()
        selected = (df['entity'] == current_entity) & (df['category'] == current_category)
        rows = df.loc[selected].filter(["text","occurrences"])
        grid = DataGrid(rows,column_widths={"text":800,"occurrences":100},layout={"height":"350px"},base_row_size=25)
        display(grid)

output_t1a1 = widgets.Output()
def create_t1a1():
    """Build the "Categories visualization" panel: the category dropdown above its grid output.

    The handler is registered by reference rather than through a fresh lambda.
    ``selection_category`` is shared and this function runs every time the tabs
    are built, so a new lambda per call would stack observers and redraw the
    grid several times per change; an already-registered handler is not added again.
    """
    selection_category.observe(on_visualisation_categorie_change,names='value')
    t1a1 = widgets.VBox([selection_category,output_t1a1])
    return t1a1

def color_row(value):
    """Return a green background style when ``value`` is the current entity, an empty dict otherwise."""
    return {'background_color': 'green'} if value == current_entity else {}

def on_button_search_clicked(output_t1a2,word):
    """Run the concordancer for ``word`` on the current entity and show the result in ``output_t1a2``.

    The documents are reloaded from the global ``path`` on every search.
    """
    with output_t1a2:
        output_t1a2.clear_output()
        docs = load_from_brat(path, merge_all_fragments=True)
        res_concordancer = calculate_concordancer(word,current_entity,path,docs)
        display(res_concordancer)
    
def create_t1a2():
    """Build the "Concordancer" panel: a word field, a search button and the result output."""
    output_t1a2 = widgets.Output()
    text_t1a2 = widgets.Text(value="",description ="Word :")
    # label fixed: it used to read 'Seach'
    button_search = widgets.Button(description = 'Search',button_style='primary',icon='search')
    button_search.on_click(lambda _: on_button_search_clicked(output_t1a2,text_t1a2.value))
    t1a2 = widgets.VBox([widgets.HBox([text_t1a2,button_search]),output_t1a2])
    return t1a2

# Tab 1 widget, kept in a global so that the tab bar can be rebuilt without recreating it
t1=None
def create_t1():
    """Build tab 1 as an accordion of the category visualization and the concordancer; store it in ``t1``."""
    global t1
    t1 = widgets.Accordion([create_t1a1(),create_t1a2()],titles=('Categories visualization','Concordancer'))
    return t1

In [14]:
###########################
# TAB II : Categorization #
###########################

##############################
# T2a1 : Category modification

def on_button_add_category_clicked(_):
    """Append an empty category ("[]") to the current entity and rebuild tab 2 only."""
    
    global tabs
    ent_cat[current_entity].append("[]")
    tabs.children = (t0,t1,create_t2(),t3) # refresh tab 2 only, not tab 3: rebuilding tab 3 crashes because it searches for all the empty patterns
    # warning style signals that the categorization has to be run again
    categorization_button.button_style='warning'
    
def modification_ent_cat(i,widget,):
    """Delete the category at position ``i`` of the current entity and refresh the tabs.

    The entry is removed by index. ``list.remove(value)`` would delete the
    first entry equal to that value, which is a different position (and so a
    different priority order) whenever the same category appears twice.

    Args:
        i: index of the category in ``ent_cat[current_entity]``.
        widget: the tag widget of that category (unused).
    """
    del ent_cat[current_entity][i]
    update_tabs()
    # warning style signals that the categorization has to be run again
    categorization_button.button_style='warning'
        
def on_tag_change(change,i):
    """Store the edited tags of category ``i`` and flag the categorization as outdated."""
    # categories are kept as the repr() string of the tag list
    ent_cat[current_entity][i] = repr(change['new'])  
    categorization_button.button_style='warning'

def create_categories_tags():
    """Build the category editor of the current entity.

    Every category except "category?" gets one row: a label, a delete button
    and a tag input holding its terms. An "add category" button is placed
    below the rows.

    Categories are stored as the repr() of a list of strings. They are parsed
    with ``ast.literal_eval`` rather than ``eval`` because ``ent_cat`` can come
    from a saved progress file, and only literals should be accepted from it.
    """
    rows=[]
    for i,category in enumerate(ent_cat[current_entity]):
        if category == "category?":
            continue
        text = widgets.Label(value="Category "+str(i)+" :")
        tag = widgets.TagsInput(value=ast.literal_eval(category),allow_duplicates=True)
        # default arguments freeze the index/widget of this row inside the callbacks
        tag.observe(lambda change,inti=i : on_tag_change(change,inti), names='value')
        button_supp = widgets.Button(button_style='danger',icon='minus-circle',layout=widgets.Layout(width='40px'))
        button_supp.on_click(lambda _, inti=i,current_tag=tag: modification_ent_cat(inti,current_tag))
        rows.append(widgets.HBox([text,button_supp,tag]))
    tags=widgets.VBox(rows)
    button_add_category = widgets.Button(button_style='success',icon='plus-circle',layout=widgets.Layout(width='70px'))
    button_add_category.on_click(on_button_add_category_clicked)
    return widgets.VBox([tags,button_add_category])

def on_tfidf_removed(change):
    """Ban the word that was just removed from a TF-IDF tag for the current entity, then rebuild the tabs."""
    removed_word = change['old'][0]
    ban_words_tfidf[current_entity].append(removed_word)
    update_tabs()
    
def create_t2a1_texts():
    """Build the "recommended terms" widgets of the current entity.

    First row: the top TF-IDF words (banned words excluded), each as a disabled
    tag followed by its occurrence count; removing a tag triggers
    on_tfidf_removed. Below it: one row per keyword that has at least one
    n-gram, listing those n-grams with their occurrence counts.
    """
    # TF-IDF widgets
    top_words = attribution_tf(current_entity,10,df_tf_results,ban_words_tfidf[current_entity])
    tfidf_row = []
    for word, occurrence in attribution_tf_occurrences(top_words,df_tf_results,current_entity):
        word_tag = widgets.TagsInput(value=word,disabled=True)
        word_tag.observe(on_tfidf_removed,names='value')
        tfidf_row += [word_tag, widgets.HTML(value="("+str(occurrence)+")")]
    widgets_tf_words = widgets.HBox(tfidf_row)

    # N-gram widgets
    raw_n_grams = calculate_n_grams(df,current_entity,df_tf_results,ban_words_tfidf[current_entity])
    grams_by_keyword = treate_n_grams(raw_n_grams,5)

    ngram_rows = []
    for keyword in grams_by_keyword:
        row = [widgets.HTML(value=str(keyword)+":")]
        for gram, occurrence in grams_by_keyword[keyword].items():
            row += [widgets.TagsInput(value=gram), widgets.HTML(value="("+str(occurrence)+")")]
        # a row holding only the keyword label has no n-gram: leave it out
        if len(row) > 1:
            ngram_rows.append(widgets.HBox(row))

    return widgets.VBox([widgets.VBox([widgets_tf_words]),widgets.VBox(ngram_rows)])

def on_categorization_button_clicked(b):
    """Re-run the categorization and refresh every view that depends on it."""
    categorization()
    display_categorization_results()
    creation_donut()
    update_tabs()
    # categorization is up to date again ...
    categorization_button.button_style='primary'
    # ... but the result has not been saved yet
    button_save.button_style='warning'
    on_visualisation_categorie_change("None")

# Shared "Categorization" button; other callbacks switch its style to 'warning'
# when the categories were edited since the last run.
categorization_button = widgets.Button(button_style='primary',description = 'Categorization',icon='tasks')
categorization_button.on_click(on_categorization_button_clicked)
def create_t2a1():
    """Build the upper part of tab 2: recommended terms, category editor and categorization button."""
    heading_style = "height: 20px; line-height: 20px; text-align: left; display: flex; align-items: center;"
    title1 = "I - Recommended terms"  # spelling fixed: it used to read "Recommanded"
    title2 = "II -  Category creation"
    t2a1_texts= create_t2a1_texts()
    t2a1_title1 = widgets.HTML(value=f"<h2 style='{heading_style}'>{title1}</h2>")
    t2a1_title2 = widgets.HTML(value=f"<h2 style='{heading_style}'>{title2}</h2>")
    t2a1_tags = create_categories_tags()
    return widgets.VBox([t2a1_title1,t2a1_texts,t2a1_title2,t2a1_tags,space,categorization_button])

###############################
# T2a2 : Category visualisation

def creation_donut():
    """Redraw, in ``t2a2_output2``, the donut chart of annotations per category for the current entity."""
    with t2a2_output2:
        t2a2_output2.clear_output()
        cat_infos = create_cat_infos()
        donut = go.Pie(
            labels=cat_infos['category'].tolist(),
            values=cat_infos['total_annotations_number'].tolist(),
            hole=0.5,
            showlegend=False,
            marker=dict(colors=colors),
        )
        fig = go.Figure(data=[donut], layout=go.Layout(template="plotly_dark"))
        fig.update_layout(
            title="Categories distribution for the entity : "+str(current_entity),
            title_x=0.5,
            margin=dict(t=50),
        )
        fig.show()

def apply_colors(row,col):
    """Row styler: colour the first cell of ``row`` and leave the others unstyled.

    The colour is ``col[row.name % len(col)]`` with "90" appended (alpha suffix).
    Returns one CSS string per cell of the row.
    """
    tint = f'{col[row.name % len(col)]}90'
    untouched = [''] * (len(row) - 1)
    return [f'background-color: {tint}'] + untouched
    
def display_categorization_results():
    """Redraw, in ``t2a2_output1``, the per-category table sorted by total annotations (descending)."""
    with t2a2_output1:
        t2a2_output1.clear_output()
        ranked = create_cat_infos().sort_values(by='total_annotations_number',ascending=False)
        display(ranked.style.apply(apply_colors, axis=1, col=colors))
    
other_categories = []
def categorization():
    """Assign a category to every row of ``df`` and collect the secondary matches.

    For each row, the categories of its entity are tried in the order of
    ``ent_cat[entity]``. The first one whose pattern is found in the text
    becomes ``df['category']`` ("category?" when none matches). Every further
    match is recorded in the global ``other_categories`` as
    ``[entity, category, text, file + ".txt", [start, end]]``, once per place.

    Empty categories ("[]") of the current entity are dropped first.

    Matches are kept in lists instead of being joined into one string with
    "AND" and split again, so a category whose text contains "AND" can no
    longer corrupt the result. Category strings are parsed with
    ``ast.literal_eval`` instead of ``eval`` because ``ent_cat`` can come from
    a saved progress file.
    """
    global other_categories

    # Erase all '[]' categories
    ent_cat[current_entity] = [element for element in ent_cat[current_entity] if element != '[]']

    # One (category, pattern) list per entity, built once and kept in priority order
    patterns = {
        ent: [(cat, generate_regex(ast.literal_eval(cat))) for cat in cats if cat != "category?"]
        for ent, cats in ent_cat.items()
    }

    assigned = []
    secondary = []
    for entity, text, places in zip(df['entity'], df['text'], df['places']):
        matches = [cat for cat, pattern in patterns.get(entity, []) if re.search(pattern, text)]
        assigned.append(matches[0] if matches else "category?")
        for cat in matches[1:]:
            for place in places:
                secondary.append([entity, cat, text, place[0]+".txt", [place[1], place[2]]])

    df['category'] = assigned
    other_categories = secondary
    
        
def create_cat_infos():
    """Summarise the categories of the current entity.

    Returns:
        DataFrame: one row per category of ``ent_cat[current_entity]`` with
        the columns "category", "total_annotations_number" (sum of the
        occurrences) and "total_different_annotations_number" (number of rows).

    Rows are selected with exact comparisons. A substring/regex match on the
    entity name would also count the rows of any other entity whose name
    contains the current one.
    """
    of_entity = df['entity'] == current_entity
    infos=[]
    for cat in ent_cat[current_entity]:
        occurrences = df.loc[of_entity & (df['category'] == cat), 'occurrences']
        infos.append([cat,occurrences.sum(),occurrences.shape[0]])
    df_cat_infos = pd.DataFrame(infos,columns=["category", "total_annotations_number","total_different_annotations_number"])
    return df_cat_infos

# Output areas filled by display_categorization_results() (table) and creation_donut() (chart)
t2a2_output1 = widgets.Output()
t2a2_output2 = widgets.Output()
def create_t2a2():
    """Build the lower part of tab 2: a heading above the donut chart and the category table.

    Only the layout is created here; the two outputs are filled elsewhere, so
    the category statistics are no longer computed (and discarded) in this function.
    """
    title = "III - Categorization distribution"
    t2a2_title = widgets.HTML(value=f"<h2 style='height: 30px; line-height: 30px; text-align: left; display: flex; align-items: center;'>{title}</h2>")
    t2a2_outputs = widgets.HBox([t2a2_output2,t2a2_output1])
    return widgets.VBox([t2a2_title,t2a2_outputs])

################
# Tab 2 creation

t2=None
def create_t2():
    """Build tab 2 (category editing above the categorization distribution) and return it as a VBox."""
    global t2
    # NOTE(review): unlike t0/t1/t3, the global t2 holds the list of children, not the returned VBox
    t2 = [create_t2a1(),create_t2a2()]
    return widgets.VBox(t2)

In [15]:
#################################
# TAB III : RESULTATS METRIQUES #
#################################
def create_t3a1():
    """Build the metrics tab content for the current entity.

    It shows the styled metrics table (precision and coverage with two
    decimals) followed by the grid of the locations found for each pattern.
    """
    def heading(title):
        return widgets.HTML(value=f"<h2 style='height: 30px; line-height: 30px; text-align: left; display: flex; align-items: center;'>{title}</h2>")

    t3a1_title = heading("I - Metrics Results")
    t3a2_title = heading("II - Summary table of found pattern")

    metrics_locations = calculate_location_metrics(current_entity,ent_cat,path,df,other_categories)
    locations_frame = pd.DataFrame(metrics_locations[current_entity],columns=["category", "result","text","file", "places"])
    locations_grid = create_grid_metrics_locations(metrics_locations,current_entity)
    metrics_frame = calculate_df_metrics(locations_frame)

    metrics_output = widgets.Output()
    locations_output = widgets.Output()

    with metrics_output:
        metrics_output.clear_output()
        styled_metrics = metrics_frame.style.format({'precision': '{:.2f}', 'coverage': '{:.2f}'})
        display(styled_metrics.apply(apply_colors, axis=1, col=colors1))

    with locations_output:
        locations_output.clear_output()
        display(locations_grid)

    return widgets.VBox([t3a1_title,metrics_output,t3a2_title,locations_output])

# Tab 3 widget, kept in a global so that tab 2 can be rebuilt without recomputing the metrics
t3 =None
def create_t3():
    """Build tab 3 (metrics), store it in the global ``t3`` and return it."""
    global t3
    #t3 = widgets.Output()
    t3 = create_t3a1()
    return t3

In [16]:
########
# TABS #
########

# Palettes for the category tables and the donut chart; colors1 is colors without its first (grey) entry
colors = ['#ced4da', '#580aff', '#147df5', '#0aefff', '#0aff99', '#a1ff0a', '#deff0a','#ff0000']
colors1 = ['#580aff', '#147df5', '#0aefff', '#0aff99', '#a1ff0a', '#deff0a','#ff0000']
# CSS overrides: cap the width of output areas and enlarge the dropdown font
display(HTML("<style>.widget-output{max-width: 1200px !important; }</style>"))
display(HTML("<style>.widget-dropdown select, .widget-dropdown .widget-readout { font-size: 14px !important; }</style>"))  


# Shared widgets used by the tab builders above
selection_category=widgets.Dropdown(options = getCat(current_entity),value=getCat(current_entity)[0],description ='Category : ',layout={'width': '700px'})
space = widgets.HTML(layout=widgets.Layout(height='10px'))
# NOTE(review): create_tabs() builds the tabs again, so every create_t*() runs at least twice
tabs = widgets.Tab([create_t0(),create_t1(), create_t2(),create_t3()],selected_index=0)

def create_tabs():
    """Build the four tabs and the toolbar, then display the whole interface.

    The toolbar holds the entity dropdown, the results toggle and the save
    button; the results output sits between the toolbar and the tabs.
    """
    global tabs
    tabs = widgets.Tab([create_t0(),create_t1(), create_t2(),create_t3()],selected_index=0)
    for position, title in enumerate(('Load data', 'Data visualization', 'Categorization', 'Metrics')):
        tabs.set_title(position, title)
    toolbar = widgets.HBox([button_selection_entity,create_button_results(),create_button_save()])
    interface = widgets.VBox([toolbar,output_results,tabs],layout={'border': '2px solid lightblue','width':'100%'})
    on_visualisation_categorie_change({'new':getCat(current_entity)[0]})
    display(interface)
    
def update_tabs():
    """Rebuild tabs 2 and 3, refresh the category dropdown options and keep the selected tab."""
    global tabs
    global current_tabindex
    # remember the selected tab, then restore it once the children are replaced
    current_tabindex = tabs.selected_index
    tabs.children = (t0,t1,create_t2(),create_t3())
    selection_category.options = getCat(current_entity)
    
    tabs.selected_index = current_tabindex

## III -  User interface

In [21]:
# Build and display the interface
create_tabs()

VBox(children=(HBox(children=(Dropdown(index=2, options=('histologie_tumorale', 'traitement_specifique_du_canc…

In [20]:
# Scratch cell: manual override of ent_cat with hand-written categories.
# Each category is the repr() of a tag list; in these lists '/' appears between alternative terms.
ent_cat = {'histologie_tumorale': ['category?',
  "['carcinome canalaire']",
  "['grandes cellules']",
  "['cellules dendritiques']",
  "['neuroendocrine', '/', 'carcinome neuroendocrinien']",
  "['luminal a']",
  "['merkel']",                 
  "['insulinome']",
  "['adénocarcinome séreux', '/', 'séreux papillaire']",
  "['basocellulaires']",
  "['germinale']",
  "['composante mucineuse', '/', 'mucineuse']"],
 'traitement_specifique_du_cancer': ['category?',
  "['capécitabine', '/', 'topotécan', '/', 'carboplatine', '/', 'vinorelbine', '/', 'paclitaxel', '/', 'gemcitabine', '/', 'r-chop', '/', 'methotrexate', '/', 'méthotrexate', '/', 'doxorubicine', '/', 'r-gemox', '/', 'bep', '/', 'platine', '/', '5-fu', '/', 'docétaxel', '/', 'anthracycline', '/', 'témozolamide','/','fac']",
  "['radiothérapie']",
  "['sunitinib', '/', 'évérolimus', '/', 'vismodegib', '/', 'erlotinib', '/', 'tamoxifène', '/', 'rituximab', '/', 'exémestane', '/', 'somatuline', '/', 'létrozole', '/', 'fulvestrant']",
  "['tumorectomie', '/', 'mastectomie', '/', 'craniôtomie', '/', 'orchidectomie']",
  "['chimiothérapie']"],
 'signes_physiques': ['category?',
  "['adénopathie']",
  "['hypoglycémie']",
  "['parésie']",
  "['œdème']",
  "['hypoesthésie']"],
 'evolutivite_en_lien_avec_le_cancer': ['category?'],
 'reponse_a_la_chimiotherapie': ['category?'],
 'stade_metastatique_avec_localisations': ['category?'],
 'statut_tabagique': ['category?'],
 'atcd_geriatriques_et_medicaux_significatifs_pour_la_prise_en_charge': ['category?'],
 'stade_oms_ecog_karnofsky': ['category?'],
 'biomarqueurs_therapeutiques': ['category?'],
 'topographie_du_primitif': ['category?'],
 'symptomes': ['category?']}

In [20]:
# Scratch cell: inspect the dataset path currently loaded
path

'C:\\Users\\User\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Python 3.11\\jupyter_playground\\LIMICS\\datasets\\cantemist-fr-18.04.2024\\'

In [115]:
# Scratch cell: manual override of ent_cat to try '...N' tags between two terms.
# NOTE(review): '...15' / '...40' / '...100' presumably bound the gap allowed by generate_regex - confirm.
ent_cat = {'histologie_tumorale': ['category?',"['carcinome','...15','cellules']","['carcinome','...40','cellules']","['carcinome','...100','cellules']"],
 'traitement_specifique_du_cancer': ['category?'],
 'signes_physiques': ['category?'],
 'evolutivite_en_lien_avec_le_cancer': ['category?'],
 'reponse_a_la_chimiotherapie': ['category?'],
 'stade_metastatique_avec_localisations': ['category?'],
 'statut_tabagique': ['category?'],
 'atcd_geriatriques_et_medicaux_significatifs_pour_la_prise_en_charge': ['category?'],
 'stade_oms_ecog_karnofsky': ['category?'],
 'biomarqueurs_therapeutiques': ['category?'],
 'topographie_du_primitif': ['category?'],
 'symptomes': ['category?']}

In [123]:
# Scratch cell: inspect the secondary matches collected by categorization()
other_categories

[['histologie_tumorale',
  "['neuroendocrine', '/', 'carcinome neuroendocrinien']",
  'carcinome neuroendocrinien à grandes cellules',
  'cc_onco103.txt',
  [2177, 2222]],
 ['histologie_tumorale',
  "['neuroendocrine', '/', 'carcinome neuroendocrinien']",
  'carcinome neuroendocrinien métastatique à grandes cellules',
  'cc_onco103.txt',
  [4609, 4667]],
 ['traitement_specifique_du_cancer',
  "['chimiothérapie']",
  'chimiothérapie néoadjuvante avec un schéma ac pendant 2 cycles suivi de docétaxel pendant 4 cycles',
  'cc_onco1000.txt',
  [2229, 2327]],
 ['traitement_specifique_du_cancer',
  "['chimiothérapie']",
  'chimiothérapie à base de platine',
  'cc_onco103.txt',
  [4887, 4919]],
 ['traitement_specifique_du_cancer',
  "['radiothérapie']",
  '5-fu et une radiothérapie concomitante (rt) à visée néoadjuvante',
  'cc_onco110.txt',
  [1473, 1537]],
 ['traitement_specifique_du_cancer',
  "['chimiothérapie']",
  'chimiothérapie adjuvante avec 5-fu + leucovorine',
  'cc_onco110.txt',
  

In [121]:
# Scratch cell: copy of the second loop of categorization(), writing to other_categoriesx instead.
# It expects df['category'] to still hold the "AND"-joined matches and overwrites it with the first match.
other_categoriesx=[]

for index, row in df.iterrows():
    categories = row['category'].split("AND")
    for i, category in enumerate(categories):
        # index 0 is "category?" and index 1 the primary category: only later matches are recorded
        if i > 1:
            for place in row['places']:
                other_categoriesx.append([row['entity'],category.strip(),row['text'],place[0]+".txt",[place[1],place[2]]])
    if len(categories) > 1: 
        df.at[index, 'category'] = categories[1].strip()

In [124]:
# Scratch cell: inspect the list built by the scratch loop above
other_categoriesx

Unnamed: 0,entity,category,text,occurrences,stems,places
0,traitement_specifique_du_cancer,['chimiothérapie'],chimiothérapie,4,[chimiothérap],"[[cc_onco118, 3328, 3342], [cc_onco120, 6057, ..."
1,symptomes,category?,alopécie,4,[alopec],"[[cc_onco125, 1216, 1224], [cc_onco125, 1370, ..."
2,symptomes,category?,crampes,4,[cramp],"[[cc_onco125, 1554, 1561], [cc_onco125, 1243, ..."
3,histologie_tumorale,['carcinome canalaire'],carcinome canalaire infiltrant,3,"[carcinom, canalair, infiltr]","[[cc_onco1000, 2086, 2116], [cc_onco109, 395, ..."
4,traitement_specifique_du_cancer,"['capécitabine', '/', 'topotécan', '/', 'carbo...",topotécan,3,[topotécan],"[[cc_onco103, 4469, 4478], [cc_onco111, 3416, ..."
...,...,...,...,...,...,...
357,symptomes,category?,hémiparésie gauche et d'un faible niveau de co...,1,"[hémipares, gauch, faibl, niveau, conscienc]","[[cc_onco120, 8254, 8308]]"
358,symptomes,category?,asthénie et d'une perte d'appétit,1,"[asthen, pert, appet]","[[cc_onco125, 1420, 1453]]"
359,symptomes,category?,dysgueusie et de l'anosmie,1,"[dysgueus, anosm]","[[cc_onco125, 2059, 2085]]"
360,symptomes,category?,nausées,1,[naus],"[[cc_onco125, 2580, 2587]]"


In [18]:
# Widen pandas' display limits so that long annotation texts and large tables are shown in full
pd.set_option('display.width', 2000)
pd.options.display.max_rows=2000
pd.set_option('display.max_colwidth', 80)

In [19]:
# Scratch cell: try a FileUpload widget as an alternative way of selecting files
widget_select = widgets.FileUpload(accept='',multiple=True)
widget_select

FileUpload(value=(), description='Upload', multiple=True)

In [12]:
# Scratch cell: standalone FileChooser used for experiments
fc = FileChooser()

In [98]:
# Scratch cell: show the file name currently selected in the experimental chooser
display(fc.selected_filename)




In [41]:
# Scratch cell: inspect the first uploaded file (fails when nothing has been uploaded)
file=widget_select.value[0]
file
#folder_path = os.path.dirname(filename)

{'name': 'cc_onco102.ann',
 'type': '',
 'size': 1838,
 'content': <memory at 0x000001FE8B6557C0>,
 'last_modified': datetime.datetime(2024, 4, 15, 9, 11, 27, tzinfo=datetime.timezone.utc)}

In [122]:
# Scratch cell: top TF-IDF words of the current entity, banned words excluded
attribution_tf(current_entity,10,df_tf_results,ban_words_tfidf[current_entity])
#df_tf_results

['cellules',
 'grandes',
 'carcinome',
 'adénocarcinome',
 'sous-type',
 'séreux',
 'malin',
 'mucineuse',
 'néoplasie']

In [136]:
# Scratch cell: print every word of the current entity by decreasing TF-IDF score
top_words= df_tf_results.sort_values(by='tfidf',ascending=False).reset_index(drop=True).groupby('entity')
for index,row in top_words.get_group(current_entity).iterrows():
    print(row['word'])

cellules
grandes
carcinome
adénocarcinome
sous-type
dendritiques
séreux
malin
mucineuse
néoplasie
neuroendocrinien
b
blastique
plasmacytoïdes
lymphome
diffus
luminal
papillaire
tumeur
canalaire
infiltrant
composante
merkel
insulinome
ovarien
pulmonaire
épithélial
indifférencié
nucléaire
germinales
somatostatine
exprimant
leucémie/lymphome
différenciation
cdi
germinale
abc
extranodal
g2
maligne
récepteurs
l'ovaire
épithéliale
fortement
indifférenciée
positive
synaptophysine
probable
chromogranine
carcinomes
métastatique
d'origine
a
hépatiques
neuroendocrine
pancréatique
rectal
réalisée
basocellulaires
icd
immunohistochimie
mammaire
métastases
