In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from Bio import Entrez
from Bio import Medline
import re
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import pandas as pd
import numpy as np


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gharbi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def search(term, rmax):
    Entrez.email = ''
    handle = Entrez.esearch(db='pubmed', # DB
                            sort='relevance',  # tri par relevance
                            retmax=rmax, # combien d'articles
                            retmode='xml', 
                            term=term) # mot clé
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

    
def parse_authors(id_list):
    h = Entrez.efetch(db='pubmed', id=id_list, rettype='medline', retmode='text')
    records = Medline.parse(h)
    authors_list = []
    for record in records:
        au = record.get('AU', '?')
        for a in au: 
            if a not in authors_list:
                authors_list.append(a)
    return authors_list

def article_date(source):
    try :
        return source[0]['Year'] + '-' + source[0]['Month'] + '-' + source[0]['Day']
    except : return np.nan
    

In [4]:
title_list = []
abst_list = []
date_list = []
term = 'grippe'
results = search(term, 100)
#   IDs :
id_list = results['IdList']
    
papers = fetch_details(id_list)
for i, paper in enumerate(papers['PubmedArticle']):
        
#       titles : 
    title = paper['MedlineCitation']['Article']['ArticleTitle']
    print("{}) {}".format(i+1, title))
    title_list.append(title)
        
#       Abstracts : 
    try:
        abst = paper['MedlineCitation']['Article']['Abstract']['AbstractText']
        abst = str(abst).strip("['']")
        abst_list.append(abst)
    except:
        abst = "null"
        abst_list.append(abst)
        
#       Dates : 
    d = article_date(paper['MedlineCitation']['Article']['ArticleDate'])
    date_list.append(d)
        
#   Authors:
auth_list = []
for id in id_list:
    auth = parse_authors(id)
    auth_list.append(auth)

1) La Grippe or Russian influenza: Mortality statistics during the 1890 Epidemic in Indiana.
2) A Mutation Network Method for Transmission Analysis of Human Influenza H3N2.
3) Long-term culture of human lung adenocarcinoma A549 cells enhances the replication of human influenza A viruses.
4) Identification of a novel antiviral micro-RNA targeting the NS1 protein of the H1N1 pandemic human influenza virus and a corresponding viral escape mutation.
5) Antigenic Change in Human Influenza A(H2N2) Viruses Detected by Using Human Plasma from Aged and Younger Adult Individuals.
6) Getting a grippe on severity: a retrospective comparison of influenza-related hospitalizations and deaths captured in reportable disease and administrative data sources in Ontario, Canada.
7) Eco-Epidemiological Evidence of the Transmission of Avian and Human Influenza A Viruses in Wild Pigs in Campeche, Mexico.
8) Predicting the short-term success of human influenza virus variants with machine learning.
9) Prevalenc

In [4]:
results

{'Count': '66180', 'RetMax': '10', 'RetStart': '0', 'IdList': ['30756469', '33022948', '31424377', '31470040', '31652870', '31088426', '32403268', '32259469', '29284132', '30389548'], 'TranslationSet': [{'From': 'grippe', 'To': '"influenza, human"[MeSH Terms] OR ("influenza"[All Fields] AND "human"[All Fields]) OR "human influenza"[All Fields] OR "grippe"[All Fields]'}], 'TranslationStack': [{'Term': '"influenza, human"[MeSH Terms]', 'Field': 'MeSH Terms', 'Count': '51500', 'Explode': 'Y'}, {'Term': '"influenza"[All Fields]', 'Field': 'All Fields', 'Count': '117525', 'Explode': 'N'}, {'Term': '"human"[All Fields]', 'Field': 'All Fields', 'Count': '3759407', 'Explode': 'N'}, 'AND', 'GROUP', 'OR', {'Term': '"human influenza"[All Fields]', 'Field': 'All Fields', 'Count': '53054', 'Explode': 'N'}, 'OR', {'Term': '"grippe"[All Fields]', 'Field': 'All Fields', 'Count': '1351', 'Explode': 'N'}, 'OR', 'GROUP'], 'QueryTranslation': '"influenza, human"[MeSH Terms] OR ("influenza"[All Fields] AND

# Pré-processing

In [5]:
def preprocessing(text):
    
    # suppression de pontuation et caracteres numériques
    text = re.sub('[^a-zA-Z]',' ', text)
    # en lettres minuscules 
    text = text.lower()
    # tokenisation : prendre chaque mot a sa case
    text = word_tokenize(text)
    # suppression des stop words
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    # Lemmatiser les mots : exemple : Screening => screen , investigated => investigate
    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(word=w, pos='v') for w in text]
    # supprimer les mots de taille inférieur à 2
    text= [i for i in text if len(i) > 2]
    # reconvertir en String
    text = ' '.join(text)
    
    return text
    
for i, abst in enumerate(abst_list):
    abst_list[i] = preprocessing(abst)

    

La réalisation du corpus médical:

In [6]:
dict_corpus = {'Article ID' : id_list, 'Titre' : title_list, 'Auteurs': auth_list, 'Date': date_list, 'Abstract' : abst_list}
df_corpus = pd.DataFrame(dict_corpus)

for i,a in enumerate(df_corpus['Auteurs']):
    df_corpus['Auteurs'][i] = ', '.join(a)

df_corpus

Unnamed: 0,Article ID,Titre,Auteurs,Date,Abstract
0,30756469,La Grippe or Russian influenza: Mortality stat...,Ewing ET,2019-02-12,stringelement russian influenza begin late lon...
1,33022948,A Mutation Network Method for Transmission Ana...,"Zhang C, Wang Y, Chen C, Long H, Bai J, Zeng J...",2020-10-03,characterize spatial transmission pattern crit...
2,31424377,Long-term culture of human lung adenocarcinoma...,"Ujie M, Takada K, Kiso M, Sakai-Tagawa Y, Ito ...",,long term culture human lung adenocarcinoma ce...
3,31470040,Identification of a novel antiviral micro-RNA ...,"Bavagnoli L, Campanini G, Forte M, Ceccotti G,...",2019-08-27,influenza virus iav protein one major regulato...
4,31652870,Antigenic Change in Human Influenza A(H2N2) Vi...,"Matsuzawa Y, Iwatsuki-Horimoto K, Nishimoto Y,...",2019-10-23,human influenza viruses emerge replace viruses...
5,31088426,Getting a grippe on severity: a retrospective ...,"Hobbs JL, Whelan M, Winter AL, Murti M, Hohena...",2019-05-14,stringelement since ontario reportable disease...
6,32403268,Eco-Epidemiological Evidence of the Transmissi...,"Maya-Badillo BA, Ojeda-Flores R, Chaves A, Rev...",2020-05-11,influenza zoonosis cause various influenza vir...
7,32259469,Predicting the short-term success of human inf...,"Hayati M, Biller P, Colijn C",2020-04-08,seasonal influenza viruses constantly change p...
8,29284132,Prevalence of human influenza virus in Iran: E...,"Mozhgani SH, Zarei Ghobadi M, Moeini S, Pakzad...",2017-12-25,systematic review meta analysis conduct consol...
9,30389548,Incidence of antiviral drug resistance markers...,"Moasser E, Moasser A, Zaraket H",2018-10-30,stringelement two class antiviral drug availab...


In [7]:
df_corpus.to_csv(r'corpus.csv',index = False,header=True)


# L'extaction des sous thématique : 

In [8]:
df_corpus

Unnamed: 0,Article ID,Titre,Auteurs,Date,Abstract
0,30756469,La Grippe or Russian influenza: Mortality stat...,Ewing ET,2019-02-12,stringelement russian influenza begin late lon...
1,33022948,A Mutation Network Method for Transmission Ana...,"Zhang C, Wang Y, Chen C, Long H, Bai J, Zeng J...",2020-10-03,characterize spatial transmission pattern crit...
2,31424377,Long-term culture of human lung adenocarcinoma...,"Ujie M, Takada K, Kiso M, Sakai-Tagawa Y, Ito ...",,long term culture human lung adenocarcinoma ce...
3,31470040,Identification of a novel antiviral micro-RNA ...,"Bavagnoli L, Campanini G, Forte M, Ceccotti G,...",2019-08-27,influenza virus iav protein one major regulato...
4,31652870,Antigenic Change in Human Influenza A(H2N2) Vi...,"Matsuzawa Y, Iwatsuki-Horimoto K, Nishimoto Y,...",2019-10-23,human influenza viruses emerge replace viruses...
5,31088426,Getting a grippe on severity: a retrospective ...,"Hobbs JL, Whelan M, Winter AL, Murti M, Hohena...",2019-05-14,stringelement since ontario reportable disease...
6,32403268,Eco-Epidemiological Evidence of the Transmissi...,"Maya-Badillo BA, Ojeda-Flores R, Chaves A, Rev...",2020-05-11,influenza zoonosis cause various influenza vir...
7,32259469,Predicting the short-term success of human inf...,"Hayati M, Biller P, Colijn C",2020-04-08,seasonal influenza viruses constantly change p...
8,29284132,Prevalence of human influenza virus in Iran: E...,"Mozhgani SH, Zarei Ghobadi M, Moeini S, Pakzad...",2017-12-25,systematic review meta analysis conduct consol...
9,30389548,Incidence of antiviral drug resistance markers...,"Moasser E, Moasser A, Zaraket H",2018-10-30,stringelement two class antiviral drug availab...


In [None]:
import dash
from dash.dependencies import Output, Input
import dash_html_components as html
import dash_core_components as dcc

app = dash.Dash(prevent_initial_callbacks=True)
app.layout = html.Div(
    [
        html.Button("Download Excel", id="btn_xlxs"),
        dcc.Download(id="download-dataframe-xlxs"),
    ]
)

import pandas as pd
import xlsxwriter

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 1, 5, 6], "c": ["x", "x", "y", "y"]})


@app.callback(
    Output("download-dataframe-xlxs", "data"),
    Input("btn_xlxs", "n_clicks"),
    prevent_initial_call=True,
)
def func(n_clicks):
    return dcc.send_data_frame(df.to_excel, "mydf.xlxs", sheet_name="Sheet_name_1")


if __name__ == "__main__":
    app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [04/May/2021 04:19:36] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [04/May/2021 04:19:36] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [04/May/2021 04:19:36] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [04/May/2021 04:19:41] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
