1. Prétraitement du dataset

Téléchargement du dataset contenant les balises <AbstractText> segmentée en phrases

In [4]:
from pymed import PubMed
pubmed = PubMed(tool="MyTool", email="myemail@example.com")
results = pubmed.query("diabetis", max_results=15)
sentences=[]#liste des phrases
corpus=' '
for article in results:
    sentences.append(article.title)#chargement des données du dataset dans la liste créee
for i in sentences:
    corpus=corpus + ''.join(i)#Conversion de la liste des phrases en chaine de caractères

In [5]:
corpus

' Referable Diabetic Retinopathy Prediction Algorithm Applied to a Population of 120,389 Type 2 Diabetics over 11 Years Follow-Up.Severe hypoglycemia and hypoglycemia awareness are associated with preclinical atherosclerosis in patients with type 1 diabetes without an estimated high cardiovascular risk.Adherence to an energy-restricted Mediterranean diet is associated with the presence and burden of carotid atherosclerosis in people with type 1 diabetes.Survivin/BIRC5 as a novel molecular effector at the crossroads of glucose metabolism and radioresistance in head and neck squamous cell carcinoma.Midnight Cortisol is Associated with Changes in Systolic Blood Pressure and Diabetic Neuropathy in Subjects with Type\xa01 Diabetes Undergoing Simultaneous Kidney-Pancreas Transplantation.Rapid Reduction of HbA1c and Early Worsening of Diabetic Retinopathy: A Real-world Population-Based Study in Subjects With Type 2 Diabetes.Nuclear Magnetic Resonance-Based Lipidomics in the Assessment of Card

2. Part of Speech Tagging

In [3]:
import pandas as pd
from collections import Counter
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

def compute_unigram_probabilities(corpus):
    words = [word.lower() for sentence in corpus for word in sentence.split()]
    total_words = len(words)
    unigram_frequencies = Counter(words)
    unigram_probabilities = {word: freq / total_words for word, freq in unigram_frequencies.items()}
    unigram_df = pd.DataFrame(list(unigram_probabilities.items()), columns=['Unigram', 'Probability'])
    return unigram_df

def compute_bigram_probabilities(corpus):
    words = [word.lower() for sentence in corpus for word in sentence.split()]
    total_bigrams = len(words) - 1
    bigrams = [(words[i], words[i + 1]) for i in range(total_bigrams)]
    bigram_frequencies = Counter(bigrams)
    bigram_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_frequencies.items()}
    bigram_df = pd.DataFrame(list(bigram_probabilities.items()), columns=['Bigram', 'Probability'])
    return bigram_df

# Compute unigram probabilities
unigram_df = compute_unigram_probabilities(sentences)
print("Unigram Probabilities:")
print(unigram_df)
# Compute bigram probabilities
bigram_df = compute_bigram_probabilities(sentences)
print("\nBigram Probabilities:")
print(bigram_df)

Unigram Probabilities:
           Unigram  Probability
0        referable     0.003846
1         diabetic     0.019231
2      retinopathy     0.007692
3       prediction     0.003846
4        algorithm     0.003846
..             ...          ...
158      diagnosis     0.003846
159        thyroid     0.007692
160         cancer     0.003846
161  indeterminate     0.003846
162       nodules.     0.003846

[163 rows x 2 columns]

Bigram Probabilities:
                        Bigram  Probability
0        (referable, diabetic)     0.003861
1      (diabetic, retinopathy)     0.007722
2    (retinopathy, prediction)     0.003861
3      (prediction, algorithm)     0.003861
4         (algorithm, applied)     0.003861
..                         ...          ...
225          (thyroid, cancer)     0.003861
226               (cancer, in)     0.003861
227        (in, indeterminate)     0.003861
228   (indeterminate, thyroid)     0.003861
229        (thyroid, nodules.)     0.003861

[230 rows x 2 col

In [4]:
# Get POS tags for each word in the sentences
text_words=word_tokenize(corpus)
tagged_words=pos_tag(text_words)
print(tagged_words)

[('Referable', 'JJ'), ('Diabetic', 'NNP'), ('Retinopathy', 'NNP'), ('Prediction', 'NNP'), ('Algorithm', 'NNP'), ('Applied', 'NNP'), ('to', 'TO'), ('a', 'DT'), ('Population', 'NN'), ('of', 'IN'), ('120,389', 'CD'), ('Type', 'NNP'), ('2', 'CD'), ('Diabetics', 'NNPS'), ('over', 'IN'), ('11', 'CD'), ('Years', 'NNS'), ('Follow-Up.Severe', 'JJ'), ('hypoglycemia', 'NN'), ('and', 'CC'), ('hypoglycemia', 'NN'), ('awareness', 'NN'), ('are', 'VBP'), ('associated', 'VBN'), ('with', 'IN'), ('preclinical', 'JJ'), ('atherosclerosis', 'NN'), ('in', 'IN'), ('patients', 'NNS'), ('with', 'IN'), ('type', 'JJ'), ('1', 'CD'), ('diabetes', 'NNS'), ('without', 'IN'), ('an', 'DT'), ('estimated', 'VBN'), ('high', 'JJ'), ('cardiovascular', 'JJ'), ('risk.Adherence', 'NN'), ('to', 'TO'), ('an', 'DT'), ('energy-restricted', 'JJ'), ('Mediterranean', 'NNP'), ('diet', 'NN'), ('is', 'VBZ'), ('associated', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('presence', 'NN'), ('and', 'CC'), ('burden', 'NN'), ('of', 'IN'), ('carotid

3. Named Entity Recognition
Implement NER tagging using transition matrices.

In [None]:
import pandas as pd
import spacy
import requests
from bs4 import BeautifulSoup

nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 200)

doc = nlp(corpus)

#visualizing the entities in the sentence using the wonderfully-named displacy inside of spaCy.
from spacy import displacy
displacy.render(doc, style="ent")

#visualizing the entities in the sentence using a DataFrame.
entities = [(ent.text, ent.label_, ent.lemma_) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['text', 'type', 'lemma'])
df

In [None]:
doc = nlp(corpus)
df[df.type == 'ORG'].lemma.value_counts()

In [None]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes.target[:3]

4. Topics Modeling

In [7]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import numpy as np
import nltk
import os
from sklearn import datasets
# alternative writing from sklearn.datasets import fetch_20newsgroups
# we declare certain categories on which we will do our training
categories = ['sci.med','sci.space','sci.electronics']
# our training model
ng_train = datasets.fetch_20newsgroups(subset='train', 
                                       categories=categories, 
                                       remove=('headers', 
                                               'footers', 'quotes'))

In [112]:
# Data verification
print(ng_train.data[2])
print("++\n", ng_train.data[1504])
print("++\n", ng_train.data[1000])
print("\n\nData has {0:d} documents". format(len(ng_train.data)))

I am looking for current sources for lists of all the home
medical tests currently legally available.
I believe this trend of allowing tests at home where
feasible, decreased medical costs by a factor of 10 or
more and allows the patient some time and privacy to
consider the best action from the results of such tests.
In fact I believe home medical tests and certain basic
tests for serious diseases such as cancer, heart disease,
should be offered free to the American public.
This could actually help to reduce national medical costs
since many would have an earlier opportunity to know
about and work toward recuperation or cure.
Mike Romano


++
 [ Article crossposted from comp.windows.ms ]
[ Author was Kevin Routh ]
[ Posted on 19 Apr 1993 12:35:55 GMT ]

For your information:

I hooked up my ImageWriter I to my COM1 serial port and used the C-Itoh
8510 driver in Windows 3.1.  The cable I am using is a straight-thru
cable connected to a Null Modem Adapter I got at Radio Shack (catalog
#

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
#Pre-process all the words in the document, including deleting empty words.
#Render all text in lower case.
# the countvectorizer allows us to carry out several pre-processing operations at once
count_vectorizer = CountVectorizer(ngram_range=(1,2),
                                   stop_words='english',
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_features=1000)
X  = count_vectorizer.fit_transform(ng_train.data) # X is now our transformed data

In [9]:
#Using sklearn, create an LDA model with 3 subjects
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation (3, random_state=42, learning_method='online')
data = lda.fit_transform(X)
print(data[0])
print(categories)

[0.50943573 0.0461408  0.44442347]
['sci.med', 'sci.space', 'sci.electronics']


In [10]:
print(ng_train.data[0]) # This document is 50% topic 3!  according to statitics but the content of this document shows us something else

Another fish to check out is Richard Rast -- he works
for Lockheed Missiles, but is on-site at NASA Johnson.

Nick Johnson at Kaman Sciences in Colo. Spgs and his
friend, Darren McKnight at Kaman in Alexandria, VA.

Good luck.

R. Landis


5. Translation

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pre-trained MarianMT model and tokenizer for English to French
model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate(sentences, target_language='fr'):
    translated = []
    for sentence in sentences:
        translated_text = model.generate(**tokenizer(sentence, return_tensors="pt", padding=True))
        translated.append(tokenizer.decode(translated_text[0], skip_special_tokens=True))
    return translated

# Example usage
for translation in translations:
    print(translation)

Front-end: ----------------------voir le fichier app.py---------------------------------