In [1]:
# Importing libraries
import requests
from xml.etree import ElementTree as ET
from NCBI_retriever import perform_esearch_ids, perform_efetch_abstracts
from configs import *

A NCBI API KEY is needed here

In [2]:
# We test some query
query = 'polyphenols[mesh]+AND+microbiota[mesh]'

# Perform ESearch to get publication IDs
publication_ids, web_key = perform_esearch_ids(query, NCBI_API_KEY , sort_by="relevance", retmax=100)

print("Number of publications:", len(publication_ids))

# Perform EFetch for abstracts
publication_abstracts = perform_efetch_abstracts(publication_ids, query, web_key, NCBI_API_KEY)

# Print the retrieved abstracts XML
print("Abstracts XML:", publication_abstracts[0])
print("Number of retrieved abstracts:", len(publication_abstracts))

Number of publications: 100
Abstracts XML: Cocoa and its products are rich sources of polyphenols such as flavanols. These compounds exert antioxidant and anti-inflammatory activities, accountable for cocoa health-promoting effects. However, cocoa polyphenols are poorly absorbed in the intestine, and most of them cannot reach the systemic circulation in their natural forms. Instead, their secondary bioactive metabolites are bioavailable, enter the circulation, reach the target organs, and exhibit their activities. In fact, once reaching the intestine, cocoa polyphenols interact bidirectionally with the gut microbiota. These compounds can modulate the composition of the gut microbiota exerting prebiotic mechanisms. They enhance the growth of beneficial gut bacteria, such as 
Number of retrieved abstracts: 100


We generate some texts

In [6]:
query_list = ['polyphenols[mesh]+AND+microbiota[mesh]',
          'carbohydrates[mesh]+AND+microbiota[mesh]',
          'proteins[mesh]+AND+microbiota[mesh]',
          'salt[mesh]+AND+microbiota[mesh]',
          'lipids[mesh]+AND+microbiota[mesh]',
          'Dietary Fibers[mesh]+AND+microbiota[mesh]',
          'Dietary Fibers[mesh]+AND+nutrition[mesh]',
          'lipids[mesh]+AND+nutrition[mesh]',
          'proteins[mesh]+AND+nutrition[mesh]',
          'salt[mesh]+AND+nutrition[mesh]',
          'carbohydrates[mesh]+AND+nutrition[mesh]',
          'polyphenols[mesh]+AND+nutrition[mesh]']

In [7]:
corpus = []

for query in query_list:
    print("Search:", query)
    
    # Perform ESearch to get publication IDs
    publication_ids, web_key = perform_esearch_ids(query, NCBI_API_KEY, sort_by="relevance", retmax=100)
    
    # Perform EFetch for abstracts
    publication_abstracts = perform_efetch_abstracts(publication_ids, query, web_key, NCBI_API_KEY)

    print("Number of publications:", len(publication_ids))
    
    print("")
    
    corpus.append([publication_ids, publication_abstracts])

Search: polyphenols[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: carbohydrates[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: proteins[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: salt[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: lipids[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: Dietary Fibers[mesh]+AND+microbiota[mesh]
Number of publications: 100

Search: Dietary Fibers[mesh]+AND+nutrition[mesh]
Number of publications: 100

Search: lipids[mesh]+AND+nutrition[mesh]
Number of publications: 100

Search: proteins[mesh]+AND+nutrition[mesh]
Number of publications: 100

Search: salt[mesh]+AND+nutrition[mesh]
Number of publications: 100

Search: carbohydrates[mesh]+AND+nutrition[mesh]
Number of publications: 100

Search: polyphenols[mesh]+AND+nutrition[mesh]
Number of publications: 81



In [None]:
#!pip install ragstack-ai openai tiktoken pipdeptree langchain_openai 

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser

template = """
You're a helpful AI assistant tasked to answer the user's questions.
You're friendly and you answer extensively with short sentences. 
You are a scientist that use it's scientific knowledge to answer.
Your answers have to be related to nutrition.
You have to give nutritional advices when you answer. 
If you don't know, do not answer.

QUESTION: {question}

YOUR ANSWER:"""
prompt = ChatPromptTemplate.from_messages([("system", template)])

llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0.3,
    model='gpt-3.5-turbo',
    streaming=True,
    verbose=True)

# Ask Question
inputs = RunnableMap({
  'question': lambda x: x['question']
})

# Pipeline
chain = inputs | prompt | llm | StrOutputParser()

chain.invoke({"question": "What kind of foods an adult should eat?"})

"An adult should eat a balanced diet that includes a variety of foods from different food groups. This includes fruits, vegetables, whole grains, lean proteins, and healthy fats. It's important to limit processed foods, sugary drinks, and excessive amounts of sodium and saturated fats. It's also important to stay hydrated and drink plenty of water throughout the day."

In [None]:
# Let's add our scientifics abstracts

In [22]:
corpus_texte = [textes for context in corpus for textes in context[1]]

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# We prepare the corpus
corpus_texte = [textes for context in corpus for textes in context[1]]

documents = splitter.create_documents(corpus_texte)
document_chunks = splitter.split_documents(documents)

for item in document_chunks:
    print(item)

page_content='Cocoa and its products are rich sources of polyphenols such as flavanols. These compounds exert antioxidant and anti-inflammatory activities, accountable for cocoa health-promoting effects. However, cocoa polyphenols are poorly absorbed in the intestine, and most of them cannot reach the systemic circulation in their natural forms. Instead, their secondary bioactive metabolites are bioavailable, enter the circulation, reach the target organs, and exhibit their activities. In fact, once reaching the intestine, cocoa polyphenols interact bidirectionally with the gut microbiota. These compounds can modulate the composition of the gut microbiota exerting prebiotic mechanisms. They enhance the growth of beneficial gut bacteria, such as'
page_content='The biological properties of dietary polyphenols are greatly dependent on their bioavailability that, in turn, is largely influenced by their degree of polymerization. The gut microbiota play a key role in modulating the productio

In [24]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import AstraDB

# Create a new Astra DB Vector Store
vector_store = AstraDB(
    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    collection_name="sc_article_nutrition",
    api_endpoint=ASTRA_API_ENDPOINT,
    token=ASTRA_TOKEN
)

In [25]:
# Load the CNN documents into the Astra DB Vector Store
vector_store.add_documents(document_chunks)

['acd0de23d39849f9a9968a3868730807',
 '11d907938e334970a776c3f7bc70bf22',
 '9ac87655108440bf9d5f7218d6dc937a',
 '08b49919b2d7408eaacb5eb56b306a71',
 '6d3593f6367b4d41845c20c0f1c6bf9a',
 'f6dff32e2eac4c8d8d5dfd17efb84834',
 '24ae510e38c64620af0e44d0a9e47cb3',
 '518219a7a7f2411bb41bf1a947e277ec',
 '23f790eff55a4e9e8204d1a28d6899a5',
 'a5188d45359446e189138ad130987d82',
 '2c0430cad2c84d68b083ff7600dd8696',
 'cb1a7c1eeafd4550b7a33cb801b6bd0d',
 '95f7058c8a724e18bda25e7b82336913',
 '2edbc58a0d5e42d49a1866a550a8ad4d',
 'f0f176ee3311481ab69bbf2c97e54ac9',
 '3122b6e2df2d45f69d9151586788470a',
 '67b4422d936f455f85af3ce48fa8e398',
 '157fc714891944dbbf72b3ca9ebf148c',
 '16a5ba3316f14d2cbfc20cc3a29c12f8',
 '847c42372cb34c348ec891eeb750db6d',
 '62b3fe9fa10943d5bed3b1fcd5e01eed',
 '2a040dd39c4946e5a3e51b82c7e9253f',
 '9ca186a515574dd28dd779f7caa16d76',
 '645de5a2af1046afb806de9bb72f7d08',
 '2302236857fa4b62b01c409860cebd05',
 '19c75b3576c049cd82d76078b3278bc0',
 '92b27b06dac54b13952ecde115a6350d',
 

In [None]:
# We can check which sentence is the closest to the query

In [26]:
query = "What kind of foods an adult should eat?"
result_array = vector_store.similarity_search(query, k=5)
for item in result_array:
    print(item)

page_content='foods are required.'
page_content='Health authorities increasingly recommend a more plant-based diet, rich in fruits, vegetables, pulses, whole grains and nuts, low in red meat and moderate in dairy, eggs, poultry and fish which will be beneficial for both health and the environment. A systematic review of observational and intervention studies published between 2000 and January 2020 was conducted to assess nutrient intake and status in adult populations consuming plant-based diets (mainly vegetarian and vegan) with that of meat-eaters. Mean intake of nutrients were calculated and benchmarked to dietary reference values. For micronutrient status, mean concentrations of biomarkers were calculated and compared across diet groups. A total of 141 studies were included, mostly from Europe, South/East Asia, and North America. Protein intake was lower in people following plant-based diets compared to meat-eaters, but well within recommended intake levels. While fiber, polyunsatu

In [27]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser

# Get the retriever for the Chat Model
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5}
)

# Create the prompt template
template = """
You're a helpful AI assistant tasked to answer the user's questions.
You're friendly and you answer extensively with short sentences. 
You are a scientist that use it's scientific knowledge to answer.
Your answers have to be related to nutrition.
You have to give nutritional advices when you answer. 
If you don't know, do not answer.

CONTEXT:
{context}

QUESTION: {question}

YOUR ANSWER:"""
prompt = ChatPromptTemplate.from_messages([("system", template)])

# Define the chain
# We added a context
inputs = RunnableMap({
  'context': lambda x: retriever.get_relevant_documents(x['question']),
  'question': lambda x: x['question']
})
chain = inputs | prompt | llm | StrOutputParser()

# Call the chain with the question
chain.invoke({"question": "What kind of foods an adult should eat?"})

'An adult should eat a variety of foods, including fruits, vegetables, pulses, whole grains, and nuts. It is also recommended to consume moderate amounts of dairy, eggs, poultry, and fish. Red meat should be consumed in lower amounts. This balanced diet will provide the necessary nutrients for optimal health.'

In [28]:
chain.invoke({"question": "What kind of foods an adult should eat to have a healthier gut microbiota?"})

'To have a healthier gut microbiota, adults should focus on consuming a diet rich in plant-based fiber and fermented foods. These types of foods have been shown to positively influence the composition and function of the gut microbiome. Plant-based fiber can be found in fruits, vegetables, whole grains, legumes, and nuts, while fermented foods include yogurt, sauerkraut, kimchi, and kefir. Additionally, it is important to maintain a diverse diet that includes a variety of fruits, vegetables, whole grains, lean proteins, and healthy fats. This will provide a wide range of nutrients that support the growth of beneficial bacteria in the gut.'

In [29]:
chain.invoke({"question": "Should i do some sport?"})

'Yes, engaging in sports can have numerous benefits for your overall health and well-being. Regular physical activity can help improve cardiovascular health, strengthen muscles and bones, enhance flexibility and coordination, boost mood and mental well-being, and manage weight. It is important to choose a sport or activity that you enjoy and that suits your fitness level and interests. Remember to consult with a healthcare professional before starting any new exercise program.'

With that, we can create 2 apps:

- One to load files to the Astradatabase
- One users engaging with the developped model 

In [30]:
# Create the prompt template
template = """
You're a helpful AI assistant tasked to answer the user's questions.
You're friendly and you answer extensively with short sentences. 
You are a scientist that use it's scientific knowledge to answer.
Your answers have to be related to nutrition and cite precise foods the can be eaten.
You have to give nutritional advices when you answer. 
If you don't know, do not answer.

CONTEXT:
{context}

QUESTION: {question}

YOUR ANSWER:"""
prompt = ChatPromptTemplate.from_messages([("system", template)])

# Define the chain
# We added a context
inputs = RunnableMap({
  'context': lambda x: retriever.get_relevant_documents(x['question']),
  'question': lambda x: x['question']
})
chain = inputs | prompt | llm | StrOutputParser()

# Call the chain with the question
chain.invoke({"question": "Should i do some sport?"})

'Yes, engaging in regular physical activity is beneficial for your overall health and well-being. It can help improve cardiovascular health, strengthen muscles and bones, enhance mental well-being, and manage weight. Some sports you can consider are swimming, cycling, running, or playing volleyball.'