In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be opened by path.
# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import textwrap
import numpy as np
import pandas as pd

import google.generativeai as genai
import google.ai.generativelanguage as glm

# Used to securely store your API key
from google.colab import userdata

from IPython.display import Markdown

In [3]:
# In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name GOOGLE_API_KEY.

# Once you have the API key, pass it to the SDK. You can do this in two ways:
#   1. Put the key in the GOOGLE_API_KEY environment variable (the SDK will automatically pick it up from there).
#   2. Pass the key to genai.configure(api_key=...)
# This cell uses the second way: the secret is read from Colab's userdata store and passed explicitly.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Create a handle to the 'gemini-pro' generative model.
model = genai.GenerativeModel('gemini-pro')
# Start a chat session on that model with an empty history.
chat = model.start_chat(history=[])

In [5]:
def split_document(file_path, chunk_size_bytes=2000):
    """
    Splits the content of the specified file into smaller chunks based on a specified byte size.

    The file is read line by line. Newline characters are removed and consecutive lines are
    joined with a single space. A new chunk is started whenever appending the next line
    (including the joining space) would push the chunk past chunk_size_bytes of UTF-8.
    Lines are never split, so a single line longer than chunk_size_bytes becomes a chunk
    of its own that exceeds the limit. Empty or whitespace-only chunks are never emitted.

    @param file_path: The path of the file to be split.
    @param chunk_size_bytes: The maximum size of each chunk in bytes. Default is 2000 bytes.

    @return: A list of dictionaries, each representing a document segment with a title
             ("seg1", "seg2", ...) and content.
    """
    documents = []
    current_chunk = ""
    # UTF-8 size of current_chunk, tracked incrementally so the growing chunk
    # does not have to be re-encoded for every line.
    current_bytes = 0

    def add_segment(content):
        # Titles are numbered by position so they stay consecutive even when
        # an empty chunk is skipped.
        if content.strip():
            documents.append({
                "title": f"seg{len(documents) + 1}",
                "content": content
            })

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip newline characters from the line
            stripped_line = line.strip('\n')
            line_bytes = len(stripped_line.encode('utf-8'))

            if not current_chunk:
                # Nothing accumulated yet: start the chunk with this line, even if it is
                # oversized, rather than emitting an empty segment first.
                current_chunk = stripped_line
                current_bytes = line_bytes
            elif current_bytes + 1 + line_bytes > chunk_size_bytes:
                # The "+ 1" accounts for the joining space, so chunks respect the limit.
                add_segment(current_chunk)
                current_chunk = stripped_line
                current_bytes = line_bytes
            else:
                # Add a space when appending the stripped line to maintain separation between words
                current_chunk += ' ' + stripped_line
                current_bytes += 1 + line_bytes

    add_segment(current_chunk)

    return documents

# Path to your document
# @param {string} - your drive path to 2023-12-15_abstract-38041119-set.txt
# NOTE(review): this is a relative path, so it resolves against the current working
# directory -- confirm that directory is the parent of the mounted "drive" folder.
file_path = "drive/MyDrive/Colab/src/2023-12-15_abstract-38041119-set.txt"

# Split the document
split_docs = split_document(file_path)

# Select the segment fields by key and rename them explicitly, instead of relabelling
# columns by position: this does not depend on dict key order and still yields the
# expected 'Title' / 'Text' columns when no segments were produced.
df = pd.DataFrame(split_docs, columns=["title", "content"]).rename(
    columns={"title": "Title", "content": "Text"}
)
df


Unnamed: 0,Title,Text
0,seg1,1. Diabetol Metab Syndr. 2023 Dec 2;15(1):247....
1,seg2,kang.lin@szhospital.com. (9)Guangdong Provinci...
2,seg3,a significantly higher risk of all-cause morta...
3,seg4,length between blood cadmium levels and cognit...
4,seg5,BACKGROUND: Due to the ongoing Coronavirus dis...
...,...,...
201,seg202,"saturated fatty acids, monounsaturated fatty a..."
202,seg203,partially mediated by smoking and body mass in...
203,seg204,"association between LTL and body mass index, w..."
204,seg205,"associations, which were noted for all macronu..."


In [6]:
# Get the embeddings of each text and add to an embeddings column in the dataframe
def embed_fn(title, text):
  """
  Embeds one document chunk with the "models/embedding-001" model.

  @param title: The title sent along with the document.
  @param text: The content to embed.

  @return: The value stored under the "embedding" key of the API response.
  """
  response = genai.embed_content(
      model="models/embedding-001",
      content=text,
      task_type="retrieval_document",
      title=title,
  )
  return response["embedding"]

# Embed every row from its title and text, and store the result in an 'Embeddings' column.
df['Embeddings'] = df.apply(
    lambda row: embed_fn(title=row['Title'], text=row['Text']),
    axis=1,
)
df

Unnamed: 0,Title,Text,Embeddings
0,seg1,1. Diabetol Metab Syndr. 2023 Dec 2;15(1):247....,"[0.004888691, 0.016013522, -0.047940016, -0.00..."
1,seg2,kang.lin@szhospital.com. (9)Guangdong Provinci...,"[0.012020993, 0.012111678, -0.04627277, 0.0280..."
2,seg3,a significantly higher risk of all-cause morta...,"[0.04235799, -0.032533072, -0.059396077, 0.004..."
3,seg4,length between blood cadmium levels and cognit...,"[-0.002676987, -0.023281487, -0.047593344, -0...."
4,seg5,BACKGROUND: Due to the ongoing Coronavirus dis...,"[-0.007291612, -0.004329588, -0.028834466, 0.0..."
...,...,...,...
201,seg202,"saturated fatty acids, monounsaturated fatty a...","[0.041224774, -0.01732042, -0.029787604, 0.023..."
202,seg203,partially mediated by smoking and body mass in...,"[0.005650221, 0.025418088, -0.03435212, 0.0121..."
203,seg204,"association between LTL and body mass index, w...","[-0.01114577, 0.020348353, -0.036298294, 0.032..."
204,seg205,"associations, which were noted for all macronu...","[-0.023179742, 0.01785586, -0.03589972, 0.0021..."


In [7]:
# @param {string} query - query to ask the api
query = """Will cadmium exposure increase the risk of cognitive impairment?"""
# NOTE: this rebinds the global name `model` (previously a GenerativeModel instance)
# to the embedding model id, which is a plain string.
model = 'models/embedding-001'

# Embed the query using the "retrieval_query" task type; the response is kept in `request`.
request = genai.embed_content(model=model,
                              content=query,
                              task_type="retrieval_query")

In [8]:
def find_best_passage(query, dataframe, embedding_model='models/embedding-001'):
  """
  Finds the most relevant passage from a dataframe for a given query.

  The query is embedded with the "retrieval_query" task type, the dot product between
  the query embedding and every document embedding is computed, and the document with
  the highest dot product is considered the most relevant.

  @param query: The query string for which the relevant passage is to be found.
  @param dataframe: A Pandas dataframe containing the documents. This dataframe must include a column named 'Embeddings' with precomputed embeddings for each document, and a column named 'Text' with the document text.
  @param embedding_model: The id of the embedding model used for the query. It is passed
                          explicitly instead of being read from the global `model`, because
                          that name is also bound to a GenerativeModel elsewhere in this notebook.

  @return: The text of the most relevant document based on the query.
  @raise ValueError: If the dataframe contains no documents.
  """
  # Fail fast, before spending an API call, when there is nothing to search.
  if len(dataframe) == 0:
    raise ValueError("dataframe contains no documents to search")

  query_embedding = genai.embed_content(model=embedding_model,
                                        content=query,
                                        task_type="retrieval_query")
  # Stack the per-document vectors into one matrix: one dot product per row.
  dot_products = np.dot(np.stack(dataframe['Embeddings']), query_embedding["embedding"])
  idx = np.argmax(dot_products)
  return dataframe.iloc[idx]['Text'] # Return text from index with max value

In [9]:
# Retrieve the text of the document whose embedding scores highest against the query.
passage = find_best_passage(query, df)
passage

'length between blood cadmium levels and cognitive function among older adults in  the United States. METHODS: Using data from the National Health and Nutrition Examination Survey  (NHANES) 1999-2002. Cadmium exposure level was assessed by measuring cadmium  levels in blood samples. Leukocyte telomere length was measured by quantitative  polymerase chain reaction, and cognitive function was measured by the digit  symbol substitution test (DSST). RESULTS: A total of 2185 older adults aged over 60 were included in this study,  comprising 1109 (49.65%) males. Elevated blood cadmium levels were significantly  associated with the risk of a decline in cognitive function (β\xa0=\xa0- 2.842,  p\xa0=\xa00.018). Shorter leukocyte telomere lengths were significantly associated  with a higher risk of a decline in cognitive function (β\xa0=\xa04.144, p\xa0=\xa00.020).  The total indirect effect on the blood cadmium level and cognitive function via  leukocyte telomere length was - 0.218 (p\xa0=\xa00

In [10]:
def make_prompt(query, relevant_passage):
  """
  Builds the prompt that asks the model to answer a question from a reference passage.

  Single quotes, double quotes and newlines are removed from the passage so that it can be
  embedded between the quote delimiters of the PASSAGE line on a single line.

  The template is assembled from adjacent string literals rather than a dedented
  triple-quoted block: the previous textwrap.dedent call removed nothing because the first
  line had no indentation, so the instruction sentences were separated by runs of spaces
  and the QUESTION / PASSAGE / ANSWER lines carried stray leading whitespace.

  @param query: The question to answer.
  @param relevant_passage: The reference passage the answer should be based on.

  @return: The prompt string, ending with an "ANSWER:" line.
  """
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  template = (
      "You are a helpful and informative bot that answers questions using text from the reference passage included below. "
      "Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. "
      "You may look specifically into each CONCLUSION. "
      "However, you are talking to a technical audience, so be sure to be professional. "
      "If the passage is irrelevant to the answer, you may ignore it or simply answer I don't know.\n"
      "QUESTION: '{query}'\n"
      "PASSAGE: '{relevant_passage}'\n"
      "\n"
      "ANSWER:\n"
  )
  # The values are substituted after the template is built, so braces inside the
  # query or passage are inserted verbatim.
  prompt = template.format(query=query, relevant_passage=escaped)

  return prompt

In [11]:
# Build the prompt from the query and the retrieved passage.
prompt = make_prompt(query, passage)
# Rebind `model` to the 'gemini-pro' generative model and generate a response for the prompt.
model = genai.GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
# Render the response text as Markdown in the cell output.
Markdown(answer.text)


Yes, according to the study's findings, cadmium exposure may increase the risk of cognitive impairment by causing shortened leukocyte telomere length.