In [1]:
## Load and Split PDF Documents

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Load the PDF document
loader = PyPDFLoader("m-1.pdf")
pages = loader.load_and_split()

In [4]:
# Initialize the text splitter for chunking
define_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1400,  # Size of each chunk
    chunk_overlap=180,  # Overlap between chunks
    length_function=len,
    is_separator_regex=False,
    separators=["\n\n", "\n", "."]
)

In [5]:
# Example: Split text from a specific page
chunks = define_text_splitter.split_text(pages[1].page_content)

# Check the number of chunks
print(f"Total chunks from page 1: {len(chunks)}")

Total chunks from page 1: 1


In [6]:
# Print each chunk and its length
for chunk in chunks:
    print(chunk)
    print(f"Length: {len(chunk)}")


1. Introduction
Among pregnancy-related complications, hypertensive disorders of pregnancy signifi-
cantly contribute to maternal and perinatal mortality on a global scale. In Latin America
and the Caribbean, hypertensive disorders are responsible for almost 26% of maternal
deaths, whereas in Africa and Asia they contribute to 9% of deaths [1].
Preeclampsia (PE) is one of several hypertensive disorders of pregnancy and is defined
by the International Society for the Study of Hypertension in Pregnancy (ISSHP) as gesta-
tional hypertension after 20 weeks’ gestation, accompanied by one or more of the following
Int. J. Mol. Sci. 2024 ,25, 4532. https://doi.org/10.3390/ijms25084532 https://www.mdpi.com/journal/ijms
Length: 719


In [None]:
## Initialize Pinecone and Store Embeddings
from pinecone import Pinecone, ServerlessSpec

def get_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
        encoding_format="float",
        dimensions=1536
    )
    return response.data[0].embedding


In [None]:
# Connect to Pinecone
pc = Pinecone(api_key="xxxxxxxxx")
spec = ServerlessSpec(cloud='aws', region='us-west-2')
index_name = 'preeclamsia'

# Create Pinecone index if it doesn't exist
if 'preeclamsia' not in pc.list_indexes().names():
    pc.create_index(
        name='preclamsia',
        dimension=1536,
        metric='cosine',
        spec=spec
    )

indexname = pc.Index(index_name)
print(indexname.describe_index_stats())

In [None]:
# Insert chunks into Pinecone
for item in chunks:
    chunk_id = item["chunk_id"]
    content = item["content"]
    embedding = get_embedding(content)
    indexname.upsert(
        vectors=[{
            "id": chunk_id,
            "values": embedding,
            "metadata": {"content": content}
        }]
    )
    print(f"Inserted chunk ID: {chunk_id}")

In [None]:
##Query Pinecone and GPT
from openai import OpenAI

# Initialize OpenAI client
OPENAI_API_KEY = "xxxxxxxxx"
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def query_pinecone(query):
    """Query Pinecone using the embedding of the user query."""
    query_embedding = get_embedding(query)
    results = indexname.query(
        vector=query_embedding,
        top_k=3,
        include_values=False,
        include_metadata=True
    )
    return results

In [None]:
def get_preeclampsia_info(query):
    """Fetch preeclampsia information using Pinecone and GPT-3.5."""
    pinecone_results = query_pinecone(query)

    if not pinecone_results['matches']:
        return "I couldn't find relevant information in the knowledge base."

    top_chunks = [match['metadata']['content'] for match in pinecone_results['matches']]
    top_chunks_content = "\n".join(top_chunks)

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a preeclampsia chatbot. Answer the user's question using the following knowledge:"},
            {"role": "system", "content": top_chunks_content},
            {"role": "user", "content": query}
        ]
    )

    return response.choices[0].message.content


# Chatbot Interaction- stuffing method

In [None]:
# Main loop to interact with the user
print("Hi!!! I am Clamsy. Ask me questions about PREECLAMSIA/n")
user_query = ""
while True:
    user_query = input("")
    if user_query.lower() == "quit":
        break
    
    info = get_preeclampsia_info(user_query)
    print(info)


# Chatbot interaction- map reference method

In [None]:
def get_preeclampsia_info(query):
    """Fetch preeclampsia information using Pinecone and GPT-3.5."""
    pinecone_results = query_pinecone(query)

    if not pinecone_results['matches']:
        return "I couldn't find relevant information in the knowledge base."

    top_chunks = [match['metadata']['content'] for match in pinecone_results['matches']]
    top_chunks_content = "\n".join(top_chunks)

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a preeclampsia chatbot. Answer the user's question using the following knowledge:"},
            {"role": "system", "content": top_chunks_content},
            {"role": "user", "content": query}
        ]
    )

    response.choices[0].message.content


  # Combine the LLM outputs from each chunk
  combined_response = "\n".join(responses)

  return combined_response

# Main loop to interact with the user
print("Hi!!! I am Clamsy. Ask me questions about PRECLAMSIA")
user_query = ""
while True:
  user_query = input("")
  if user_query.lower() == "quit":
    break

  info = get_preeclampsia_info(user_query)
  print(info)