In [None]:
%pip install psycopg2-binary pgvector

In [None]:
pip install -qU langchain_postgres

In [None]:
pip install -qU langchain-huggingface

In [22]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model object; passed as `embeddings=` to the PGVector store in the next cell.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

import os

# See docker command above to launch a postgres instance with pgvector enabled.
# The URL uses the psycopg3 driver ("postgresql+psycopg").  Set the
# PGVECTOR_CONNECTION environment variable to point at another database or to
# keep real credentials out of the notebook; the fallback is the local test DB.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://postgres:test@localhost:5432/vector_db",
)
collection_name = "my_docs"

# LangChain vector store backed by the `embeddings` model defined in the
# previous cell; metadata is stored as JSONB.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [14]:
import os
import glob
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import nltk
import psycopg2
from pgvector.psycopg2 import register_vector

In [None]:
import nltk
# Only nltk.sent_tokenize is used in this notebook (by chunk_text), so fetch just
# the Punkt sentence-tokenizer data instead of every NLTK resource.
# Newer NLTK releases load "punkt_tab", older ones load "punkt"; request both.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

In [None]:
# Show the directories NLTK searches for downloaded resources.
print(nltk.data.path)

In [None]:
# Step 1: Extract text from Markdown files and find the first link
def extract_text_from_markdown(md_path):
    """Read a Markdown file and locate the first NICE guidance link in it.

    Args:
        md_path: Path of the Markdown file, read as UTF-8.

    Returns:
        A ``(text, pdf_link)`` tuple.  ``text`` is the full file content and
        ``pdf_link`` is the first ``www.nice.org.uk/guidance/...`` match, or
        ``None`` when the text contains no such link.  If anything raises
        while reading or searching, the error is printed and whatever was
        gathered so far is returned (``""`` and ``None`` when the file could
        not be opened).
    """
    content, first_link = "", None
    try:
        with open(md_path, "r", encoding="utf-8") as handle:
            content = handle.read()
            # The trailing \b trims punctuation that follows the URL.
            found = re.search(r'\b(www\.nice\.org\.uk/guidance/\S*)\b', content)
            first_link = found.group(1) if found else None
    except Exception as e:
        print(f"Failed to extract text from {md_path}: {e}")
    return content, first_link

# Step 2: Chunk text into smaller pieces with semantic chunking and overlapping
def chunk_text(text, chunk_size=3000, overlap_ratio=0.3):
    """Split text into sentence-aligned chunks that overlap with their predecessor.

    Sentences (from ``nltk.sent_tokenize``) are packed into a chunk until
    adding the next one would exceed ``chunk_size`` words.  The chunk is then
    emitted and the next chunk is seeded with the trailing sentences of the
    previous one, up to ``int(chunk_size * overlap_ratio)`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in whitespace-separated words.  A
            single sentence longer than this is still kept whole in a chunk.
        overlap_ratio: Fraction of ``chunk_size`` that may be carried over.

    Returns:
        List of chunk strings (sentences joined by one space); empty when the
        tokenizer yields no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    overlap_size = int(chunk_size * overlap_ratio)  # overlap budget, in words

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        # Flush the current chunk when this sentence would overflow it.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))

            # Collect trailing sentences that fit in the overlap budget.
            carried = []
            carried_length = 0
            for previous in reversed(current_chunk):
                previous_length = len(previous.split())
                if carried_length + previous_length > overlap_size:
                    break
                carried.append(previous)
                carried_length += previous_length
            carried.reverse()

            # If the overlap plus this sentence still would not fit, start the
            # new chunk empty.  Without this the same chunk would be flushed
            # again and again and the loop would never advance.  An empty
            # `carried` also avoids the `lst[-0:]` pitfall, which returns the
            # whole list rather than nothing.
            if carried_length + sentence_length > chunk_size:
                carried = []
                carried_length = 0

            current_chunk = carried
            current_length = carried_length

        current_chunk.append(sentence)
        current_length += sentence_length

    # Emit whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Step 3: Create vector store with pgvector
def create_vector_store(md_folder, db_config, model_name="all-MiniLM-L6-v2"):
    """Embed every Markdown file in a folder and store the chunks in PostgreSQL.

    Each ``*.md`` file directly inside ``md_folder`` is read with
    ``extract_text_from_markdown``, split with ``chunk_text``, embedded with a
    ``SentenceTransformer`` and inserted into the ``document_embeddings``
    table (created if missing).  Inserts are committed once per file.

    Args:
        md_folder: Folder that contains the Markdown files.
        db_config: Keyword arguments for ``psycopg2.connect``.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        None.  Progress and warnings are printed.

    Raises:
        Whatever psycopg2 or the model raises; the connection is closed first.
    """
    # Look for input first so an empty folder does not pay for loading the model.
    md_files = glob.glob(os.path.join(md_folder, "*.md"))
    if not md_files:
        print(f"No Markdown files found in the folder: {md_folder}")
        return

    model = SentenceTransformer(model_name)
    # Size the vector column from the model instead of assuming 384; keep 384
    # as the fallback when the model does not report a dimension.
    embedding_dim = int(model.get_sentence_embedding_dimension() or 384)

    print("Extracting text and generating embeddings...")

    insert_sql = """
        INSERT INTO document_embeddings (md_file, pdf_link, chunk_index, chunk_text, embedding)
        VALUES (%s, %s, %s, %s, %s);
    """

    conn = psycopg2.connect(**db_config)
    try:
        with conn.cursor() as cur:
            # register_vector needs the `vector` type to exist in this database.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            conn.commit()
            register_vector(conn)

            # embedding_dim is an int, so interpolating it into the DDL is safe.
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS document_embeddings (
                    id SERIAL PRIMARY KEY,
                    md_file TEXT,
                    pdf_link TEXT,
                    chunk_index INT,
                    chunk_text TEXT,
                    embedding vector({embedding_dim})
                );
            """)
            conn.commit()

            for md_file in tqdm(md_files, desc="Processing Markdown files"):
                print(f"Processing file: {md_file}")

                text, pdf_link = extract_text_from_markdown(md_file)
                if not text.strip():
                    print(f"Warning: No text extracted from {md_file}")
                    continue

                chunks = chunk_text(text)
                if not chunks:
                    print(f"Warning: No valid chunks created for {md_file}")
                    continue

                chunk_embeddings = model.encode(chunks)
                if chunk_embeddings.size == 0:
                    print(f"Warning: No embeddings generated for {md_file}")
                    continue

                rows = [
                    (md_file, pdf_link, index, chunk, embedding.tolist())
                    for index, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings))
                ]
                cur.executemany(insert_sql, rows)
                conn.commit()
    finally:
        # Always release the connection, even when a file fails midway;
        # closing discards any uncommitted inserts.
        conn.close()

    print("Vector store created successfully in PostgreSQL with pgvector!")

# Main pipeline
if __name__ == "__main__":
    # Folder containing Markdown files.  Set MD_FOLDER to run on another
    # machine without editing the notebook; the fallback is the original path.
    md_folder = os.environ.get(
        "MD_FOLDER", "/Users/umer/Desktop/rag_nice/pdf_data_markdown_cleaned"
    )

    # PostgreSQL connection configuration.  The PG* environment variables
    # override the local test defaults so credentials need not live in source.
    db_config = {
        "dbname": os.environ.get("PGDATABASE", "vector_db"),
        "user": os.environ.get("PGUSER", "postgres"),
        "password": os.environ.get("PGPASSWORD", "test"),
        "host": os.environ.get("PGHOST", "localhost"),
        "port": int(os.environ.get("PGPORT", 5432)),
    }

    # Ensure the Markdown folder exists (and really is a folder).
    if not os.path.isdir(md_folder):
        raise ValueError(f"Markdown folder not found at {md_folder}. Please upload the folder first.")

    # Create the vector store
    create_vector_store(md_folder, db_config)

In [31]:
# Sample question.  main() and evaluation_task() in the cells shown here each
# bind their own local `query`, so this module-level value is not read by them.
# NOTE(review): "assesement" / "antrnatal" look like typos — confirm whether intentional.
query = "What is risk assesement in antrnatal care?"

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import json
import time
from sentence_transformers import SentenceTransformer
import psycopg2
from pgvector.psycopg2 import register_vector
from groq import Groq
import streamlit as st

# Step 4: Query the vector store using pgvector
def query_vector_store(query, db_config, top_k=3, model_name="all-MiniLM-L6-v2"):
    """Return the stored chunks closest to `query` in embedding space.

    Args:
        query: Question text to embed.
        db_config: Keyword arguments for ``psycopg2.connect``.
        top_k: Maximum number of rows to return.
        model_name: sentence-transformers model used to embed the query.

    Returns:
        List of dicts with keys ``md_file``, ``pdf_link``, ``chunk_index`` and
        ``chunk_text``, ordered by the ``<->`` distance to the query embedding.

    Raises:
        Whatever psycopg2 or the model raises; the connection is closed first.
    """
    model = SentenceTransformer(model_name)

    # Encode the bare string, not a one-item list: the query parameter must be
    # a flat list of floats, not a nested 2-D list.
    query_embedding = model.encode(query).tolist()

    conn = psycopg2.connect(**db_config)
    try:
        register_vector(conn)  # Register pgvector types on this connection
        with conn.cursor() as cur:
            # The explicit ::vector cast is required: the Python list arrives
            # as an SQL array, and `<->` is only defined between vectors.
            cur.execute("""
                SELECT md_file, pdf_link, chunk_index, chunk_text
                FROM document_embeddings
                ORDER BY embedding <-> %s::vector
                LIMIT %s;
            """, (query_embedding, top_k))
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    columns = ("md_file", "pdf_link", "chunk_index", "chunk_text")
    return [dict(zip(columns, row)) for row in rows]

# Step 5: Generate an answer using Groq
def generate_answer_with_groq(query, retrieved_texts, groq_api_key):
    """Ask a Groq chat model to answer `query` from the retrieved context.

    Args:
        query: The user's question.
        retrieved_texts: Context text placed in the prompt ahead of the question.
        groq_api_key: API key passed to the ``Groq`` client.

    Returns:
        The message content of the first choice in the completion response.
    """
    groq_client = Groq(api_key=groq_api_key)

    # Prompt layout: context block first, then the question.
    prompt_parts = [
        "The following texts were retrieved as context:\n\n",
        f"{retrieved_texts}\n\n",
        "Based on the context, answer the query:\n\n",
        f"{query}",
    ]
    request = {
        "messages": [{"role": "user", "content": "".join(prompt_parts)}],
        "max_tokens": 500,
        "temperature": 0.1,
        "model": "llama-3.1-8b-instant",  # Replace with your desired Groq model
    }

    completion = groq_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Streamlit UI
def main():
    """Render the Streamlit question-answering page for the NICE guidelines.

    On submit, the query is answered in two timed steps: retrieval through
    ``query_vector_store`` and generation through ``generate_answer_with_groq``.
    The retrieved context, answer, timings and reference links are shown, and
    the interaction is appended to ``st.session_state.history``, which is
    listed in the sidebar.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable
    rather than being stored in source; an error is shown when it is missing.
    Database settings default to the local test instance and can be
    overridden with the ``PG*`` environment variables.
    """
    st.title("AI ASSISTANT FOR NICE GUIDELINES")
    st.write("Welcome to the NICE Guidelines assistance")

    # Create session state for maintaining query history
    if 'history' not in st.session_state:
        st.session_state.history = []

    # Inputs: query input field and submit button
    query = st.text_input("Enter your query to get assistance:")
    submitted = st.button("Submit")

    # Secrets must never be committed to the notebook.
    groq_api_key = os.environ.get("GROQ_API_KEY")

    if submitted and query and not groq_api_key:
        st.error("GROQ_API_KEY environment variable is not set; cannot generate an answer.")
    elif submitted and query:
        # PostgreSQL connection configuration
        db_config = {
            "dbname": os.environ.get("PGDATABASE", "vector_db"),
            "user": os.environ.get("PGUSER", "postgres"),
            "password": os.environ.get("PGPASSWORD", "test"),
            "host": os.environ.get("PGHOST", "localhost"),
            "port": int(os.environ.get("PGPORT", 5432)),
        }

        # Retrieve context using the vector store (timed)
        retrieval_start_time = time.perf_counter()
        with st.spinner("Retrieving relevant documents..."):
            results = query_vector_store(query, db_config)
            retrieved_texts = "\n\n".join(res["chunk_text"] for res in results)
            pdf_links = [res["pdf_link"] for res in results]
        retrieval_elapsed_time = time.perf_counter() - retrieval_start_time

        # Show retrieved texts to the user
        st.subheader("Retrieved Context:")
        st.text_area("Context", retrieved_texts, height=300)

        # Generate an answer using Groq (timed)
        generation_start_time = time.perf_counter()
        with st.spinner("Generating answer..."):
            answer = generate_answer_with_groq(query, retrieved_texts, groq_api_key)
        generation_elapsed_time = time.perf_counter() - generation_start_time

        total_elapsed_time = retrieval_elapsed_time + generation_elapsed_time

        # Display the answer and times
        st.subheader("Answer:")
        st.write(answer)

        st.subheader("Elapsed Time:")
        st.write(f"Time for context retrieval: {retrieval_elapsed_time:.2f} seconds")
        st.write(f"Time for answer generation: {generation_elapsed_time:.2f} seconds")
        st.write(f"Total time: {total_elapsed_time:.2f} seconds")

        # Display references.  pdf_link is None for files where no link was
        # found, and several chunks may share one document, so drop empty
        # values and duplicates while keeping the ranking order.
        reference_links = [link for link in dict.fromkeys(pdf_links) if link]
        if reference_links:
            st.subheader("References:")
            for link in reference_links:
                cleaned_link = link.removeprefix("http://localhost:8501/")
                if not cleaned_link.startswith("http"):
                    cleaned_link = "https://" + cleaned_link
                st.markdown(cleaned_link)

        # Add the query and answer to history
        st.session_state.history.append({
            "query": query,
            "answer": answer,
            "retrieval_time": retrieval_elapsed_time,
            "generation_time": generation_elapsed_time,
            "total_time": total_elapsed_time
        })

    # Sidebar for interaction history
    st.sidebar.title("Interaction History")
    if st.session_state.history:
        for interaction in st.session_state.history:
            with st.sidebar.expander(f"**Question :** {interaction['query']}"):
                st.write(f"**Answer :** {interaction['answer']}")
                st.write(f"**Time for Context Retrieval:** {interaction['retrieval_time']:.2f} seconds")
                st.write(f"**Time for Answer Generation:** {interaction['generation_time']:.2f} seconds")
                st.write(f"**Total Time:** {interaction['total_time']:.2f} seconds")

# Run the Streamlit UI when this code is executed as the main program.
if __name__ == "__main__":
    main()

In [None]:
from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Hallucination, AnswerRelevance
import time
from sentence_transformers import SentenceTransformer
import psycopg2
from pgvector.psycopg2 import register_vector
from groq import Groq
from itertools import islice

import os

# Initialize Opik client
client = Opik()
dataset = client.get_dataset(name="ragg_nice")

# Commented-out ways of limiting the run to two items:
# limited_dataset = dataset[:2]
# limited_dataset = list(dataset)[:2]
# limited_dataset = dataset.sample(2)
# limited_dataset = list(islice(dataset, 2))

# PostgreSQL connection configuration.  The PG* environment variables
# override the local test defaults.
db_config = {
    "dbname": os.environ.get("PGDATABASE", "vector_db"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "test"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": int(os.environ.get("PGPORT", 5432)),
}

# The Groq API key is a secret: read it from the environment instead of
# committing it to the notebook, and fail early with a clear message.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running the evaluation.")

def query_vector_store(query, db_config, top_k=2, model_name="all-MiniLM-L6-v2"):
    """Return the text of the stored chunks closest to `query`.

    NOTE: this redefines the ``query_vector_store`` from the earlier Streamlit
    cell with a different return type (plain strings instead of dicts) and a
    different default ``top_k``.

    Args:
        query: Question text to embed.
        db_config: Keyword arguments for ``psycopg2.connect``.
        top_k: Maximum number of chunks to return.
        model_name: sentence-transformers model used to embed the query.

    Returns:
        List of ``chunk_text`` strings ordered by the ``<->`` distance to the
        query embedding.

    Raises:
        Whatever psycopg2 or the model raises; the connection is closed first.
    """
    model = SentenceTransformer(model_name)
    # Embed before connecting so no connection is held open while encoding.
    query_embedding = model.encode(query).tolist()

    conn = psycopg2.connect(**db_config)
    try:
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT md_file, pdf_link, chunk_index, chunk_text
                FROM document_embeddings
                ORDER BY embedding <-> %s::vector
                LIMIT %s;
                """, (query_embedding, top_k)
            )
            # chunk_text is the fourth selected column.
            return [row[3] for row in cur.fetchall()]
    finally:
        # Release the connection even when the query fails.
        conn.close()

def generate_answer_with_groq(query, retrieved_texts, groq_api_key):
    """Send the context and question to a Groq chat model and return its reply.

    Args:
        query: The question to answer.
        retrieved_texts: Context text placed in the prompt ahead of the question.
        groq_api_key: API key passed to the ``Groq`` client.

    Returns:
        The message content of the first choice in the completion response.
    """
    groq_client = Groq(api_key=groq_api_key)

    # Single user message: context block first, then the question.
    user_message = {
        "role": "user",
        "content": (
            "The following texts were retrieved as context:\n\n"
            + f"{retrieved_texts}\n\n"
            + "Based on the context, answer the query:\n\n"
            + f"{query}"
        ),
    }

    reply = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[user_message],
        temperature=0.1,
        max_tokens=500,
    )
    return reply.choices[0].message.content

def evaluation_task(dataset_item):
    """Run retrieval and generation for one dataset item.

    Args:
        dataset_item: Mapping with a ``'query'`` entry holding the question.

    Returns:
        Dict with ``input`` (the question), ``output`` (the generated answer)
        and ``context`` (the list returned by ``query_vector_store``).
    """
    question = dataset_item['query']
    contexts = query_vector_store(question, db_config)
    # The generator receives the chunks joined by blank lines; the returned
    # "context" keeps them as a list.
    joined_context = "\n\n".join(contexts)
    return dict(
        input=question,
        output=generate_answer_with_groq(question, joined_context, groq_api_key),
        context=contexts,
    )

# Metrics applied to each dict returned by evaluation_task.
metrics = [
    Hallucination(),
    AnswerRelevance(),
]

# Run the task over the dataset, one task thread at a time.
eval_results = evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_name="my_evaluation",
    task_threads=1,
)

print(eval_results)