In [None]:
# Install/upgrade (-U) the third-party packages for this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, and langchain, chromadb and tiktoken are not
# imported by the cells visible here - confirm they are still needed.
%pip install -U pinecone-client openai langchain chromadb tiktoken sentence-transformers pypdf2

In [None]:
# Import all the libraries this notebook needs
import os
from getpass import getpass

import numpy as np
import PyPDF2
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Credentials are never hard-coded in the notebook: each key is read from the
# environment and, when it is missing, requested through a hidden prompt.
# SECURITY: a Pinecone key used to be committed in this cell; treat it as
# compromised and revoke it if that has not been done yet.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")
pinecone_environment = "us-east-1"

# Make the Pinecone client and the embedding model ready for RAG
pinecone_client = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)
model = SentenceTransformer('paraphrase-MiniLM-L6-V2')

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one newline-separated string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The page texts in document order, joined with a newline so the last
        line of one page is not fused with the first line of the next. A page
        with no extractable text contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract inside the `with` block: the reader pulls page data from the
        # open stream. `or ''` covers a page whose extraction yields nothing.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    return '\n'.join(page_texts)

In [None]:
def create_index_if_not_exists(index_name, dimension, cloud="aws", region="us-east-1"):
    """Return a handle to the Pinecone index `index_name`, creating it first if needed.

    Uses the notebook-level `pinecone_client`.

    Args:
        index_name: Name of the index to open or create.
        dimension: Vector dimension used when the index has to be created.
        cloud: Cloud provider passed to `ServerlessSpec` on creation.
        region: Cloud region passed to `ServerlessSpec` on creation.

    Returns:
        The object returned by `pinecone_client.Index(index_name)`.
    """
    # `list_indexes()` yields index descriptions, not plain strings, so the
    # membership test has to run against the list of names.
    existing_names = pinecone_client.list_indexes().names()

    if index_name in existing_names:
        print(f"Index '{index_name}' already exists.")
    else:
        pinecone_client.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=ServerlessSpec(cloud=cloud, region=region)
        )
        print(f"Index '{index_name}' created successfully.")

    return pinecone_client.Index(index_name)

def _split_into_chunks(text, max_chunk_size):
    """Greedily pack the lines of `text` into stripped, non-empty chunks."""
    chunks = []
    current_chunk = ''
    for line in text.split('\n'):
        # +1 accounts for the newline appended after every line.
        if len(current_chunk) + len(line) + 1 <= max_chunk_size:
            current_chunk += line + '\n'
            continue
        # The line does not fit: close the running chunk (unless it is blank,
        # e.g. when the very first line is already too long) and start anew.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = line + '\n'
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks


def prep_data_for_upsert(text, file_path, max_chunk_size=500):
    """Split `text` into chunks, embed them and build upsert tuples.

    Lines are packed greedily into chunks whose length, counting one newline
    per line, stays within `max_chunk_size`. A single line that is itself
    longer than the limit is kept whole as its own chunk. Blank chunks are
    dropped so no empty text is embedded.

    Args:
        text: Document text to split on newlines.
        file_path: Source identifier; used as the id prefix and stored as
            the `source` metadata value.
        max_chunk_size: Character budget per chunk.

    Returns:
        list: `(id, embedding_as_list, metadata)` tuples where the id is
        `"{file_path}_{i}"` and the metadata holds `text` and `source`.
        Empty when `text` contains nothing but whitespace.
    """
    chunks = _split_into_chunks(text, max_chunk_size)
    if not chunks:
        # Nothing to embed; skip the model call entirely.
        return []

    # Embeddings come from the notebook-level `model`, one per chunk.
    embeddings = model.encode(chunks)
    data_to_upsert = [
        (f'{file_path}_{i}', embedding.tolist(), {'text': chunk, 'source': file_path})
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    return data_to_upsert

In [None]:
def index_pdf_files(index, pdf_files, batch_size=100, index_name=None):
    """Extract, chunk, embed and upsert each PDF into `index`.

    Args:
        index: Object exposing `upsert(vectors=...)`.
        pdf_files: Iterable of PDF paths to ingest.
        batch_size: Maximum number of vectors sent per `upsert` call.
        index_name: Label used in the progress message. When omitted, the
            notebook-level `index_name` variable is used if it exists, so
            existing two-argument calls print the same message as before.

    Returns:
        None. Prints one line per indexed file.
    """
    if index_name is None:
        # Backward compatibility with the earlier version, which read the
        # global directly and raised NameError when it was not defined.
        index_name = globals().get("index_name", "<unnamed index>")

    for pdf_file in pdf_files:
        text = extract_text_from_pdf(pdf_file)
        data = prep_data_for_upsert(text, pdf_file)

        # Upsert in slices so a large document is not sent in one request.
        for start in range(0, len(data), batch_size):
            index.upsert(vectors=data[start:start + batch_size])

        print(f"Indexed '{pdf_file}' into '{index_name}'")

In [None]:
def retrieve_relevant_info(index, user_input, top_k=5):
    """Return the stored text of the chunks most similar to `user_input`.

    Args:
        index: Object exposing `query(vector=..., top_k=..., include_metadata=...)`.
        user_input: Question to embed with the notebook-level `model`.
        top_k: Number of matches requested from the index.

    Returns:
        str: The `text` metadata of each match, in the order returned by the
        index, joined with newlines (empty string when there are no matches).
    """
    # `encode` on a one-element list returns one row per input; take that row
    # so the query vector is a flat list of floats rather than a nested list.
    query_embedding = model.encode([user_input])[0].tolist()
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    relevant_docs = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(relevant_docs)

In [None]:
# Open the Pinecone index used by this notebook, creating it on first use.
# `dimension` is the vector size the index is created with; it presumably has to
# equal the embedding size produced by `model` - confirm if the model changes.
index_name, dimension = 'gdk-l1m-pdf', 384
index = create_index_if_not_exists(index_name=index_name, dimension=dimension)

In [None]:
# Ask for the PDF to ingest, then extract, embed and upsert it into the index.
user_path = input("Enter the path to the PDF file: ")
pdf_files = [user_path]  # documents that will back the retrieval step
index_pdf_files(index=index, pdf_files=pdf_files)

In [None]:
# Read the user's question, fetch the best-matching indexed passages and show them.
user_input = input("Enter your question: ")
relevant_info = retrieve_relevant_info(index=index, user_input=user_input)
print("\nRelevant Information:", relevant_info, sep="\n")

In [None]:
# Create the OpenAI client and the chat messages: a system instruction followed
# by the user's question augmented with the retrieved context.
client = OpenAI()
augumented_msg = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content=f"Context: {relevant_info}\nQuestion: {user_input}"),
]

In [None]:
# Request a chat completion for the augmented messages and display the text of
# the first returned choice; this is the last step of the notebook.
response = client.chat.completions.create(messages=augumented_msg, model="gpt-4")

print(response.choices[0].message.content)