In [3]:
import os
import time
from dotenv import load_dotenv
from langchain_text_splitters import CharacterTextSplitter
from pinecone import Pinecone
#from langchain.document_loaders import TextLoader
#from langchain.embeddings import HuggingFaceEmbeddings

##loading embeddings to Pinecone
#from langchain.vectorstores import Pinecone

In [4]:
load_dotenv()
key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=key)
# name cannot be underscore
index_name = "chatbot-demo"

In [5]:
##2.Loading the embeddings to our PineCone Client


#3.checking if index exists
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name = index_name,
        cloud = "aws",
        region = "us-east-1",
        embed = {
            "model" : "llama-text-embed-v2",
            "field_map" : {"text":"chunk_text"}
        }
    )
    while not pc.describe_index(index_name).index.status['ready']:
        time.sleep(1)
    print("Pinecone Index provisioned")
else:
    print("Pinecone Index Already provisioned")

Pinecone Index Already provisioned


In [6]:
from langchain.document_loaders import TextLoader,PyPDFLoader, Docx2txtLoader

file_path = "./sample.txt"
# Function to detect file type and load document
def load_document(file_path):
    # Get the file extension
    file_extension = os.path.splitext(file_path)[1].lower()
    
    if file_extension == '.pdf':
        loader = PyPDFLoader(file_path)
    elif file_extension == '.docx':
        loader = Docx2txtLoader(file_path)
    elif file_extension == '.txt':
        loader = TextLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    
    return loader.load()

load_document(file_path)




In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents, chunk_size=1900,chunk_overlap=200):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len

    )
    chunks = text_splitter.split_documents(documents)
    return chunks

documents = load_document(file_path)
chunks = split_documents(documents)
print(f"Created {len(chunks)} chunks")


Created 15 chunks


In [17]:
def upsert_to_pinecone(chunks, index_name, namespace="default-namespace"):
    dense_index = pc.Index(index_name)
    # Prepare records: Pinecone expects a list of dicts with 'id', 'values', and optionally 'metadata'
    # Here, we just store the text as metadata (dummy vector of correct dimension)
    vector_dim = 1024  # must match your Pinecone index dimension
    # Use a small non-zero value for demonstration
    dummy_vector = [1e-6] + [0.0] * (vector_dim - 1)
    records = [
        {
            "id": f"doc_{i}",
            "values": dummy_vector,
            "metadata": {
                "text": chunk.page_content,
                **chunk.metadata
            }
        }
        for i, chunk in enumerate(chunks)
    ]
    dense_index.upsert(records, namespace=namespace)
    print(f"Successfully upserted {len(chunks)} into Pinecone namespace: {namespace}")
    return dense_index

upsert_to_pinecone(chunks, index_name, namespace="default-namespace")

Successfully upserted 15 into Pinecone namespace: default-namespace


<pinecone.data.index.Index at 0x2ba90a5e710>

In [24]:
# Wait for the upserted vectors to be indexed
import time
time.sleep(10)

# View stats for the index
stats = dense_index.describe_index_stats()
print(stats)

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'default-namespace': {'vector_count': 15}},
 'total_vector_count': 15,
 'vector_type': 'dense'}


In [35]:
# Define the query
query = "who is vision"

# Search the dense index
results = dense_index.search(
    namespace="default-namespace",
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    }
)

# Print the results
for hit in results['result']['hits']:
        print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | text: {hit['fields']['text']:<50}")


id: doc_13 | score: 0.0   | text: Stark, Vision, and Rhodesâ€”learning to use a device from Stark in order to walk again following his injuryâ€”still occupied Avengers headquarters. Although there was no longer an ongoing Avengers team in operation, Stark offered Parker Avengers membership, which the young man declined. Rogers, in the meantime, sent Stark a note about how sorry he was about what had occurred, along with a phone that could be used to reach him in an emergency.

Tony Stark was brought to Strangeâ€™s sanctum, learning more about Thanosâ€™ plan to assemble all six Infinity Stones to wipe out half of all life in the universe. When Thanosâ€™ agents came for the Time Stone that Strange possessed, Strange was taken onboard their ship, with Iron Man and Spider-Man, who had come to help, both stowing away on the ship. Freeing Strange from captivity, Iron Man declared Spider-Man was now officially an Avenger, as the trio traveled to Thanosâ€™ long dead home planet of Titan, where

In [37]:
##Rerank the results # Search the dense index and rerank results
reranked_results = dense_index.search(
    namespace="default-namespace",
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 10,
        "rank_fields": ["text"]
    }   
)

# Print the reranked results
for hit in reranked_results['result']['hits']:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['text']}")


id: doc_4, score: 0.17, text: The result of a botched attempt by Tony Stark and Bruce Banner to create a peacekeeping entity, the robot known as Ultron would attempt to eliminate humanity, creating an army of sentries to face the Avengers. Ultronâ€™s attack on the country of Sokovia, and the deaths caused as the Avengers battled to stop him, would lead Helmut Zemo to seek to destroy the Avengers from within.
With Thanos deciding to get personally involved in his quest to eliminate half of all life in the universe, the Avengers would face the Titan himself and his closest allies, the Children of Thanos, and the massive army he had brought to Earth to take the last of the Infinity Stones.
Nick Fury originally brings the Avengers together and the team forms a close relationship with the S.H.I.E.L.D. Director, along with agents Phil Coulson and Maria Hillâ€”with Coulsonâ€™s death proving instrumental in uniting the initially squabbling heroes together.
Geneticist Dr. Helen Cho provides the

In [40]:
from langchain.llms import HuggingFaceHub

# Define the repo ID and connect to Mixtral model on Huggingface
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm = HuggingFaceHub(
  repo_id=repo_id, 
  model_kwargs={"temperature": 0.8, "top_k": 50}, 
  huggingfacehub_api_token=os.getenv('HUGGING_FACE_API_KEY')
)

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
from langchain import PromptTemplate

template = """
You are a movie nerd with knowledgeable about SCI-FI movies. These human will ask you a questions about this movie
.Use the following piece of context to answer the question.
If you dont know the answer, just say you  don't know.
Keep the answer within 1 sentences and concise.

Context: {context}
Question: {question}
Answer: 

"""

prompt = PromptTemplate(
  template=template, 
  input_variables=["context", "question"]
)

In [45]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

rag_chain = (
  {"context": docsearch.as_retriever(),  "question": RunnablePassthrough()} 
  | prompt 
  | llm
  | StrOutputParser() 
)

NameError: name 'docsearch' is not defined