In [1]:
import cohere
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import os
from dotenv import load_dotenv

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\manas\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
# Run python-dotenv's loader before reading any configuration values
load_dotenv()

# Cohere API key taken from the environment (None when COHERE_API_KEY is unset)
cohere_api_key = os.environ.get("COHERE_API_KEY")

In [3]:
# Initialize Pinecone and Cohere clients.
# The Pinecone API key is read from the environment instead of being embedded
# in the notebook, so the secret is not stored alongside the code.
# NOTE(review): a literal key used to live in this cell; treat that key as
# leaked and rotate it.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    # Fail here with a clear message rather than later inside a client call.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your environment or .env file."
    )

pc = Pinecone(
    api_key=pinecone_api_key,
    environment="us-east-1"  # Change this to your Pinecone environment
)
co = cohere.Client(cohere_api_key)

In [4]:
# Name of the Pinecone index used by this notebook
index_name = "sample-article"

# Create the index only when it is not already listed for this project
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(name=index_name, dimension=384, metric="cosine", spec=spec)

# Handle used for all later upsert/query calls
index = pc.Index(index_name)

In [5]:
# Pre-trained SentenceTransformer used to embed both document chunks and queries.
# Its vectors are stored in the index created with dimension=384, so the model's
# output size has to match that value.
embedding_model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(embedding_model_name)



In [6]:
def load_document(source_type="text", source=None, chunk_size=500):
    """
    Load a document from a local file, URL, or raw text, and split it into chunks.

    Args:
        source_type (str): The type of source ('file', 'url', 'text').
        source (str): The path to the file, URL, or raw text.
        chunk_size (int): The size of each document chunk (in characters).
            Must be a positive integer.

    Returns:
        list: A list of document chunks, each at most ``chunk_size`` characters
        long. An empty document yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``source_type`` is
            not one of 'file', 'url', 'text'.
        requests.HTTPError: If ``source_type`` is 'url' and the server
            responds with an error status.
    """
    # A negative step would make range() empty and silently return no chunks;
    # a zero step would raise a cryptic range() error. Reject both up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    if source_type == "file":
        with open(source, 'r', encoding='utf-8') as f:
            document = f.read()
    elif source_type == "url":
        # Imported lazily so 'file' and 'text' sources work without requests.
        import requests
        # Bound the wait time so an unresponsive server cannot hang the notebook.
        response = requests.get(source, timeout=30)
        # Do not index an HTTP error page as if it were the document.
        response.raise_for_status()
        document = response.text
    elif source_type == "text":
        document = source
    else:
        raise ValueError("Unsupported source_type. Choose from 'file', 'url', or 'text'.")

    # Split document into consecutive, non-overlapping chunks
    chunks = [document[i:i+chunk_size] for i in range(0, len(document), chunk_size)]
    return chunks

In [7]:
# Example usage: load the document from the local file "chunking.txt".
# source_type must be "file" here: with "text", the file name itself would be
# treated as the document content and nothing from the file would be indexed.
document_chunks = load_document(source_type="file", source="chunking.txt", chunk_size=500)

In [8]:
# 2. Embed every chunk in one call; the result is requested as a tensor
#    (one row per entry of document_chunks, in the same order)
embeddings = embedder.encode(
    document_chunks,
    convert_to_tensor=True,
)

In [9]:
# 3. Upload embeddings to Pinecone
# Vectors are sent in batches instead of one request per chunk, and each
# embedding is converted to a plain list of floats with .tolist(), which
# works regardless of the device the embedding tensor lives on.
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(document_chunks), UPSERT_BATCH_SIZE):
    batch_end = min(batch_start + UPSERT_BATCH_SIZE, len(document_chunks))
    index.upsert(
        vectors=[
            # The vector id is the chunk's position in document_chunks, which
            # is how retrieval maps a match back to its text.
            (str(i), embeddings[i].tolist())
            for i in range(batch_start, batch_end)
        ]
    )

In [10]:
# Function to retrieve relevant documents
def retrieve_relevant_chunks(query, top_k=3):
    """
    Return the document chunks most similar to a query.

    The query is embedded with the same model used for the chunks, the
    Pinecone index is searched with that embedding, and each match id is
    mapped back to its text in ``document_chunks``.

    Args:
        query (str): The natural-language query.
        top_k (int): Maximum number of chunks to return.

    Returns:
        list: The matching chunks, in the order returned by the index.
    """
    query_embedding = embedder.encode([query])[0]  # Embed the query
    # Search with the actual query embedding and the caller's top_k. No
    # namespace is given so the search hits the default namespace, which is
    # where the chunks are upserted.
    query_results = index.query(vector=query_embedding.tolist(), top_k=top_k)
    # Match ids are the stringified positions of the chunks in document_chunks.
    relevant_chunks = [document_chunks[int(match['id'])] for match in query_results['matches']]
    return relevant_chunks

In [11]:
# Cohere Generation Function
def generate_answer(question, context):
    """
    Ask the Cohere client to answer a question given supporting context.

    Args:
        question (str): The question to answer.
        context (str): Text placed ahead of the question in the prompt.

    Returns:
        str: The text of the first generation in the response.
    """
    prompt_text = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    completion = co.generate(
        model='command',  # Choose your Cohere model size
        prompt=prompt_text,
        max_tokens=100,
    )
    first_generation = completion.generations[0]
    return first_generation.text

In [12]:
# Full QA function (Retrieve + Generate)
def answer_question(question):
    """
    Answer a question by retrieving relevant chunks and generating from them.

    Args:
        question (str): The question to answer.

    Returns:
        The value returned by ``generate_answer`` for the question and the
        space-joined retrieved chunks.
    """
    # Retrieved chunks are joined with single spaces to form the prompt context
    context = " ".join(retrieve_relevant_chunks(question))
    return generate_answer(question, context)

In [13]:
# Run the QA pipeline on a couple of sample questions, printing a separator
# line between consecutive results
sample_questions = [
    "What is the chunking?",
    "What are all the types of chunking can you give them and compare each of them in a tabular manner?",
]
for position, sample_question in enumerate(sample_questions):
    if position > 0:
        print("------------------------------------------------")
    answer = answer_question(sample_question)
    print(f"Question: {sample_question}\nAnswer: {answer}")

Question: What is the chunking?
Answer:  The process of grouping individual pieces of information together in order to make them easier to memorize or retain. Chunking is a technique often used in cognitive psychology and language acquisition to improve memory and increase the amount one can remember. It works by organizing information into more easily learned groups, phrases, words, or numbers.

For example, it is easier to memorize this sequence of numbers: 1492-2006-93-15 than it is to remember these separate dates: 1492, 2006,
------------------------------------------------
Question: What are all the types of chunking can you give them and compare each of them in a tabular manner?
Answer:  Below is a comparison of the three types of chunking, outlining their defining characteristics, application in natural language processing(NLP) and language modeling (LM), along with their corresponding advantages and disadvantages. 
1. **Phonetic Chunking**:
- Definition: Phonetic chunking invo