In [1]:
!pip install pymupdf python-docx pinecone-client sentence-transformers transformers



In [2]:
import getpass
import os
import uuid

import docx
import fitz  # PyMuPDF
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# -------------------------------
# Configuration
# -------------------------------

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
INDEX_NAME = "rag-demo"
EMBEDDING_DIM = 384  # all-MiniLM-L6-v2 outputs 384-dim embeddings

# Pin the QA checkpoint explicitly instead of relying on the pipeline's
# implicit default, so results do not change if the library default changes.
QA_MODEL_NAME = "distilbert/distilbert-base-cased-distilled-squad"
QA_MODEL_REVISION = "564e9b5"

# Initialize models
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
qa_pipeline = pipeline(
    "question-answering",
    model=QA_MODEL_NAME,
    revision=QA_MODEL_REVISION,
)

# Initialize Pinecone
# SECURITY: never hardcode the API key in the notebook. It is read from the
# PINECONE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed. Any key that was previously committed in this
# notebook must be treated as leaked and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
if not PINECONE_API_KEY:
    raise ValueError("A Pinecone API key is required (set PINECONE_API_KEY).")
pc = Pinecone(api_key=PINECONE_API_KEY)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [3]:
# -------------------------------
# Helper Functions
# -------------------------------

def check_pinecone_connection():
    """Probe the Pinecone client by listing indexes.

    Prints the listing on success or the error on failure.

    Returns:
        bool: True if the listing call succeeded, False if any exception
        was raised while contacting Pinecone.
    """
    connected = False
    try:
        available = pc.list_indexes()
        print("Connected to Pinecone. Indexes available:", available)
        connected = True
    except Exception as err:
        # Best-effort probe: report the problem and let the caller decide.
        print("Error connecting to Pinecone:", err)
    return connected

def extract_text(file_path):
    """Return the plain text content of a .pdf, .docx or .txt file.

    The format is selected from the file extension (case-insensitive).

    Args:
        file_path: Path of the document to read.

    Returns:
        str: The extracted text. For an unrecognised extension a notice is
        printed and an empty string is returned.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        # Page texts are concatenated in page order with no separator.
        with fitz.open(file_path) as pdf:
            return "".join(page.get_text() for page in pdf)

    if suffix == '.docx':
        paragraphs = docx.Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    if suffix == '.txt':
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()

    print("Unsupported file format.")
    return ""

def chunk_text(text, chunk_size=100, overlap=0):
    """Split text into chunks of whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed because the
            text is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of trailing words of each chunk that are repeated at
            the start of the next one. Must satisfy ``0 <= overlap < chunk_size``.
            The default of 0 produces disjoint chunks.

    Returns:
        list[str]: The chunks in document order; an empty list for empty or
        whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a negative ``chunk_size`` would silently yield no
            chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word, so that overlapping windows
        # do not emit a tail chunk fully contained in the previous one.
        if start + chunk_size >= len(words):
            break
    return chunks

def get_embedding(text):
    """Encode text with the module-level embedding model.

    Args:
        text: The text to embed.

    Returns:
        The result of ``embedding_model.encode(text).tolist()``, or None if
        encoding raised (the error is printed rather than propagated).
    """
    try:
        vector = embedding_model.encode(text).tolist()
    except Exception as err:
        print("Embedding error:", err)
        vector = None
    return vector

def embed_and_store(chunks, batch_size=100):
    """Embed text chunks and upsert them into the Pinecone index INDEX_NAME.

    The index is created first if it does not exist yet. Each stored vector
    gets a fresh UUID-based id and carries its source chunk as ``text``
    metadata. Chunks whose embedding fails are skipped.

    Args:
        chunks: Iterable of text chunks to store.
        batch_size: Maximum number of vectors sent per upsert request.
    """
    # Compare against the index *names*: per the Pinecone client docs,
    # list_indexes() yields index descriptions, so a bare `in` test with a
    # string never matches and create_index would fail on every re-run once
    # the index exists.
    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new index: {INDEX_NAME}")
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIM,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )

    vectors = []
    for chunk in chunks:
        embedding = get_embedding(chunk)
        if embedding:
            vectors.append({
                "id": f"resume-{uuid.uuid4()}",
                "values": embedding,
                "metadata": {"text": chunk},
            })

    if not vectors:
        # Nothing could be embedded; avoid sending an empty upsert request.
        return

    index = pc.Index(INDEX_NAME)
    # Upsert in batches instead of issuing one network request per chunk.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

def retrieve_context(question, top_k=3):
    """Fetch the stored chunks most similar to a question.

    Args:
        question: The query text to embed and search with.
        top_k: Number of matches requested from the index.

    Returns:
        str: The ``text`` metadata of the matches joined by newlines.
        Failures are reported through the return value as well:
        "Failed to embed query." when the question cannot be embedded and
        "No relevant context found." when the query yields no matches.
    """
    pinecone_index = pc.Index(INDEX_NAME)
    question_vector = get_embedding(question)
    if question_vector:
        response = pinecone_index.query(
            vector=question_vector,
            top_k=top_k,
            include_metadata=True,
        )
        hits = response['matches']
        if hits:
            return "\n".join(hit['metadata']['text'] for hit in hits)
        return "No relevant context found."
    return "Failed to embed query."

def generate_answer(question, context):
    """Answer a question from the given context with the QA pipeline.

    Args:
        question: The question to answer.
        context: The text the answer is extracted from.

    Returns:
        str: The pipeline's ``answer`` field, or "Error generating answer."
        if the pipeline raised (the error is printed rather than propagated).
    """
    try:
        # Use the documented keyword interface of the question-answering
        # pipeline rather than passing a single positional dict.
        result = qa_pipeline(question=question, context=context)
        return result["answer"]
    except Exception as e:
        print("Answer generation error:", e)
        return "Error generating answer."


In [4]:
from google.colab import files

uploaded = files.upload()  # Choose your resume file (.pdf, .docx, .txt)
# A cancelled upload leaves the mapping empty; fail with a clear message
# instead of an IndexError on the first-key lookup.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a file.")
file_path = next(iter(uploaded))  # name of the first uploaded file
print(f"Uploaded file: {file_path}")

Saving PINECONE POINTS.pdf to PINECONE POINTS.pdf
Uploaded file: PINECONE POINTS.pdf


In [5]:
# Extract, chunk and index the uploaded document.
if os.path.exists(file_path):
    print(f"Processing file: {file_path}")
    text = extract_text(file_path)
    print(f"\nExtracted text (first 300 chars):\n{text[:300]}")

    chunks = chunk_text(text)
    print(f"Created {len(chunks)} text chunks.")

    if not chunks:
        # Unsupported or empty documents yield no chunks; do not touch
        # Pinecone or report a successful store in that case.
        print("No text could be extracted; nothing was stored.")
    elif check_pinecone_connection():
        embed_and_store(chunks)
        print("Chunks successfully stored in Pinecone.")
    else:
        print("Failed to connect to Pinecone.")
else:
    print("File not found. Please check the path.")


Processing file: PINECONE POINTS.pdf

Extracted text (first 300 chars):
PINECONE POINTS 
1. Setup and install the packages initially 
   pip install pinecone-client openai 
 
 pinecone-client: Official Python client to interact with the Pinecone vector database. 
 openai: Required to access OpenAI’s services, especially for generating embeddings 
using OpenAI models l
Created 5 text chunks.
Connected to Pinecone. Indexes available: []
Creating new index: rag-demo
Chunks successfully stored in Pinecone.


In [7]:
question = "show me the packages to install"
context = retrieve_context(question)
print("\nTop context retrieved:\n", context)

# retrieve_context reports failures as non-empty message strings, so a plain
# truthiness check would pass those messages to the QA model as if they were
# document text. Treat them as "no context" instead.
RETRIEVAL_FAILURE_MESSAGES = {"Failed to embed query.", "No relevant context found."}

if context and context not in RETRIEVAL_FAILURE_MESSAGES:
    answer = generate_answer(question, context)
    print("\nFinal Answer:\n", answer)
else:
    print("\nNo usable context retrieved; skipping answer generation.")



Top context retrieved:
 PINECONE POINTS 1. Setup and install the packages initially pip install pinecone-client openai  pinecone-client: Official Python client to interact with the Pinecone vector database.  openai: Required to access OpenAI’s services, especially for generating embeddings using OpenAI models like text-embedding-ada-002 2. Now importing the required libraries import pinecone  This imports the Pinecone SDK, allowing you to initialize, create, connect to, and query Pinecone vector indexes. 3. Now need to fetch the API keys from the pinecone console pinecone_api_key = “ your_key ” pinecone_environment = ” your_env “  pinecone_api_key: Your personal access key for using Pinecone.  pinecone_environment:
The Pinecone region where your index is hosted (e.g., "us- east-1").  You get both from your Pinecone console. 4. Initialize the pinecone by giving your API key and your environment pinecone.init(api_key = pinecone_api_key, environment = pinecone_env)  This sets up y