## Model: distilgpt2

In [1]:
import os
import logging
import requests
from pinecone import Pinecone
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from transformers import pipeline

verify Pinecone integration

In [2]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def verify_pinecone_vectors(index_name='youtube-transcripts', expected_count=615):
    """Verify vectors in Pinecone index."""
    load_dotenv()
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    if not pinecone_api_key:
        logging.error('PINECONE_API_KEY not found in .env')
        raise ValueError('PINECONE_API_KEY not found')

    # Initialize Pinecone
    pc = Pinecone(api_key=pinecone_api_key)
    if index_name not in [idx['name'] for idx in pc.list_indexes()]:
        logging.error(f'Index {index_name} not found')
        raise ValueError(f'Index {index_name} not found')

    # Connect to index
    index = pc.Index(index_name)
    
    # Get index stats
    stats = index.describe_index_stats()
    total_vectors = stats['total_vector_count']
    
    # Verify vector count
    logging.info(f'Total vectors in index {index_name}: {total_vectors}')
    if total_vectors == expected_count:
        logging.info(f'Verification successful: {total_vectors} vectors match expected count {expected_count}')
    else:
        logging.warning(f'Verification failed: Found {total_vectors} vectors, expected {expected_count}')

    # Sample a few vector IDs to confirm format
    sample_ids = [f'{i}_0' for i in range(1, 6)]  # Check first chunk of videos 1-5
    for sample_id in sample_ids:
        try:
            result = index.fetch(ids=[sample_id])
            if sample_id in result['vectors']:
                logging.info(f'Found vector for chunk_id {sample_id}')
            else:
                logging.warning(f'No vector found for chunk_id {sample_id}')
        except Exception as e:
            logging.error(f'Error fetching chunk_id {sample_id}: {e}')

    return total_vectors

# Run verification
verify_pinecone_vectors()

2025-06-26 17:53:16,283 - INFO - Total vectors in index youtube-transcripts: 615
2025-06-26 17:53:16,284 - INFO - Verification successful: 615 vectors match expected count 615
2025-06-26 17:53:16,583 - INFO - Found vector for chunk_id 1_0
2025-06-26 17:53:16,726 - INFO - Found vector for chunk_id 2_0
2025-06-26 17:53:16,845 - INFO - Found vector for chunk_id 3_0
2025-06-26 17:53:17,000 - INFO - Found vector for chunk_id 4_0
2025-06-26 17:53:17,122 - INFO - Found vector for chunk_id 5_0


615

Implement Retrieval-Augmented Generation (RAG) Pipeline

In [3]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def retrieve_chunks(query, index_name='youtube-transcripts', top_k=5):
    """Retrieve top-k transcript chunks from Pinecone."""
    load_dotenv()
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    if not pinecone_api_key:
        logging.error('PINECONE_API_KEY not found in .env')
        raise ValueError('PINECONE_API_KEY not found')

    # Initialize Pinecone and SentenceTransformer
    pc = Pinecone(api_key=pinecone_api_key)
    index = pc.Index(index_name)
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode query
    query_embedding = embedder.encode(query, show_progress_bar=False).tolist()

    # Query Pinecone
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    chunks = [match['metadata']['text'] for match in results['matches']]
    
    logging.info(f'Retrieved {len(chunks)} chunks for query: {query}')
    return chunks

In [4]:
# Ensure latest transformers and huggingface_hub are installed
#%pip install --upgrade transformers huggingface_hub
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load Mistral model and tokenizer
#import torch
#os.environ["USE_TF"] = "0" 
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def generate_response(query, chunks):
    context = "\n".join(chunks) if chunks else "No relevant information found."

    prompt = f"""[INST] You are a helpful ServiceNow expert. Based on the following context, answer the query accurately and concisely.
Context:
{context}

Query: {query}
Answer: [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.2
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract response after "Answer:" if needed
    return response.split("Answer:")[-1].strip()

Built Chatbot interface

In [6]:
test_queries = [
    "What is ITSM in ServiceNow?",
    "Explain CMDB relationships.",
    "How does Incident Management work?"
]

for query in test_queries:
    chunks = retrieve_chunks(query)
    result = generate_response(query, chunks)
    print(f"Q: {query}\nA: {result}\n")

2025-06-26 17:55:52,142 - INFO - Use pytorch device_name: cpu
2025-06-26 17:55:52,145 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-26 17:55:56,131 - INFO - Retrieved 5 chunks for query: What is ITSM in ServiceNow?
Token indices sequence length is longer than the specified maximum sequence length for this model (845 > 512). Running this sequence through the model will result in indexing errors
2025-06-26 17:55:58,049 - INFO - Use pytorch device_name: cpu
2025-06-26 17:55:58,049 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Q: What is ITSM in ServiceNow?
A: aiops and event management



2025-06-26 17:56:00,548 - INFO - Retrieved 5 chunks for query: Explain CMDB relationships.
2025-06-26 17:56:01,867 - INFO - Use pytorch device_name: cpu
2025-06-26 17:56:01,868 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Q: Explain CMDB relationships.
A: 



2025-06-26 17:56:04,634 - INFO - Retrieved 5 chunks for query: How does Incident Management work?


Q: How does Incident Management work?
A: The key is being proactive versus reactive and having the right supporting system to be able to achieve that

