In [None]:
"""
Computational Linguistics Teaching Assistant
A chatbot that answers questions about NLP using Stanford's CS224N lecture content.
"""


In [15]:
!pip install pytube openai-whisper langchain openai yt-dlp pinecone-client python-dotenv
!pip install -U langchain-community
!pip install -U langchain-pinecone
!pip install -U langchain-openai langchain-core
!pip install torch==2.1.1+cu121 torchvision==0.16.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install huggingface_hub==0.11.1
!pip install -U pinecone-client
!pip install pydantic
!pip install python-dotenv
!pip install -U sentence-transformers
!pip install git+https://github.com/openai/whisper.git
!pip install langsmith
!pip install --upgrade gradio
!pip install flake8 black isort


[0mLooking in links: https://download.pytorch.org/whl/torch_stable.html
[0mCollecting huggingface_hub==0.11.1
  Using cached huggingface_hub-0.11.1-py3-none-any.whl.metadata (7.5 kB)
Using cached huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.26.2
    Uninstalling huggingface-hub-0.26.2:
      Successfully uninstalled huggingface-hub-0.26.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.5.0 requires huggingface-hub>=0.25.1, but you have huggingface-hub 0.11.1 which is incompatible.
gradio-client 1.4.2 requires huggingface-hub>=0.19.3, but you have huggingface-hub 0.11.1 which is incompatible.
tokenizers 0.20.3 requires huggingface-hub<1.0,>=0.16.4, but you have huggingface-hub 0.11.1 which is incompatible.


In [10]:
# Standard library imports
import os
import json
import shutil
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Type, Tuple

# Third-party imports
import gradio as gr
import numpy as np
import torch
import whisper
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from yt_dlp import YoutubeDL

# LangChain imports
from langchain.agents import Tool, AgentExecutor, create_react_agent, initialize_agent
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import SystemMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import BaseTool
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore

In [11]:
# CONFIGURATION: ENVIRONMENT VARIABLES
#
# SECURITY: secrets are read from the environment and must never be written in
# this notebook. Any API key that has ever been pasted into a notebook cell (or
# its saved output) has to be treated as compromised: revoke it with the provider
# and issue a new one.
#
# Provide the secrets either as real environment variables or in a `.env` file
# that is excluded from version control:
#   OPENAI_API_KEY      (required)
#   PINECONE_API_KEY    (required)
#   LANGCHAIN_API_KEY   (required only when LangSmith tracing is enabled)
#   PINECONE_INDEX_URL  (optional)

# Load variables from the nearest .env file, if one exists. Values already
# present in the process environment are left untouched.
load_dotenv(find_dotenv())

# Non-secret settings get defaults so that later cells, which index
# os.environ[...] directly, keep working when no .env file overrides them.
_NON_SECRET_DEFAULTS = {
    "PINECONE_INDEX_NAME": "lecture-embeddings",
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    "LANGCHAIN_PROJECT": "pr-rash-improvement-74",
}
for _name, _default in _NON_SECRET_DEFAULTS.items():
    os.environ.setdefault(_name, _default)

# Load and verify environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')
PINECONE_INDEX_URL = os.getenv('PINECONE_INDEX_URL')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

# Verify API keys are loaded (only booleans are printed, never the key values)
print("OpenAI API Key Loaded:", bool(OPENAI_API_KEY))
print("Pinecone API Key Loaded:", bool(PINECONE_API_KEY))
print("LangChain API Key Loaded:", bool(LANGCHAIN_API_KEY))

_missing_secrets = [
    _name
    for _name in ("OPENAI_API_KEY", "PINECONE_API_KEY", "LANGCHAIN_API_KEY")
    if not os.getenv(_name)
]
if _missing_secrets:
    print(
        "WARNING: missing environment variables: "
        + ", ".join(_missing_secrets)
        + ". Set them in the environment or in a .env file before continuing."
    )

# Initialize base clients
openai_client = OpenAI(api_key=OPENAI_API_KEY)
llm = ChatOpenAI()

OpenAI API Key Loaded: True
Pinecone API Key Loaded: True
LangChain API Key Loaded: True


In [None]:
# YOUTUBE DOWNLOAD AND TRANSCRIPTION SETUP
# Set up storage paths. Every working directory used by the pipeline lives under
# STORAGE_PATH, which itself sits inside NOTEBOOKS_PATH.
NOTEBOOKS_PATH = "/notebooks"
STORAGE_PATH = os.path.join(NOTEBOOKS_PATH, "youtube_transcripts")
DOWNLOADS_DIR = os.path.join(STORAGE_PATH, "downloads")  # audio files written by download_audio
TRANSCRIPTS_DIR = os.path.join(STORAGE_PATH, "transcripts")  # text files written by transcribe_audio
MODEL_CACHE_DIR = os.path.join(STORAGE_PATH, "model_cache")  # intended cache location for Whisper

# Create necessary directories. exist_ok=True makes this safe to re-run; note the
# message is printed even when the directory already existed.
for directory in [DOWNLOADS_DIR, TRANSCRIPTS_DIR, MODEL_CACHE_DIR]:
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

def read_urls_from_file(file_path: str) -> List[str]:
    """Return the non-blank lines of a text file.

    Each line is stripped of surrounding whitespace; lines that are empty after
    stripping are dropped. Order is preserved.
    """
    collected: List[str] = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                collected.append(cleaned)
    return collected

def download_audio(url: str, lecture_number: int) -> str:
    """
    Download the audio track of a YouTube video as ``Lecture<N>`` in DOWNLOADS_DIR.

    Args:
        url: Video URL handed to yt-dlp.
        lecture_number: Sequence number used to build the output file name.

    Returns:
        The output template with ``.mp3`` appended. The function does not check
        that the file exists; presumably the FFmpegExtractAudio post-processor
        writes it there.

    Raises:
        Exception: any error from the download is printed and re-raised.
    """
    target_stem = os.path.join(DOWNLOADS_DIR, f'Lecture{lecture_number}')
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192'
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_mp3],
        'outtmpl': target_stem,
        'quiet': False,
        'no_warnings': False
    }

    with YoutubeDL(options) as downloader:
        try:
            downloader.download([url])
        except Exception as exc:
            print(f"Error downloading {url}: {str(exc)}")
            raise
        return f"{target_stem}.mp3"

def transcribe_audio(audio_path: str) -> tuple[str, str]:
    """
    Transcribe an audio file with Whisper and save the transcript to a file.

    The "base" Whisper model is loaded on CUDA when available, otherwise on CPU.
    Model weights are cached in MODEL_CACHE_DIR. The transcript is written as
    UTF-8 to ``<audio file name without extension>_transcript.txt`` inside
    TRANSCRIPTS_DIR.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        Tuple of (transcript text, transcript file path).

    Raises:
        Exception: any error during loading, transcription or writing is
            printed and re-raised.
    """
    print(f"\nStarting transcription process for {audio_path}")
    print("Loading Whisper model...")

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # download_root is the load_model parameter that controls where weights
        # are cached; setting a WHISPER_CACHE_DIR environment variable has no
        # effect on the library.
        model = whisper.load_model("base", device=device, download_root=MODEL_CACHE_DIR)
        print("Model loaded successfully. Beginning transcription...")

        result = model.transcribe(audio_path)
        print("Transcription completed successfully")

        # splitext removes only the trailing extension, whatever it is, instead
        # of deleting every ".mp3" substring in the name.
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        transcript_path = os.path.join(TRANSCRIPTS_DIR, f"{base_name}_transcript.txt")

        print(f"Saving transcript to {transcript_path}")
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(result['text'])
        print("Transcript saved successfully")

        return result['text'], transcript_path
    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        raise

# LangChain tool: downloads the audio for one URL (or for every URL listed in a
# .txt file), transcribes each download, and returns a plain-text summary report.
class BatchDownloadAndTranscribeTool(BaseTool):
    name: str = "Batch Download and Transcribe"
    description: str = "Downloads audio from multiple YouTube URLs and transcribes them to text. Input can be either a single URL or path to a text file containing URLs."

    def _run(self, input_path_or_url: str) -> str:
        """Download and transcribe every URL, then return a summary report.

        An input ending in ``.txt`` is read as a file of URLs; anything else is
        treated as a single URL. Lectures are numbered from 1 in input order.
        A failure on one URL is recorded and does not stop the batch. Any other
        error is returned as a message string rather than raised.
        """
        try:
            print(f"\nStarting batch process for input: {input_path_or_url}")

            if not input_path_or_url.endswith('.txt'):
                urls = [input_path_or_url]
            else:
                urls = read_urls_from_file(input_path_or_url)
                print(f"Found {len(urls)} URLs in file")

            outcomes = []
            for lecture_no, url in enumerate(urls, 1):
                print(f"\nProcessing URL {lecture_no}/{len(urls)}: {url}")
                try:
                    print(f"Step 1: Downloading audio for Lecture {lecture_no}...")
                    audio_path = download_audio(url, lecture_no)
                    print(f"✓ Audio downloaded to: {audio_path}")

                    print(f"Step 2: Transcribing Lecture {lecture_no}...")
                    transcript, transcript_path = transcribe_audio(audio_path)
                    print(f"✓ Transcript saved to: {transcript_path}")

                    # Sizes are reported in MB for audio and KB for transcripts.
                    audio_mb = os.path.getsize(audio_path) / (1024 * 1024)
                    transcript_kb = os.path.getsize(transcript_path) / 1024

                    outcome = {
                        'lecture_number': lecture_no,
                        'url': url,
                        'audio_path': audio_path,
                        'audio_size': audio_mb,
                        'transcript_path': transcript_path,
                        'transcript_size': transcript_kb,
                        'success': True
                    }
                except Exception as exc:
                    print(f"Error processing Lecture {lecture_no}: {str(exc)}")
                    outcome = {
                        'lecture_number': lecture_no,
                        'url': url,
                        'error': str(exc),
                        'success': False
                    }
                outcomes.append(outcome)

            # Assemble the report from parts and join once at the end.
            report_parts = ["\nBatch Processing Summary:\n", "=" * 50 + "\n"]
            for outcome in outcomes:
                report_parts.append(f"\nLecture {outcome['lecture_number']}:\n")
                report_parts.append(f"URL: {outcome['url']}\n")
                if outcome['success']:
                    report_parts.append(f"Audio ({outcome['audio_size']:.2f}MB): {outcome['audio_path']}\n")
                    report_parts.append(f"Transcript ({outcome['transcript_size']:.2f}KB): {outcome['transcript_path']}\n")
                else:
                    report_parts.append(f"Failed: {outcome['error']}\n")

            return "".join(report_parts)

        except Exception as exc:
            error_msg = f"An error occurred during batch processing: {str(exc)}"
            print(error_msg)
            return error_msg

    def _arun(self, url: str) -> str:
        """Asynchronous execution is not supported; always raises NotImplementedError."""
        raise NotImplementedError("Async not implemented")

# Instantiate the download-and-transcribe tool (the agent itself is built further below)
batch_download_tool = BatchDownloadAndTranscribeTool()

# Helper function to format tools
def format_tools(tools):
    """Render tools as one ``name: description`` line each, joined by newlines."""
    rendered = []
    for tool in tools:
        rendered.append(f"{tool.name}: {tool.description}")
    return "\n".join(rendered)

# Initialize memory for conversational context
# NOTE(review): the prompt below has no {chat_history} placeholder, so the stored
# history does not appear to reach the model — confirm whether that is intended.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Define the prompt template with tool instructions
template = """You are a helpful assistant that processes YouTube videos for transcription.

You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
{agent_scratchpad}"""

# The tool list is defined before the prompt so the prompt can be filled from it.
tools = [batch_download_tool]

prompt = PromptTemplate(
    template=template,
    input_variables=["input", "agent_scratchpad"],
    # Partial variables must be plain values or zero-argument callables; a
    # one-argument lambda raises TypeError as soon as the prompt is formatted.
    # The tools are already known here, so plain strings are used.
    partial_variables={
        "tools": format_tools(tools),
        "tool_names": ", ".join(t.name for t in tools),
    },
)

# Initialize the agent with the tool
agent = create_react_agent(
    llm=llm,
    tools=tools,
    prompt=prompt
)

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True
)

# Reset directories only when explicitly requested.
# Deleting DOWNLOADS_DIR and TRANSCRIPTS_DIR throws away every downloaded audio
# file and every transcript, which are expensive to regenerate and are read by
# the transcript-processing step. Set RESET_STORAGE = True for a clean start.
RESET_STORAGE = False

if RESET_STORAGE:
    for dir_path in [DOWNLOADS_DIR, TRANSCRIPTS_DIR]:
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path, exist_ok=True)
    print("Storage reset: downloads and transcripts were deleted.")

print("YouTube download and transcription setup complete!")

In [None]:
# TRANSCRIPT PROCESSING

# Define lecture information and metadata.
# Keyed by lecture number (int). Each entry holds:
#   'title'        - lecture title, copied onto every chunk of that lecture
#   'main_topics'  - coarse topic labels, copied onto every chunk
#   'key_concepts' - finer-grained concept labels, copied onto every chunk
#   'builds_on'    - concepts the lecture depends on (not read by
#                    process_all_transcripts)
# Lectures without an entry here get a generic fallback in process_all_transcripts.
LECTURE_INFO = {
    1: {
        'title': 'Natural Language Processing with Deep Learning',
        'main_topics': ['NLP basics', 'Word Vectors'],
        'key_concepts': ['Natural Language Processing', 'Word Vectors', 'Singular Value Decomposition', 
                        'Skip-gram', 'Continuous Bag of Words', 'Negative Sampling', 
                        'Hierarchical Softmax', 'Word2Vec'],
        'builds_on': []  # First lecture, no prerequisites
    },
    2: {
        'title': 'Word Vector Representations: word2vec',
        'main_topics': ['Word Vector Implementation', 'Word2Vec Details'],
        'key_concepts': ['Word Vectors', 'Skip-gram', 'Continuous Bag of Words', 
                        'Negative Sampling', 'Hierarchical Softmax', 'Word2Vec'],
        'builds_on': ['Word Vectors', 'Natural Language Processing']
    },
    3: {
        'title': 'GloVe: Global Vectors for Word Representation',
        'main_topics': ['GloVe', 'Word Vector Evaluation'],
        'key_concepts': ['GloVe', 'Intrinsic evaluation', 'Extrinsic evaluation', 
                        'Word analogies', 'Context windows', 'Window classification'],
        'builds_on': ['Word Vectors', 'Word2Vec']
    },
    4: {
        'title': 'Word Window Classification and Neural Networks',
        'main_topics': ['Neural Networks', 'Classification'],
        'key_concepts': ['Neural networks', 'Forward computation', 'Backward propagation',
                        'Neuron Units', 'Max-margin Loss', 'Gradient checks', 
                        'Xavier initialization', 'Learning rates', 'Adagrad'],
        'builds_on': ['Window classification']
    }
}

def load_transcripts(transcripts_dir: str) -> Dict[int, str]:
    """Load every ``Lecture<N>_transcript.txt`` file found directly in a directory.

    Files are read as UTF-8, matching how the transcripts are written. A file
    whose name matches ``*_transcript.txt`` but does not yield an integer
    lecture number is skipped with a message instead of aborting the load.

    Args:
        transcripts_dir: Directory to scan (not searched recursively).

    Returns:
        Mapping of lecture number to transcript text. Empty when the directory
        has no matching files.
    """
    transcripts: Dict[int, str] = {}

    for file_path in sorted(Path(transcripts_dir).glob("*_transcript.txt")):
        if not file_path.is_file():
            continue

        # "Lecture12_transcript.txt" -> "Lecture12" -> 12
        prefix = file_path.name.split('_')[0]
        try:
            lecture_num = int(prefix.replace('Lecture', ''))
        except ValueError:
            print(f"Skipping {file_path.name}: no lecture number in file name")
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            transcripts[lecture_num] = f.read()

    print(f"Loaded {len(transcripts)} transcripts")
    return transcripts

def create_chunks(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[Dict]:
    """Split text into overlapping chunks, preferring to end on a period.

    Each chunk covers about ``chunk_size`` characters. When a chunk would end
    before the end of the text, the end is extended to just after the next
    period if one occurs within the following 100 characters. Each subsequent
    chunk starts ``overlap`` characters before the previous chunk's end.

    Args:
        text: Text to split.
        chunk_size: Nominal chunk length in characters; must be positive.
        overlap: Characters shared between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so that every step moves forward.

    Returns:
        List of dicts with 'content' (the stripped chunk text), 'char_start'
        and 'char_end' (offsets into ``text``; 'char_end' is exclusive).
        Empty list for empty text.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    period_lookahead = 100  # how far past the nominal end to look for a period
    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        # Determine end position of current chunk
        end = min(start + chunk_size, text_length)

        # If we're not at the end of the text, find the next period for clean breaks
        if end < text_length:
            period_at = text.find('.', end, min(end + period_lookahead, text_length))
            if period_at != -1:
                end = period_at + 1

        chunks.append({
            'content': text[start:end].strip(),
            'char_start': start,
            'char_end': end
        })

        # The final chunk has been emitted. Stepping back by `overlap` here would
        # land before the end of the text again and the loop would never finish.
        if end >= text_length:
            break

        start = end - overlap

    return chunks

# Main processing function
def process_all_transcripts():
    """Chunk every transcript, attach lecture metadata, and save the result as JSON.

    Transcripts are loaded from TRANSCRIPTS_DIR. Each is split with
    create_chunks, and every chunk is tagged with the lecture's number, title,
    key concepts and main topics from LECTURE_INFO (a generic fallback is used
    for lectures not listed there). All chunks are written to
    ``processed/lecture_chunks.json`` under STORAGE_PATH.

    Returns:
        The list of chunk dicts that was written.
    """
    print("\nStarting transcript processing...")

    transcripts = load_transcripts(TRANSCRIPTS_DIR)
    processed_chunks = []

    for lecture_num, text in transcripts.items():
        print(f"\nProcessing Lecture {lecture_num}...")

        # Fallback metadata for lectures missing from LECTURE_INFO
        fallback_info = {
            'title': f'Lecture {lecture_num}',
            'key_concepts': [],
            'main_topics': []
        }
        lecture_info = LECTURE_INFO.get(lecture_num, fallback_info)

        chunks = create_chunks(text)
        chunk_total = len(chunks)

        # One record per chunk: lecture-level metadata plus the chunk's own fields
        processed_chunks.extend(
            {
                'lecture_number': lecture_num,
                'lecture_title': lecture_info['title'],
                'key_concepts': lecture_info['key_concepts'],
                'main_topics': lecture_info['main_topics'],
                'chunk_index': position,
                'total_chunks': chunk_total,
                'content': piece['content'],
                'char_start': piece['char_start'],
                'char_end': piece['char_end']
            }
            for position, piece in enumerate(chunks)
        )

        print(f"Created {chunk_total} chunks for Lecture {lecture_num}")

    # Persist the combined chunk list
    output_dir = os.path.join(STORAGE_PATH, "processed")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'lecture_chunks.json')

    with open(output_file, 'w') as sink:
        json.dump(processed_chunks, sink, indent=2)

    print(f"\nProcessing complete!")
    print(f"Total chunks created: {len(processed_chunks)}")
    print(f"Saved to: {output_file}")

    return processed_chunks

# Execute the processing
if __name__ == "__main__":
    processed_chunks = process_all_transcripts()

In [None]:
# EMBEDDING GENERATION

# Initialize OpenAI client for embeddings
print("Initializing embedding generation...")

def get_embedding(text: str) -> list:
    """Request a ``text-embedding-ada-002`` embedding for ``text`` via openai_client.

    Returns the embedding of the first item in the response. Any exception is
    printed and swallowed, in which case ``None`` is returned.
    """
    try:
        api_response = openai_client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"
        )
        first_item = api_response.data[0]
        return first_item.embedding
    except Exception as exc:
        print(f"Error generating embedding: {exc}")
        return None

def process_chunks_with_embeddings():
    """Embed every chunk in ``processed/lecture_chunks.json`` and save the result.

    Chunks whose embedding could not be generated are reported and left out of
    the output. The result is written to
    ``processed/lecture_chunks_with_embeddings.json`` under STORAGE_PATH.

    Returns:
        The list of chunk dicts, each with an added 'embedding' key, or ``None``
        when the input file is missing or is not valid JSON.
    """
    processed_dir = os.path.join(STORAGE_PATH, "processed")

    input_file = os.path.join(processed_dir, "lecture_chunks.json")
    print(f"Loading chunks from {input_file}")

    try:
        with open(input_file, 'r') as source:
            chunks = json.load(source)
        print(f"Loaded {len(chunks)} chunks")
    except FileNotFoundError:
        print("Error: Processed chunks file not found. Please run transcript processing first.")
        return
    except json.JSONDecodeError:
        print("Error: Invalid JSON file")
        return

    total_chunks = len(chunks)
    chunks_with_embeddings = []

    print("\nGenerating embeddings for all chunks...")
    for position, chunk in enumerate(chunks, 1):
        # '\r' keeps the progress counter on a single output line
        print(f"Processing chunk {position}/{total_chunks}", end='\r')

        embedding = get_embedding(chunk['content'])
        if not embedding:
            print(f"\nWarning: Failed to generate embedding for chunk {position}")
            continue

        # Shallow copy of the chunk with the embedding attached
        chunks_with_embeddings.append({**chunk, 'embedding': embedding})

    output_file = os.path.join(processed_dir, "lecture_chunks_with_embeddings.json")
    with open(output_file, 'w') as sink:
        json.dump(chunks_with_embeddings, sink)

    print(f"\nEmbedding generation complete!")
    print(f"Processed {len(chunks_with_embeddings)} chunks with embeddings")
    print(f"Saved to: {output_file}")

    # Report the dimension of the first embedding as a quick sanity check
    if chunks_with_embeddings:
        embedding_dim = len(chunks_with_embeddings[0]['embedding'])
        print(f"Embedding dimension: {embedding_dim}")

    return chunks_with_embeddings

def verify_embeddings(chunks_with_embeddings):
    """Check that every chunk carries a complete 1536-dimensional embedding.

    A chunk fails when it has no 'embedding' key, when the embedding length is
    not 1536, or when the embedding contains ``None``. Every problem found is
    printed.

    Returns:
        True when all chunks pass; False otherwise, including when the input
        is empty or None.
    """
    if not chunks_with_embeddings:
        print("No embeddings to verify")
        return False

    expected_dim = 1536  # Expected dimension for OpenAI ada-002 embeddings
    problem_count = 0

    print("\nVerifying embeddings...")
    for position, record in enumerate(chunks_with_embeddings):
        if 'embedding' not in record:
            print(f"Chunk {position} missing embedding")
            problem_count += 1
            continue

        vector = record['embedding']

        actual_dim = len(vector)
        if actual_dim != expected_dim:
            print(f"Chunk {position} has incorrect dimension: {actual_dim} (expected {expected_dim})")
            problem_count += 1

        if None in vector:
            print(f"Chunk {position} contains null values in embedding")
            problem_count += 1

    passed = problem_count == 0
    if passed:
        print("✓ All embeddings verified successfully!")
    else:
        print("× Some embeddings failed verification")

    return passed

# Execute embedding generation and verification.
# In a notebook kernel __name__ is "__main__", so this guard passes and the
# block runs every time the cell is executed.
if __name__ == "__main__":
    print("Starting embedding generation process...")
    
    # Generate embeddings (None is returned when the chunk file cannot be loaded)
    chunks_with_embeddings = process_chunks_with_embeddings()
    
    # Verify embeddings only when at least one was produced
    if chunks_with_embeddings:
        verify_embeddings(chunks_with_embeddings)
    
    print("\nEmbedding process complete!")

In [12]:
# PINECONE SET UP AND VECTOR STORAGE

import os
import json
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from tqdm import tqdm

def initialize_pinecone(index_name):
    """Connect to a Pinecone index, creating it when the connection attempt fails.

    A new index is created as a 1536-dimension, cosine-metric serverless index
    on AWS ``us-west-2``.

    Returns:
        The index handle, or ``None`` when initialisation fails.
    """
    print("\nInitializing Pinecone...")
    try:
        client = Pinecone(api_key=PINECONE_API_KEY)

        try:
            handle = client.Index(index_name)
            print(f"Connected to existing Pinecone index: {index_name}")
        except Exception:
            # NOTE(review): any failure to open the index (not only "index does
            # not exist") takes this creation path.
            print(f"Index not found, creating new index: {index_name}")
            client.create_index(
                name=index_name,
                dimension=1536,  # OpenAI ada-002 embedding dimension
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-west-2")
            )
            handle = client.Index(index_name)
            print(f"Created new Pinecone index: {index_name}")

        return handle
    except Exception as exc:
        print(f"Error initializing Pinecone: {exc}")
        return None

def prepare_vectors_for_upsert(chunks_with_embeddings):
    """Convert embedded chunks into ``{'id', 'values', 'metadata'}`` records.

    The id is ``chunk_<lecture_number>_<chunk_index>``. The metadata carries the
    lecture number, lecture title, chunk index, key concepts and the chunk text
    (under 'content'). Other chunk fields are not included.
    """
    def _as_vector(record):
        # Builds one upsert record; key order is id, values, metadata.
        return {
            'id': f"chunk_{record['lecture_number']}_{record['chunk_index']}",
            'values': record['embedding'],
            'metadata': {
                'lecture_number': record['lecture_number'],
                'lecture_title': record['lecture_title'],
                'chunk_index': record['chunk_index'],
                'key_concepts': record['key_concepts'],
                'content': record['content']  # Main content stored here
            }
        }

    return [_as_vector(record) for record in chunks_with_embeddings]

def upsert_to_pinecone(index, vectors, batch_size=100):
    """Send ``vectors`` to ``index.upsert`` in consecutive batches.

    Stops at the first batch that raises: the error is printed and False is
    returned, leaving later batches unsent. Returns True when every batch was
    sent (also when there are no vectors).
    """
    print("\nUpserting vectors to Pinecone...")
    total_vectors = len(vectors)

    for batch_no, offset in enumerate(range(0, total_vectors, batch_size), start=1):
        current_batch = vectors[offset:offset + batch_size]
        try:
            index.upsert(vectors=current_batch)
            print(f"Upserted batch {batch_no}/{(total_vectors + batch_size - 1)//batch_size}")
        except Exception as exc:
            print(f"Error upserting batch starting at index {offset}: {exc}")
            return False

    return True

def verify_pinecone_index(index):
    """Print index statistics and the metadata of one test-query match.

    The test query embeds the string "test" with ``text-embedding-ada-002`` and
    asks for the single nearest vector. The 'content' field is summarised by
    length; other metadata fields are printed in full.

    Returns:
        True when the calls complete (even if the query matches nothing);
        False when any step raises.
    """
    try:
        index_stats = index.describe_index_stats()

        print("\nPinecone Index Statistics:")
        print(f"Total vectors: {index_stats.total_vector_count}")
        print(f"Dimension: {index_stats.dimension}")

        # Embed a probe string and look up its nearest neighbour
        embedder = OpenAIEmbeddings(
            model="text-embedding-ada-002",
            openai_api_key=OPENAI_API_KEY
        )
        probe_vector = embedder.embed_query("test")

        response = index.query(
            vector=probe_vector,
            top_k=1,
            include_values=True,
            include_metadata=True
        )

        if not response.matches:
            print("Test query returned no results")
        else:
            print("\nTest query successful!")
            print("Sample document metadata:")
            sample_metadata = response.matches[0].metadata
            print("\nMetadata fields found:")
            for field in sample_metadata.keys():
                field_value = sample_metadata[field]
                if field == 'content':
                    print(f"{field}: <content length: {len(str(field_value))} chars>")
                else:
                    print(f"{field}: {field_value}")

        return True
    except Exception as exc:
        print(f"Error verifying index: {exc}")
        return False

def initialize_vector_store(index):
    """Wrap a Pinecone index in a LangChain PineconeVectorStore and smoke-test it.

    Document text is read from the 'content' metadata field. A one-result
    similarity search for "test" is run as a check.

    Returns:
        The vector store, or ``None`` when construction or the check raises.
    """
    try:
        embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
        store = PineconeVectorStore(
            index=index,
            embedding=embedder,
            text_key="content"  # Using 'content' as the text key
        )
        print("\nLangChain vector store initialized successfully")

        # Smoke test: fetch a single document
        hits = store.similarity_search("test", k=1)
        if hits:
            print("Vector store retrieval test successful")
            print(f"Retrieved document length: {len(hits[0].page_content)}")

        return store
    except Exception as exc:
        print(f"Error initializing vector store: {exc}")
        return None

def main_pinecone_setup(mode='connect'):
    """Connect to Pinecone, optionally upload vectors, and build the vector store.

    Args:
        mode: 'new' loads ``processed/lecture_chunks_with_embeddings.json`` and
            upserts its vectors before verifying; any other value skips the
            upload and only verifies the existing index.

    Returns:
        The LangChain vector store on success, or False when any stage fails.
    """
    index = initialize_pinecone(PINECONE_INDEX_NAME)
    if not index:
        return False

    upload_requested = mode == 'new'
    if upload_requested:
        # Read the previously generated embeddings from disk
        embeddings_path = os.path.join(STORAGE_PATH, "processed", "lecture_chunks_with_embeddings.json")
        try:
            with open(embeddings_path, 'r') as source:
                chunks_with_embeddings = json.load(source)
            print(f"\nLoaded {len(chunks_with_embeddings)} chunks with embeddings")
        except Exception as exc:
            print(f"Error loading embeddings file: {exc}")
            return False

        vectors = prepare_vectors_for_upsert(chunks_with_embeddings)
        print(f"Prepared {len(vectors)} vectors for upload")

        uploaded = upsert_to_pinecone(index, vectors)
        if not uploaded:
            return False

    index_ok = verify_pinecone_index(index)
    if not index_ok:
        return False

    store = initialize_vector_store(index)
    if not store:
        return False

    print("\nPinecone setup completed successfully!")
    return store

# Run the Pinecone setup. In a notebook kernel __name__ is "__main__", so this
# guard passes and the block runs every time the cell is executed.
if __name__ == "__main__":
    # Choose setup mode: 'new' for fresh setup, 'connect' for existing index
    setup_mode = 'connect'  # or 'new'
    
    # main_pinecone_setup returns the vector store on success and False on failure
    vector_store = main_pinecone_setup(mode=setup_mode)
    
    if vector_store:
        print("\nSetup complete! The vector store is ready for querying.")
        
        # Verification step: one more similarity search through the returned store
        print("\nPerforming final verification...")
        try:
            test_docs = vector_store.similarity_search("test query", k=1)
            if test_docs:
                print("✅ Vector store is working correctly")
                print(f"Sample document metadata keys: {list(test_docs[0].metadata.keys())}")
            else:
                print("⚠️ Vector store returned no results")
        except Exception as e:
            print(f"❌ Error in final verification: {e}")
    else:
        print("\nSetup failed. Please check the errors above.")


Initializing Pinecone...
Connected to existing Pinecone index: lecture-embeddings

Pinecone Index Statistics:
Total vectors: 311
Dimension: 1536

Test query successful!
Sample document metadata:

Metadata fields found:
chunk_index: 9.0
content: <content length: 1500 chars>
key_concepts: ['Neural Networks', 'Classification', 'Backpropagation']
lecture_number: 4.0
lecture_title: Word Window Classification and Neural Networks
page_content: tors. And we had trained these ward vectors on a very, very large corpus. And it learned all these three words appear often in similar context, so they're close by in the vector space. And now we're going to train, but our smaller sentiment data set only includes in a training set, the X, Y, Y, S, TV and telly and not television. So now what happens as we train these ward vectors? Well, they will start to move around. We'll project sentiment into them. And so you now might see telly and TV, so it's a pretty status set. So like to move somewhere else in

In [13]:
# PINECONE CONNECTION TESTING UTILITY

def verify_pinecone_connection():
    """
    Check that the Pinecone index named in the environment can be queried.

    Reads PINECONE_API_KEY and PINECONE_INDEX_NAME from ``os.environ``, queries
    the index with an all-zero 1536-dimension vector, and prints the metadata of
    the top match. String values longer than 100 characters are shown as a
    length plus a 100-character preview.

    Returns:
        True when the query returns a match; False when it returns none or when
        any step raises (the error is printed).
    """
    try:
        print("\n1. Connecting to Pinecone...")
        client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

        index_name = os.environ["PINECONE_INDEX_NAME"]
        print(f"2. Accessing index: {index_name}")
        index = client.Index(index_name)

        print("3. Creating test query...")
        probe_vector = [0.0] * 1536  # OpenAI embeddings dimension

        print("4. Performing test query...")
        response = index.query(
            vector=probe_vector,
            top_k=1,
            include_metadata=True
        )

        # Guard clause: nothing came back
        if not (response and response.matches):
            print("\n⚠️ Connection successful but no documents found")
            return False

        print("\n✅ Connection successful!")
        print("\nSample document metadata:")
        first_match = response.matches[0]
        for key, value in first_match.metadata.items():
            is_long_text = isinstance(value, str) and len(value) > 100
            if is_long_text:
                print(f"{key}: <text length: {len(value)} chars>")
                print("Preview:", value[:100], "...")
            else:
                print(f"{key}:", value)
        return True

    except Exception as exc:
        print(f"\n❌ Error: {str(exc)}")
        return False

if __name__ == "__main__":
    verify_pinecone_connection()


1. Connecting to Pinecone...
2. Accessing index: lecture-embeddings
3. Creating test query...
4. Performing test query...

✅ Connection successful!

Sample document metadata:
chunk_index: 1.0
content: <text length: 1500 chars>
Preview:  start with sort of vectors and derivatives and chain rules and all of that stuff. So you should get ...
key_concepts: ['NLP', 'Word Vectors', 'Word2Vec']
lecture_number: 1.0
lecture_title: Natural Language Processing with Deep Learning
page_content: <text length: 1500 chars>
Preview:  start with sort of vectors and derivatives and chain rules and all of that stuff. So you should get ...


In [None]:
import gradio as gr
from typing import List
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.memory import ConversationBufferWindowMemory
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from pinecone import Pinecone

# --- OpenAI clients ----------------------------------------------------------
# Both clients use the same key; a missing variable raises KeyError right here.
openai_key = os.environ["OPENAI_API_KEY"]

embeddings = OpenAIEmbeddings(
    openai_api_key=openai_key,
    model="text-embedding-ada-002",
)

llm = ChatOpenAI(
    openai_api_key=openai_key,
    model="gpt-4o-mini",
    temperature=0,  # temperature pinned to 0 for the answering model
)

# --- Pinecone vector store ---------------------------------------------------
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

# Chunk text is read from the "content" metadata field. The run log in this
# notebook shows some vectors lack that field and are skipped by the retriever.
vector_store = PineconeVectorStore(
    text_key="content",
    embedding=embeddings,
    index=index,
)

# --- Conversation memory -----------------------------------------------------
# Windowed buffer with k=3. output_key tells the memory which chain output to
# record, since the chain also returns the source documents.
memory = ConversationBufferWindowMemory(
    k=3,
    memory_key="chat_history",
    output_key="answer",
    return_messages=True,
)

# --- Prompt ------------------------------------------------------------------
# Instructs the model to answer strictly from the retrieved context and to use
# a fixed refusal sentence otherwise (chat() matches on that sentence).
custom_template = """
You are a helpful teaching assistant for a course on Computational Linguistics.
You must ONLY answer using information from the provided context. 
If the context doesn't contain enough information to answer the question properly, respond with "I can't find specific information about this in the course materials."
Do not use any external knowledge or make assumptions beyond what's in the context.

Current conversation:
{chat_history}

Context from course materials:
{context}

Instructions:
1. Check if the context contains relevant information to answer the question
2. If sufficient information exists:
   - Connect your answer to specific lectures
   - Use only information from the provided context
   - Include key concepts mentioned in the context
3. If insufficient information exists:
   - Respond with "I can't find specific information about this in the course materials."
4. Never make up or infer information not present in the context

Student Question: {question}

Teaching Assistant Answer:"""

CUSTOM_PROMPT = PromptTemplate(
    input_variables=["context", "question", "chat_history"],
    template=custom_template,
)

# --- Retrieval chain ---------------------------------------------------------
# Retrieve the 8 nearest chunks per question and hand them to the prompt above.
retriever = vector_store.as_retriever(search_kwargs={"k": 8})

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": CUSTOM_PROMPT},
)

def _format_lecture_number(lecture_num) -> str:
    """Render a lecture number for display without a spurious trailing ``.0``.

    The sample metadata printed by the connection check shows
    ``lecture_number: 1.0``, i.e. the value arrives as a float; without this
    the citation would read "Lecture 1.0".
    """
    if isinstance(lecture_num, float) and lecture_num.is_integer():
        return str(int(lecture_num))
    return str(lecture_num)


def _lecture_sort_key(lecture_num, label: str) -> tuple:
    """Sort key that orders citations numerically (Lecture 2 before Lecture 10).

    Values that cannot be read as a number sort after the numeric ones,
    alphabetically by their label.
    """
    try:
        return (0, float(lecture_num), label)
    except (TypeError, ValueError, OverflowError):
        return (1, 0.0, label)


def _extract_concepts(raw_concepts) -> set:
    """Normalise a ``key_concepts`` metadata value into a set of strings.

    A bare string is treated as one concept (``set.update`` on a string would
    otherwise split it into characters); anything that is not a string or a
    list/tuple/set yields no concepts.
    """
    if isinstance(raw_concepts, str):
        raw_concepts = [raw_concepts]
    elif not isinstance(raw_concepts, (list, tuple, set, frozenset)):
        return set()
    return {str(item) for item in raw_concepts if item not in (None, "")}


def chat(message: str) -> str:
    """Answer one student question using the retrieval chain.

    Args:
        message: The question typed by the user.

    Returns:
        The model's answer, followed (when available) by a "Sources" list of
        the lectures the retrieved chunks came from and a "Key Concepts" line.
        Returns a fixed prompt for empty input, the fixed "can't find" sentence
        when nothing was retrieved or the model declined, and an error message
        string if anything raises while answering. Never raises itself.
    """
    print(f"\nProcessing question: {message}")

    # Guard against None as well as blank input.
    if not message or not message.strip():
        return "Please enter a question."

    try:
        # .invoke is the supported entry point; calling the chain object
        # directly is deprecated in LangChain.
        response = qa_chain.invoke({"question": message})

        source_documents = response.get('source_documents')
        if not source_documents:
            return "I can't find specific information about this in the course materials."

        answer = response["answer"].strip()

        # The model may emit a typographic apostrophe; normalise before matching
        # the refusal sentence so no sources get attached to a refusal.
        if "can't find specific information" in answer.lower().replace("\u2019", "'"):
            return answer

        sources = {}  # display label -> sort key (dict also de-duplicates)
        concepts = set()

        for doc in source_documents:
            metadata = getattr(doc, 'metadata', None) or {}

            lecture_num = metadata.get('lecture_number')
            lecture_title = metadata.get('lecture_title')
            # Compare against None/"" rather than truthiness so lecture 0 is kept.
            if lecture_num not in (None, "") and lecture_title:
                label = f"Lecture {_format_lecture_number(lecture_num)}: {lecture_title}"
                sources[label] = _lecture_sort_key(lecture_num, label)

            concepts |= _extract_concepts(metadata.get('key_concepts'))

        formatted_response = answer
        if sources:
            ordered_sources = sorted(sources, key=sources.get)
            formatted_response += "\n\n📚 Sources:\n" + "\n".join(f"• {s}" for s in ordered_sources)
        if concepts:
            formatted_response += "\n\n🔑 Key Concepts:\n• " + ", ".join(sorted(concepts))

        # Debug information
        print(f"Found {len(sources)} relevant lectures")
        print(f"Found {len(concepts)} key concepts")

        return formatted_response

    except Exception as e:
        # Best-effort UI handler: surface the failure as text instead of crashing.
        print(f"Error: {str(e)}")
        return f"❌ Error: {str(e)}\nPlease try asking your question again."

# Build the Gradio UI: question box and buttons in the left column, the
# read-only answer box in the right column, example questions underneath.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Friendly header
    gr.Markdown("""
    # 🎓 Computational Linguistics Teaching Assistant
    
    Welcome! I'm your CS teaching assistant, trained on Stanford's lectures. 
    I can help you understand:
    - 📚 Word Vectors and Embeddings
    - 🧠 Neural Networks in NLP
    - 🤖 GloVe and Word Representations
    
    Ask me anything about these topics!
    """)

    # Main interface
    with gr.Row():
        with gr.Column(scale=1):
            question = gr.Textbox(
                placeholder="🌟 No question is too simple or too complex - I'm here to help!",
                label="Your Question",
                lines=3
            )
            with gr.Row():
                submit = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")

        with gr.Column(scale=2):
            answer = gr.Textbox(
                label="Assistant Response",
                lines=15,
                interactive=False
            )

    # Handle events
    def clear_fields():
        """Reset both textboxes and forget the conversation so far.

        Without clearing the chain's memory, a question asked after pressing
        "Clear" would still be answered in the context of the previous turns.
        """
        memory.clear()
        return "", ""

    # Both the Send button and pressing Enter in the textbox run chat().
    submit.click(fn=chat, inputs=question, outputs=answer)
    question.submit(fn=chat, inputs=question, outputs=answer)
    clear.click(fn=clear_fields, inputs=[], outputs=[question, answer])

    # Example Questions
    gr.Markdown("### 💡 Not sure where to start?")
    gr.Examples(
        examples=[
            "🤔 What is cross entropy?",
            "✨ How do neural networks work in NLP?",
            "🔍 What is GloVe and what problem does it solve?",
        ],
        inputs=question,
        label="Try these questions"
    )

# Launch the interface.
# NOTE(review): share=True publishes a public URL, and `memory` is a single
# module-level object, so all visitors share one conversation history.
demo.queue()
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://49388f5b7d6d092856.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





Processing question: what does NLP mean?


Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.


Found 1 relevant lectures
Found 3 key concepts

Processing question: What is cross entropy?
Found 3 relevant lectures
Found 9 key concepts

Processing question: How is it used in neural networks?
Found 3 relevant lectures
Found 9 key concepts

Processing question: What other loss functions are mentioned in the neural networks lecture?

Processing question: What is BERT?


Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.



Processing question: What methods for word representation ARE covered in the lectures?


Found document with no `content` key. Skipping.
Found document with no `content` key. Skipping.


Found 2 relevant lectures
Found 5 key concepts
