# Text Preprocessing and Embeddings

In [1]:
import os
import psycopg2
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from tqdm import tqdm

## Load environmental variables

In [None]:
if not load_dotenv(".env"):
    print("An error has occurred. Make sure the file exists and is readable")
else:
    print("Loading successful")

Loading successful


## Establish connection with PostgreSQL

In [3]:
def establish_connection():
    try:
        conn = psycopg2.connect(
            dbname= os.getenv("POSTGRES_DBNAME"),
            user= os.getenv("POSTGRES_USER"),
            password= os.getenv("POSTGRES_PASSWORD"),
            host= os.getenv("POSTGRES_HOST"),
            port= os.getenv("POSTGRES_PORT")
        )
        print("Success in connecting to PostgreSQL DB")
        cur = conn.cursor()

        # Check for 'passages' table
        cur.execute("""
            SELECT EXISTS (
                SELECT 1 FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name = 'passages'
            );
        """)
        table_exists_result = cur.fetchone()

        if table_exists_result is None:
            raise Exception("Database did not return a result for passages table existence check.")
        table_exists = table_exists_result[0]


        if not table_exists:
            print("Error: The 'passages' table does not exist in the database.")
            print("Please run the datapreprocessing script first to create and populate it.")
            if cur: cur.close()
            if conn: conn.close()
            return None, None
        else:
            # If table exists, check for 'status' column
            cur.execute("""
                SELECT EXISTS (
                    SELECT 1 FROM information_schema.columns
                    WHERE table_schema = 'public' AND table_name = 'passages' AND column_name = 'status'
                );
            """)
            column_exists_result = cur.fetchone()

            if column_exists_result is None:
                raise Exception("Database did not return a result for status column existence check.")
            column_exists = column_exists_result[0]


            if not column_exists:
                print("Adding 'status' column to existing 'passages' table.")
                cur.execute("ALTER TABLE passages ADD COLUMN status TEXT DEFAULT 'pending_embedding';")
        
                conn.commit()
                print("'status' column added.")

        return conn, cur

    except psycopg2.Error as e:
        print(f"A database error occurred during connection or setup check: {e}")

        if cur: 
            cur.close()
        if conn: 
            conn.close()

        return None, None
    
    except Exception as e: 
        print(f"An unexpected error occurred during DB connection/setup: {e}")

        if cur: 
            cur.close()
        if conn: 
            conn.close()
            
        return None, None

## Set up Pinecone

### Innitiate Pinecone, check for index, create one if needed

In [4]:
def setup_pinecone_index():
    pc = Pinecone(api_key = os.getenv("PINECONE_API_KEY"))
    index_name = os.getenv("PINECONE_INDEX_NAME")

    if not pc:
        print("An error has occurred. Pinecone's API key not found in environmental variables")
        return None
    
    try:
        print("Initialize Pinecone connection")
        print(f"\nChecking for Pinecone index '{index_name}")
        index_list = [i.name for i in pc.list_indexes()]

        # Check if the index exist
        if index_name not in index_list:
            print(f"Index '{index_name}' not found. Creating the index")
            pc.create_index(
                name=index_name,
                dimension=1024,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
                embed={
                    "model": "multilingual-e5-large",
                    "field_map": {"text": "text"}   
                }
            )
            print(f"Index '{index_name}' created successfully.")
        else:
            print(f"Index '{index_name}' already exists. Skipping creation.")

        return pc
    
    except Exception as e:
        print(f"An error has occurred during Pinecone setup or index creation: {e}")
        return None

## Data Fetching and Preparation

### Fetch, embed, and upsert texts from PostgreSQL onto Pinecone

In [None]:
def fetch_and_prepare_data(cur):
    print("Fetching data from PostgreSQL database for passages pending embedding.")
    vectors_to_upsert = []

    try:
        cur.execute("""
                    SELECT passage_id, title, text FROM passages
                    WHERE status = 'pending_embedding'
                    """)
            
        passages = cur.fetchall()
        print(f"Fetched {len(passages)} passages pending embedding from the Database")

    except psycopg2.Error as e:
        print(f"An error has occurred when fetching passages: {e}")
        return None
    
    print("Preparing vectors for upsert")
    skipped_count = 0
    if not passages:
        print("No passages found with status 'pending_embedding'.")
        return vectors_to_upsert

    for passage_id, title, text in passages:
        if not text or not text.strip():
            skipped_count += 1
            continue

        vector_id = f"passage-{passage_id}"
        metadata = {
            "passage_id": str(passage_id),
            "title": str(title),
            "text": str(text) 
        }

        vectors_to_upsert.append({
            "id": vector_id,
            "metadata": metadata
        })

    if skipped_count > 0:
        print(f"Skipped {skipped_count} pending passages because of empty text")
        
    print(f"Prepare {len(vectors_to_upsert)} vectors with non-empty text")
    return vectors_to_upsert

## Pinecone Batch Upsert 

### Upsert vectors to Pinecone index in batches

In [None]:
def batch_upsert(index, vectors, namespace, conn, cur, batch_size=480):
    if not vectors:
        print("No vectors to upsert")
        return True
    
    if not cur or not conn:
        print("Database cursor or connection not provided to batch_upsert. Cannot update status.")
        return False

    print(f"Uploading {len(vectors)} vector to Pinecone namespace '{namespace}' in batches of {batch_size}")
    
    all_successful = True
    for i in tqdm(range(0, len(vectors), batch_size)):
        batch_vectors = vectors[i:i + batch_size]
        
        batch_ids_to_update = []                    # Store original IDs for this batch
        pinecone_batch = []                         # Prepare batch and collect original IDs from metadata

        for vec in batch_vectors:
            pinecone_batch.append(vec)

            # Extract original passage ID from metadata for Database update
            try:
                original_passage_id = int(vec['metadata']['passage_id'])
                batch_ids_to_update.append(original_passage_id)

            except (KeyError, ValueError, TypeError) as e:
                print(f"Skipped empty batch starting at index {i}")
                continue
        
        # Attempt to upsert batch into Pinecone
        try:
            index.upsert(vectors=pinecone_batch, namespace=namespace)

            # If upsert is successful, attempt to update status in PostgreSQL
            if batch_ids_to_update:
                try:
                    update_query = "UPDATE passages SET status = 'embedded' WHERE passage_id = ANY(%s)"
                    cur.execute(update_query, (batch_ids_to_update,))
                    conn.commit()
                    
                except (psycopg2.Error, Exception) as db_e:
                    print(f"Error updating PostgreSQL status for batch starting at index {i} after successful upsert: {db_e}")
                    print(f"IDs attempted to update: {batch_ids_to_update}")
                    conn.rollback() 
                    all_successful = False

            else:
                 print(f"Warning: Upserted batch starting at index {i} but no valid passage IDs found to update status.")

        except Exception as pinecone_e:
            print(f"Error in batch upsert: {pinecone_e}")
            all_successful = False

    if all_successful:
        print(f"Successfully uploaded {len(vectors)} vectors onto Pinecone.")
    else:
        print("Batch upload process finished with errors.")

    return all_successful

## Main Execution Block

In [7]:
if __name__ == "__main__":
    conn, cur = establish_connection()
    print("Initialize text embedding process")

    if conn and cur:
        try:
            pinecone_client = setup_pinecone_index()
            if pinecone_client:
                index_name = os.getenv("PINECONE_INDEX_NAME")
                namespace = os.getenv("PINECONE_NAMESPACE")

                index = pinecone_client.Index(index_name)
                print(f"Connected to Pinecone index '{index_name}'.")

                # Fetch and Prepare Data for Pending Passages
                print(f"Fetching passages pending embedding from namespace '{namespace}'...")
                vectors = fetch_and_prepare_data(cur)
                
                # Upsertting data 
                if vectors is not None:
                    if vectors: 
                        print(f"Found {len(vectors)} vectors to upsert.")
                        batch_upsert(index, vectors, namespace, conn, cur)

                    else:
                        print(f"No passages found in status 'pending_embedding'. Nothing to upsert.")
                else:
                    print("An error has occurred. Skipping upsert because data fetching or preparation failed.")

            else: 
                 print("An error has occurred. Skipping main logic due to Pinecone setup failure.")

        # CCatches other potential errors
        except Exception as e:
            print(f"An unexpected error occurred during execution: {e}")

        # Cleanup Block (Always Executes if connection succeeded initially)
        finally:
            print("\nClosing database connection")
            try:
                if cur:
                    cur.close()
                    print("Database cursor closed.")
                if conn:
                    conn.close()
                    print("Database connection closed.")
            except (psycopg2.Error, Exception) as e:
                print(f"Error closing database connection: {e}")
    else:
        print("An error has occurred. Database connection could not be established during setup.")

    print("\nComplete the Text Embedding process.")

Success in connecting to PostgreSQL DB
Initialize text embedding process
Initialize Pinecone connection

Checking for Pinecone index 'ielts-rag
Index 'ielts-rag' already exists. Skipping creation.
Connected to Pinecone index 'ielts-rag'.
Fetching passages pending embedding from namespace 'ielts-passages'...
Fetching data from PostgreSQL database for passages pending embedding.
Fetched 2075 passages pending embedding from the Database
Preparing vectors for upsert
Prepare 2075 vectors with non-empty text
Found 2075 vectors to upsert.
Uploading 2075 vector to Pinecone namespace 'ielts-passages' in batches of 480


100%|██████████| 5/5 [00:00<00:00, 3458.36it/s]

Error in batch upsert: At least one of 'values' or 'sparse_values' must be provided in the vector dictionary.
Error in batch upsert: At least one of 'values' or 'sparse_values' must be provided in the vector dictionary.
Error in batch upsert: At least one of 'values' or 'sparse_values' must be provided in the vector dictionary.
Error in batch upsert: At least one of 'values' or 'sparse_values' must be provided in the vector dictionary.
Error in batch upsert: At least one of 'values' or 'sparse_values' must be provided in the vector dictionary.
Batch upload process finished with errors.

Closing database connection
Database cursor closed.
Database connection closed.

Complete the Text Embedding process.



