In [1]:
import os
import json5
from pinecone import Pinecone, ServerlessSpec
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

import boto3
from botocore.exceptions import NoCredentialsError, ClientError

from config.config_helper import pinecone_api_key, openai_api_key
import yaml

from datetime import datetime

MAX_TOKENS = 4096
# Initialize tokenizer for the embedding model
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")       


In [2]:
def load_config(config_file):
    # Load configuration from the YAML file
    with open(config_file, 'r') as file:
        config = yaml.safe_load(file)
    return config

In [3]:
def init_pincone(config):
    """Initialize Pincone API from configuration."""
    
    api_key = config['pinecone']['pinecone_api_key']
    pc = Pinecone(api_key)
    
    return pc

In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding

def init_openai_embedding(config):
    """Initialize OpenAI API for embeddings."""
    
    # Get the OpenAI API key from the configuration
    api_key = config['openai']['chat_gpt_key']
    
    # Initialize OpenAIEmbedding with the correct parameters
    client = OpenAIEmbedding(
        model="text-embedding-ada-002",  # Specify the embedding model
        api_key=api_key  # Pass the API key
    )
    
    return client

In [5]:
# Function to split text into chunks
def chunk_text(text, max_tokens=MAX_TOKENS, overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_tokens,
        chunk_overlap=overlap,
        
        # separators=["\n\n", "\n", ".", " "]
        separators=["\n\n", "\n"]
    )
    return text_splitter.split_text(text)


# Function to clean metadata
def clean_metadata(metadata):
    cleaned_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (str, int, float, bool)):
            cleaned_metadata[key] = value
        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
            cleaned_metadata[key] = value
    return cleaned_metadata


# Create and configure Pinecone index
def initialize_index(pc, index_name="biullmindex"):
    if index_name in pc.list_indexes().names():
        pc.delete_index(name=index_name)
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    return pc.Index(index_name)


# Function to add documents to Pinecone
def add_documents_to_pinecone(pc, documents, pinecone_index):
    for i, doc in enumerate(documents):
        metadata = clean_metadata(doc.metadata)
        if len(tokenizer.encode(doc.text)) > MAX_TOKENS:
            chunks = chunk_text(doc.text)
            for j, chunk in enumerate(chunks):
                chunk_id = f"doc-{i}-chunk-{j}"
                embedding = client.get_text_embedding(chunk)
                pinecone_index.upsert(
                    vectors=[(chunk_id, embedding, {**metadata, "chunk_id": chunk_id})]
                )
        else:
            embedding = client.get_text_embedding(doc.text)
            pinecone_index.upsert(
                vectors=[(f"doc-{i}", embedding, metadata)]
            )


# Main function to populate the database
def populate_vector_db(config, data_path, index_name="biullmindex"):

    # Initialize Pinecone client
    pc = init_pincone (config)

    # Initialize OpenAI client
    client = init_openai_embedding(config)
    
    pinecone_index = initialize_index(index_name)
    
    documents = []
    for filename in os.listdir(data_path):
        if filename.endswith(".json"):
            file_path = os.path.join(data_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                text_content = data.get('full_text', '')
                
                metadata_keys = ['course_name', 'summary']
                metadata = {key: value for key, value in data.items() if key in metadata_keys}
           
                if text_content:
                    documents.append(Document(text=text_content, metadata=metadata))
                    
    add_documents_to_pinecone(pc, documents, pinecone_index)

In [6]:
def main():
    """
    Build Embedding Vector DB
    """    
    
    cwd =os.getcwd()
    print("Current working directory:", cwd)
    
    try:
        # Extract parameters from the configuration
        config_file = 'config/config.yaml'
        config = load_config(config_file)
                
        # Json Documents on aws s3 - Production
        region_name = config['aws']['region_name']
        source_bucket_name = config['aws']['txt_extract_bucket_name']
        source_path = config['aws']['txt_extract_path']    
        log_file = 'get_json_sources'
        
        # Json Documents on aws s3 - Develop
        json_path = os.path.join(cwd, 'data','documents','eng')
        print("Jsons docs path:", json_path)

        populate_vector_db(config, json_path, index_name="biullmindex")      
    
    except Exception as e:
        print(f"Main process error: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Current working directory: C:\github_repos\BIU_LLM_Project
Jsons docs path: C:\github_repos\BIU_LLM_Project\data\documents\eng
Main process error: 'str' object has no attribute 'list_indexes'


AttributeError: 'str' object has no attribute 'list_indexes'