# Import libraries

In [4]:
import pandas as pd
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.schema import Document

from datasets import load_dataset

import pinecone

from dotenv import load_dotenv

In [5]:
def upsert_docs(docs_to_upsert, PINECONE_INDEX=None, namespace=None):
    """Embed documents with OpenAI and upsert them into a Pinecone index.

    The index is created (dimension 1536, cosine metric, one ``p1.x1`` pod)
    when it is not already listed for the configured Pinecone project.
    Index information and statistics are printed before and after the upsert.

    Args:
        docs_to_upsert: Documents to embed and store; passed unchanged to
            ``Pinecone.from_documents``.
        PINECONE_INDEX: Name of the target Pinecone index. Required despite
            the ``None`` default, which is kept for backward compatibility.
        namespace: Optional Pinecone namespace to write into. ``None`` leaves
            the choice to the vector store's default.

    Returns:
        ``True`` once the upsert call has completed.

    Raises:
        ValueError: If ``PINECONE_INDEX`` is ``None`` or empty.
    """
    # Fail fast with a clear message instead of sending ``None`` to the
    # Pinecone API as an index name.
    if not PINECONE_INDEX:
        raise ValueError("PINECONE_INDEX must be the name of a Pinecone index")

    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_ENVIRONMENT,
    )

    if PINECONE_INDEX not in pinecone.list_indexes():
        # 1536 matches the index dimension reported for this embedding setup.
        pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine", pods=1, pod_type="p1.x1")

    print(pinecone.list_indexes())
    print(pinecone.describe_index(PINECONE_INDEX))

    index = pinecone.Index(PINECONE_INDEX)
    print(index.describe_index_stats())

    embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002",
                                          disallowed_special=())

    # Forward the caller's namespace; it used to be silently ignored.
    Pinecone.from_documents(
        documents=docs_to_upsert,
        embedding=embedding_function,
        index_name=PINECONE_INDEX,
        namespace=namespace,
    )

    # NOTE(review): the stats printed here may not yet include the new
    # vectors (a later stats call in this notebook shows the updated count).
    print(index.describe_index_stats())

    return True

def get_index_stats(PINECONE_INDEX):
    """Print the statistics of the Pinecone index named ``PINECONE_INDEX``.

    Initialises the Pinecone client from the module-level API key and
    environment, then prints the result of ``describe_index_stats()``.

    Returns:
        Always ``True``.
    """
    credentials = {
        "api_key": PINECONE_API_KEY,
        "environment": PINECONE_ENVIRONMENT,
    }
    pinecone.init(**credentials)

    stats = pinecone.Index(PINECONE_INDEX).describe_index_stats()
    print(stats)

    return True

def delete_namespace(PINECONE_INDEX, namespace):
    """Delete every vector stored under ``namespace`` in a Pinecone index.

    Initialises the Pinecone client from the module-level API key and
    environment, then issues a ``delete_all`` request for the namespace.

    Args:
        PINECONE_INDEX: Name of the index to operate on.
        namespace: Namespace whose vectors are removed.

    Returns:
        Always ``True``.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

    target_index = pinecone.Index(PINECONE_INDEX)
    target_index.delete(namespace=namespace, delete_all=True)

    return True

# Load env variables

In [6]:
# Load settings from a .env file (via python-dotenv) into the environment,
# then read the keys this notebook needs. Missing variables come back as None.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX")

# Load docs

In [None]:
# Read the pre-processed records as JSON Lines (one JSON object per line)
# and wrap each row's `content` / `metadata` fields in a LangChain Document.
df_docs = pd.read_json(path_or_buf="../pre-processed/RegleGen.json", lines=True)

docs_to_upsert = [
    Document(page_content=record.content, metadata=record.metadata)
    for _, record in df_docs.iterrows()
]

In [8]:
# Read the pre-processed records as JSON Lines (one JSON object per line)
# and wrap each row's `content` / `metadata` fields in a LangChain Document.
# This rebinds `df_docs` and `docs_to_upsert`, replacing earlier values.
df_docs = pd.read_json(path_or_buf="../pre-processed/unige.jsonl", lines=True)

docs_to_upsert = [
    Document(page_content=record.content, metadata=record.metadata)
    for _, record in df_docs.iterrows()
]

# Upsert docs

In [19]:
# Embed the loaded documents and upsert them into the index named by PINECONE_INDEX.
upsert_docs(docs_to_upsert, PINECONE_INDEX=PINECONE_INDEX)

['hackathon']
IndexDescription(name='hackathon', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


True

In [21]:
# Print the index statistics to check the vector count after the upsert.
get_index_stats(PINECONE_INDEX)

{'dimension': 1536,
 'index_fullness': 0.00241,
 'namespaces': {'': {'vector_count': 241}},
 'total_vector_count': 241}


True

In [17]:
# Cleanup (disabled on purpose): uncomment the lines below to delete the whole index.
#pinecone.init(
#    api_key=PINECONE_API_KEY,
#    environment=PINECONE_ENVIRONMENT,
#)

#pinecone.delete_index(PINECONE_INDEX)