In [None]:

! pip install -qU pinecone-client langchain_community cohere

In [None]:
import pandas as pd

# Load the counsel-chat CSV into a DataFrame.
df = pd.read_csv(
    '../data/20200325_counsel_chat.csv',
    encoding='utf-8-sig',  # UTF-8 that also strips a leading byte-order mark
)

# Print column names, dtypes and non-null counts for a quick sanity check.
df.info()

In [None]:

# Preview the first five rows (rendered as the cell's output).
df.head(n=5)

In [None]:
import os
import getpass
# Name of the Pinecone index used throughout the notebook.
index_name = 'ai-agent'

# Prompt for the API keys without echoing them, so they never end up in the
# notebook source or its saved output.
pinecone_secret_key = getpass.getpass('Enter Pinecone secret key:')

# Also expose the Pinecone key through the process environment
# (presumably for libraries that read PINECONE_API_KEY themselves; verify).
os.environ['PINECONE_API_KEY'] = pinecone_secret_key

cohere_secret_key = getpass.getpass('Enter Cohere secret key:')

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key entered earlier.
pc = Pinecone(api_key=pinecone_secret_key)

# Only create the index when it does not exist yet, so re-running this cell
# does not attempt to create it a second time.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors written to the index.
        # NOTE(review): assumes the Cohere model in use emits 4096-dim vectors; confirm.
        dimension=4096,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )

# Display the index description as the cell's output.
pc.describe_index(index_name)

In [None]:

from langchain.schema import Document
from langchain.embeddings import CohereEmbeddings
from langchain_community.vectorstores import Pinecone as Pinecone_Langchain
import time

In [None]:
# Cohere embedding client that the upload cells pass to the vector store.
embeddings = CohereEmbeddings(
    cohere_api_key=cohere_secret_key,
    user_agent=index_name,  # the index name doubles as the user-agent string
)

In [None]:
# Prepare documents: one Document per row of `df`.
#
# page_content is the question text and the answer text joined by a newline;
# a missing (NaN) text field contributes an empty string.
#
# Metadata is sanitised the same way. `row.get(column, default)` only falls
# back to `default` when the column itself is absent; an empty CSV cell still
# comes back as NaN, and null/NaN metadata values are not accepted by Pinecone.
# Every NaN is therefore replaced by the field's default before upload.

# (metadata key, DataFrame column, default used when the value is missing)
metadata_columns = [
    ('question title', 'questionTitle', ''),
    ('question link', 'questionLink', ''),
    ('topic', 'topic', ''),
    ('therapist info', 'therapistInfo', ''),
    ('therapist URL', 'therapistURL', ''),
    ('upvotes', 'upvotes', 0),
    ('views', 'views', 0),
]

documents = []
for _, row in df.iterrows():
    question_text = row['questionText'] if pd.notna(row['questionText']) else ''
    answer_text = row['answerText'] if pd.notna(row['answerText']) else ''
    text = question_text + "\n" + answer_text

    metadata = {}
    for key, column, default in metadata_columns:
        value = row.get(column, default)
        metadata[key] = default if pd.isna(value) else value

    documents.append(Document(page_content=text, metadata=metadata))

# Upload `documents` to the Pinecone index in batches to avoid hitting
# embedding rate limits.
BATCH_SIZE = 50  # adjust depending on your quota
vector_store = None

# Ceiling division, so an exact multiple of BATCH_SIZE is not over-counted.
total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE
failed_batches = []  # 1-based numbers of the batches that raised

for batch_number, start in enumerate(range(0, len(documents), BATCH_SIZE), start=1):
    batch_docs = documents[start:start + BATCH_SIZE]

    print(f"Processing batch {batch_number} of {total_batches}...")

    try:
        # Create the vector store with the first batch that succeeds, then
        # append to it. Testing `vector_store is None` (instead of "is this the
        # first batch?") means a failure of batch 1 no longer leaves the store
        # unset and every later batch failing with an AttributeError on None.
        if vector_store is None:
            vector_store = Pinecone_Langchain.from_documents(batch_docs, embeddings, index_name=index_name)
        else:
            vector_store.add_documents(batch_docs)
    except Exception as e:
        # Best effort: record the failure and carry on with the next batch.
        # Failed batches are NOT retried; they are reported after the loop.
        failed_batches.append(batch_number)
        print(f"Error in batch {batch_number}: {str(e)}")

    # Wait to respect rate limits (you can adjust sleep time based on your plan).
    # Kept after the final batch too, so a following upload cell does not start
    # calling the embedding API immediately.
    time.sleep(10)

# Only claim success when every batch actually went through.
if failed_batches:
    print(f"⚠️ Finished with {len(failed_batches)} of {total_batches} batches failed: {failed_batches}")
else:
    print("✅ Done uploading all documents.")

In [None]:


# Read the second dataset into its own DataFrame.
df_2 = pd.read_csv(
    '../data/train.csv',
)

# Print the column index; the document-building code that follows looks up
# columns by name, so adjust those names if they differ from what is shown here.
print(
    "Columns in dataset:",
    df_2.columns,
)

# Prepare documents from dataset: one Document per row of `df_2`.
#
# page_content is "<question>\n<response>"; either part is skipped when the
# column is absent or the cell is NaN.
#
# Metadata values get the same NaN guard. `row.get(field, "")` only returns ""
# when the column is absent; an empty CSV cell still comes back as NaN, and
# null/NaN metadata values are not accepted by Pinecone.

# Columns copied into each document's metadata.
# Add more here if your dataset has other fields.
metadata_fields = ("category", "sub_category", "difficulty")

documents = []
for _, row in df_2.iterrows():
    # Customize page content based on your actual dataset.
    page_content = ""
    if pd.notna(row.get("question")):
        page_content += row["question"] + "\n"
    if pd.notna(row.get("response")):
        page_content += row["response"]

    metadata = {}
    for field in metadata_fields:
        value = row.get(field, "")
        metadata[field] = "" if pd.isna(value) else value

    documents.append(Document(page_content=page_content, metadata=metadata))

# Upload `documents` to the Pinecone index in batches.
BATCH_SIZE = 50
vector_store = None

# Ceiling division: number of batches needed to cover every document.
total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE
failed_batches = []  # 1-based numbers of the batches that raised

for batch_number, start in enumerate(range(0, len(documents), BATCH_SIZE), start=1):
    batch_docs = documents[start:start + BATCH_SIZE]

    print(f"Uploading batch {batch_number} / {total_batches}")

    try:
        # Create the vector store with the first batch that succeeds, then
        # append to it. Keying on `vector_store is None` rather than on the
        # batch position keeps one failed first batch from turning every
        # later batch into an AttributeError on None.
        if vector_store is None:
            vector_store = Pinecone_Langchain.from_documents(batch_docs, embeddings, index_name=index_name)
        else:
            vector_store.add_documents(batch_docs)
    except Exception as e:
        # Best effort: record the failure and continue; failed batches are not
        # retried, they are reported after the loop.
        failed_batches.append(batch_number)
        print(f"⚠️ Error during batch {batch_number}: {str(e)}")

    time.sleep(10)  # avoid hitting Cohere rate limits

# Only claim success when every batch actually went through.
if failed_batches:
    print(f"⚠️ Finished with {len(failed_batches)} of {total_batches} batches failed: {failed_batches}")
else:
    print("✅ All documents uploaded to Pinecone!")
