In [1]:
import warnings
warnings.filterwarnings('ignore')
from langchain_core.messages import SystemMessage, HumanMessage
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

_ = load_dotenv()

In [2]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

In [3]:
import time

index_name = "langchain-rag"  # change if desired


existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# if index_name in existing_indexes:
#     pc.delete_index(
#         name=index_name
#     )

if index_name not in existing_indexes:
    pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x232949fa660>

In [4]:
with open("../all-the-news-3.csv", 'r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [5]:
import pandas as pd

df = pd.read_csv('../all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [7]:
from tqdm.auto import tqdm, trange
from langchain_openai import OpenAIEmbeddings
import pandas as pd

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

CHUNK_SIZE = 400
TOTAL_ROWS = 10000
progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv('../all-the-news-3.csv', chunksize=CHUNK_SIZE, nrows=TOTAL_ROWS)

chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    # print(titles)
    embeddings = embedding.embed_documents(titles)
    # print(len(embeddings[0]))  # Should print 1536 if you're using `text-embedding-3-small`
    prepped = [{'id': str(chunk_num * CHUNK_SIZE + i), 'values': embeddings[i],  # Use embeddings[i] directly
                'metadata': {'title': titles[i]}} for i in range(len(titles))]
    chunk_num += 1
    if len(prepped) >= 200:
        index.upsert(prepped)
        prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/10000 [00:00<?, ?it/s]

APIConnectionError: Connection error.

In [30]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}


In [31]:
from langchain_pinecone import PineconeVectorStore

def get_recommendations(pinecone_index, search_term, top_k=2):
    pinecone = PineconeVectorStore(index=pinecone_index, embedding=embedding)
    results = pinecone.similarity_search(
    query=search_term,
    k=top_k,
)
    return results
reco = get_recommendations(index, 'obama')
reco

Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.


[]