In [3]:
import warnings
warnings.filterwarnings('ignore')
from langchain_core.messages import SystemMessage, HumanMessage
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

_ = load_dotenv()

In [4]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

In [5]:
import time

index_name = "langchain-rag"  # change if desired


existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# if index_name in existing_indexes:
#     pc.delete_index(
#         name=index_name
#     )

if index_name not in existing_indexes:
    pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x2a306dad580>

### Load the Dataset

**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):

!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

!unzip all-the-news-3.zip

In [6]:
with open("../all-the-news-3.csv", 'r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [7]:
import pandas as pd

df = pd.read_csv('../all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [10]:
from tqdm.auto import tqdm, trange
from langchain_openai import OpenAIEmbeddings
import pandas as pd

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

CHUNK_SIZE = 400
TOTAL_ROWS = 10000
progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv('../all-the-news-3.csv', chunksize=CHUNK_SIZE, nrows=TOTAL_ROWS)

chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    print(titles)
    embeddings = embedding.embed_documents(titles)
    # print(len(embeddings[0]))  # Should print 1536 if you're using `text-embedding-3-small`
    prepped = [{'id': str(chunk_num * CHUNK_SIZE + i), 'values': embeddings[i],  # Use embeddings[i] directly
                'metadata': {'title': titles[i]}} for i in range(len(titles))]
    chunk_num += 1
    if len(prepped) >= 200:
        index.upsert(prepped)
        prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/10000 [00:00<?, ?it/s]

['Exclusive - Malaysia seeks to lay multiple charges against ex-PM Najib over 1MDB: Mahathir', "How much does lithium cost? The industry can't seem to agree", 'The Tantalizing Photographic Figure at 1-54 Contemporary African Art Fair', "Hillary Was Great, but We Want a 'Fire and Fury' Audiobook by Cardi B", 'Johnny Manziel Club Hopping After Indictment', 'Scuba, Eats Everything and More in the DJ World React to the London Attacks', 'Betting the house: investors demand higher premiums for risky Australian mortgage bonds', 'Israel’s public security minister blames Facebook after recent West Bank attacks', 'You’ll Soon Be Able to Search for GIFs Directly On Twitter', 'Bus falls into gorge in Indian Kashmir, killing at least 33', "Exclusive: Conoco moves to take over Venezuelan PDVSA's Caribbean assets - sources", 'Martin Parr Photographs Real, Sad Food', 'Twitter isn’t hosting its annual developers conference, Flight, this year', 'Midterm elections 2018: Russia trying to influence vote, U

In [9]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}


In [15]:
from langchain_pinecone import PineconeVectorStore

def get_recommendations(pinecone_index, search_term, top_k=2):
    embed = embedding.embed_query(search_term)
    res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)
    return res


reco = get_recommendations(index, 'big Israel speech')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.547139764 : "Kerry has finally decided to take no for an answer": why he gave his big Israel speech
0.524162531 : Israel to attend U.S.-led Palestinian conference


In [17]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(name=index_name)

pc.create_index(name=index_name, dimension=1536, metric='cosine',
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
articles_index = pc.Index(index_name)

In [24]:
def embed(embeddings, title, prepped, embed_num):
    for embedding in embeddings:
        prepped.append({'id':str(embed_num), 'values':embedding, 'metadata':{'title':title}})
        embed_num += 1
        if len(prepped) >= 100:
            articles_index.upsert(prepped)
            prepped.clear()
    return embed_num

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

news_data_rows_num = 100

embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, 
    chunk_overlap=20) # how to chunk each article
prepped = []
df = pd.read_csv('../all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for i in range(0, len(articles_list)):
    print(".",end="")
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art, str):
        texts = text_splitter.split_text(art)
        embeddings = embedding.embed_documents(texts)
        embed_num = embed(embeddings, title, prepped, embed_num)

....................................................................................................

In [26]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [32]:
reco = get_recommendations(articles_index, 'big Israel speech', top_k=50)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score} : {title}')
        seen[title] = '.'

0.403875828 : Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.317938417 : It's 2018, and these white supremacists are running for office
0.290209 : Trump tells anti-abortion marchers he will support them
0.270024717 : The government official in charge of ethics just harshly condemned Trump’s plan
0.268210411 : “Elizabeth Warren called me!” is turning into a Twitter meme
0.259197772 : How one woman used fashion to reclaim her Muslim American identity
0.248234868 : Video of Deontay Wilder Squaring Off with Contender in Hotel Lobby
0.246738791 : How the Clinton campaign is making #ThatMexicanThing a thing, explained
0.240854725 : Trump keeping options open as Republican feud rages
0.238033742 : How love and marriage are changing, according to 63,000 New York Times wedding announcements
0.236377329 : IEA concerned about Middle East tensions, stands ready to act
0.231086433 : Jermaine Jackson Rips Quincy Jones For Scrubbing Michael's Name From Show
0.230295628 :