In [1]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

load_dotenv()

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

  from tqdm.autonotebook import tqdm


In [2]:
import time

index_name = "oncehuman-vector-store"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

# Embedding Models

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# this is selected because it has better balance between the performance and accuracy
embeddings_model = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
vector_store = PineconeVectorStore(index=index, embedding=embeddings_model)


In [4]:
from langchain_core.documents import Document 
from datetime import datetime 

documents = [
    Document(
        page_content="Hi, this is my first time using Pinecone!",
        metadata={ 'created_at': datetime(2024, 1, 1).strftime('%Y-%m-%d') },
    )
]

ids = [
    '1'
]

vector_store.add_documents(documents=documents, ids=ids)

['1']

# Pulling Data from Pg and Load to PineCone (Will be automated using Airflow)

In [4]:
import pandas as pd 
from sqlalchemy import create_engine
from datetime import datetime 

USERNAME = 'reddit'
PASSWORD = 'reddit'
DB = 'reddit'
HOST = 'localhost'
PORT = '5432'

today_sql = datetime.now().strftime('%Y-%m-%d')
engine = create_engine(f'postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DB}')
doc_df = pd.read_sql_query(f'SELECT * FROM \"reddit_docs\" WHERE DATE_TRUNC(\'day\', created_at) = \'{today_sql}\';', con=engine)


In [5]:
from langchain_core.documents import Document 

doc_df['created_at_str'] = doc_df['created_at'].dt.strftime('%Y-%m-%d %H:%M:%S')

documents = []
for index, row in doc_df.iterrows():
    documents.append(
        Document(
            page_content=row['document'],
            metadata={ 'created_at': row['created_at_str'], 'source': 'reddit', 'id': row['doc_id'] },
        )
    )

documents[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell of cheaters in PVP in that game. Honestly, I\'m not sure me or any my crew will ever return to this game, which is a real sh

In [6]:
print("Total documents count:", len(documents))

Total documents count: 191


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 2048 characters is roughly 384 words
splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0)
docs = splitter.split_documents(documents=documents)
print("Total chunked doc count:", len(docs))

Total chunked doc count: 824


In [8]:
docs[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell of cheaters in PVP in that game. Honestly, I\'m not sure me or any my crew will ever return to this game, which is a real sh

In [9]:
from uuid import uuid4

ids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=ids)

['86c89048-1a5b-418f-92f4-7c9abd7b979f',
 'e77e5415-7eb5-4432-9ae0-4182a8f3a7ff',
 'fae77c88-cf84-4c19-a3f5-9bab6903001f',
 'd5f63133-813f-4620-a52e-2787029b26f6',
 '31760b72-f952-4b18-9f40-26298f121311',
 '070f6163-3b4b-4385-9d66-953a3dddd0fb',
 '8b230978-e954-45eb-8eea-02c8022f92d7',
 '4cf12d66-cd13-4e5e-82a8-13b5bfba87c2',
 '9c90815d-33e3-46bb-8d14-5f3451bbcf94',
 '19a68d18-0ba5-4c7d-ade1-1a76b7fc02cc',
 'eabab8aa-2797-45ff-b590-e548d78ba9fa',
 '364c31b7-3afd-4bba-bda0-4f2639fc49b8',
 '888375c0-190d-416c-94d6-65a29b901a7e',
 '709783d2-2d3b-4906-bcf9-a98caf5bda4a',
 '5ee4754f-7c8a-4fe2-b6ac-23e031feed63',
 '6b1d8066-4757-4cc9-ab53-b0fc2a0a44d7',
 '76fe92f2-8b33-4670-b5a6-51785cf0d2cf',
 '55f79861-2216-4f49-a663-7373af45829c',
 'a5ea70c7-aa57-4647-805d-8b6908843d01',
 '30956db3-8948-4212-9024-ee2b64b5fbf1',
 '97bdee07-cb6b-4f0e-9b64-6cb0bb3cf30f',
 '4bbfcf0c-019e-425e-891b-80d59f5d31b0',
 '1433657e-dc09-45c2-a3f8-aa9e8ae102d8',
 '2e38d800-8fa8-41b7-bd3b-83a766c9b4fa',
 '8f4d0ece-59c1-

In [12]:
vector_store.search(query='NGL my vehicle has done some amazing feats', search_type='mmr', k=5, fetch_k=20)

[Document(metadata={'created_at': '2024-10-14 05:43:20', 'id': '1fpk0i1', 'source': 'reddit'}, page_content='NGL my vehicle has done some amazing feats bouncing off the environment but never that high and it’s good for a laugh but never been, however the fact you all touched the sky box is pretty alarming though I almost think the person knew about doing knew the risks and didn’t tell you guys as well and wouldn’t surprise me at all if you all aren’t the first people to be banned because of it\n\nPlay stupid games win stupid prizes?\n\n\nExploiting something in the game to make the vehicles do something it wasn\'t meant to do.\n\n\nBan seems heavyhanded, but y\'all literally gave them a reason. Offered yourselves up on a platter to the ban algorithm.\n\nI have also been banned for that hive Vehicle (My in game name is Valentera). I have been fighting with both the in game ban appeal and the discord ban appeal processes. All attempts have failed, I decided to go to reddit and saw this p