In [1]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

load_dotenv()

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

  from tqdm.autonotebook import tqdm


In [2]:
import time

index_name = "oncehuman-vector-store"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

# Embedding Models

In [34]:
from langchain_huggingface.embeddings.huggingface_endpoint import HuggingFaceEndpointEmbeddings
from langchain_pinecone import PineconeVectorStore

# this is selected because it has better balance between the performance and accuracy
embeddings_model = HuggingFaceEndpointEmbeddings(model='sentence-transformers/all-mpnet-base-v2')
vector_store = PineconeVectorStore(index=index, embedding=embeddings_model)


In [4]:
from langchain_core.documents import Document 
from datetime import datetime 

documents = [
    Document(
        page_content="Hi, this is my first time using Pinecone!",
        metadata={ 'created_at': datetime(2024, 1, 1).strftime('%Y-%m-%d') },
    )
]

ids = [
    '1'
]

vector_store.add_documents(documents=documents, ids=ids)

['1']

### Pulling Data from Pg and Load to PineCone (Will be automated using Airflow)

In [24]:
import pandas as pd 
from sqlalchemy import create_engine
from datetime import datetime 

USERNAME = 'reddit'
PASSWORD = 'reddit'
DB = 'reddit'
HOST = 'localhost'
PORT = '5432'

engine = create_engine(f'postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DB}')
doc_df = pd.read_sql_query(f'SELECT * FROM \"reddit_docs\" WHERE DATE_TRUNC(\'day\', created_at) IN (\'2024-10-14\', \'2024-10-15\');', con=engine)
post_df = pd.read_sql_query(f'SELECT * FROM \"reddit_posts\"', con=engine)
joined_df = pd.merge(doc_df, post_df, left_on='doc_id', right_on='id', how='left')


In [27]:
from langchain_core.documents import Document 

REDDIT_HOST = 'https://reddit.com'

joined_df['created_at_str'] = joined_df['created_at_x'].dt.strftime('%Y-%m-%d %H:%M:%S')
joined_df['source_url'] = REDDIT_HOST + joined_df['permalink']

documents = []
for index, row in joined_df.iterrows():
    documents.append(
        Document(
            page_content=row['document'],
            metadata={ 'created_at': row['created_at_str'], 'source': 'reddit', 'source_url': row['source_url'], 'id': row['doc_id'] },
        )
    )

documents[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'source_url': 'https://reddit.com/r/OnceHumanOfficial/comments/1g2ugvf/im_one_of_the_people_who_got_banned_recently_for/', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell o

In [28]:
print("Total documents count:", len(documents))

Total documents count: 224


In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 2048 characters is roughly 384 words
splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0)
docs = splitter.split_documents(documents=documents)
print("Total chunked doc count:", len(docs))

Total chunked doc count: 897


In [30]:
docs[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'source_url': 'https://reddit.com/r/OnceHumanOfficial/comments/1g2ugvf/im_one_of_the_people_who_got_banned_recently_for/', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell o

In [31]:
from uuid import uuid4

ids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=ids)

['00a20330-2160-4ab9-b483-046d741e8f44',
 'c3495e4a-8d3e-4b2b-a449-77d500978a4d',
 'fbecb596-f075-4536-8cfb-0a9a34fc1470',
 '33d5f24b-7d9f-4314-8642-ae78ce86f6e1',
 '2f295e4c-44d3-44b8-a52e-d5aadd243646',
 '8e0d3de5-b764-4cd0-851a-c8b064d58ba0',
 '06d751a6-0b94-48a3-82d9-b8e7bf5ce93e',
 '8591caa4-d113-4317-ad73-8d8304568486',
 'a80c43a0-7444-4f28-a12f-3286be53891f',
 'b8c3c0ee-2504-41fa-8251-20f72fa83dfa',
 '17abe11f-dd4a-419c-aa62-20fef0840e0f',
 '6ba12d00-c895-4892-94cd-23190b66f21e',
 '56a03509-16ae-4656-b851-a05ed84f4e60',
 '3aded414-90c3-41d4-aa88-8f40da96512e',
 'c01f091a-8ff9-4f68-b0bc-d111e2f87d2c',
 '2399a761-8865-4a01-a50e-fa1e85b84054',
 'a899e982-15f4-4810-b7a2-536c5582ef41',
 '20330319-91d2-4cee-b822-1401949f849e',
 '2edf5ed5-be07-4a86-a27d-60975ad23711',
 '4ac9ca36-7e27-4f00-a10b-cfa79f3b9854',
 '763ff566-00e1-4807-b33a-bcd6be4fe513',
 '0d8d274e-3845-47ca-acf5-696d0482bcf8',
 '342e4c50-1701-48a8-83a6-3d4fec2b97a9',
 '2c6deb5c-b8bc-407f-bd70-4c7f07f19f03',
 '3eda874c-a37b-

In [32]:
vector_store.search(query='NGL my vehicle has done some amazing feats', search_type='mmr', k=5, fetch_k=20)

[Document(metadata={'created_at': '2024-09-18 16:17:46', 'id': '415fbfa4-a5cc-41ba-9b40-2bbdd2d2ed86', 'source': 'OnceHuman Wiki', 'source_url': 'https://once-human.fandom.com/wiki/Vehicle'}, page_content="Durability: 9,000 (last \\+ 900\\)\r\nSpeed: 2\\.8 (last \\+ 0\\.1\\)\r\nHandling: 4\\.8 (last \\+ 0\\.2\\)\r\nGas Tank Size: 13,000 (last \\+ 3,000\\)\r\nRetro Coupe\\[]\r\n--------------\r\nThe Retro Coupe is a beefy car fit for an apocalypse. Resembling a 1965\\-1970 Ford Mustang, this muscle car can be unlocked after unlocking Four\\-Wheeler in the Memetics under Building. The capacity takes 2 players (including the driver) with the passenger seat.\r\n**Tier One**\r\nStats:\r\nDurability: 4,200\r\nSpeed: 3\\.5\r\nHandling: 2\\.6\r\nGas Tank Size: 6,000\r\nTrailer Head\\[]\r\n---------------\r\nThe Trailer Head is a cargo truck that you can use to (as far as the game's trailer goes) make your very own motor home! This monster truck can be unlocked after creating a hive garage and 

### Loading OnceHuman Wiki data to PineCone

In [4]:
import pandas as pd

wiki_df = pd.read_json('data/data.json')
wiki_df.head(5)

Unnamed: 0,date_scraped,title,source_url,text
0,2024-09-18 16:17:40,Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Main_Page,**Welcome to the Once Human Wiki**\[]\r\n=====...
1,2024-09-18 16:17:41,Once Human | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Once_Human,**BACKGROUND STORY**\[]\r\n-------------------...
2,2024-09-18 16:17:43,Wish Machine | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Wish_Machine,"> ""Behold, mortal, before you is the great wil..."
3,2024-09-18 16:17:44,Memetics | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Memetics,Memetics grant Formulas that players can use t...
4,2024-09-18 16:17:45,Boss & Monsters | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Boss_%26_Mo...,---\r\n**NPCs**\[]\r\n===========\r\n[![Veroni...


In [12]:
from langchain_core.documents import Document 
from uuid import uuid4 

wiki_df['document'] = '### ' + wiki_df['title'] + '\n\n' + wiki_df['text']
wiki_df['id'] = [str(uuid4()) for _ in range(wiki_df.shape[0])]

documents = []
for index, row in wiki_df.iterrows():
    documents.append(
        Document(
            page_content=row['document'],
            metadata={ 'created_at': row['date_scraped'], 'source': 'OnceHuman Wiki', 'source_url': row['source_url'], 'id': row['id'] },
        )
    )

documents[:5]

 Document(metadata={'created_at': '2024-09-18 16:17:41', 'source': 'OnceHuman Wiki', 'source_url': 'https://once-human.fandom.com/wiki/Once_Human', 'id': '62505df2-5527-429e-ab9b-9c4dbf58b158'}, page_content='### Once Human | Once Human Wiki | Fandom\n\n**BACKGROUND STORY**\\[]\r\n-----------------------\r\nThere have been countless organizations secretly researching the origin of species and civilizations in human history. The most successful organization in modern times is the "Rosetta", whose predecessor was the Rosetta Institute established in 1967 by French human historian Professor Rachael·Pierre·Luberni(Professor Luberni discovered the Rosetta Stone). Rosetta was dedicated to the study of pre\\-human civilization and pan\\-cosmology. Then it gradually became a multinational giant in the communication and information industry in public perception, and secretly extended all around the world. Professor Luberni devoted himself to the research on the evolution of ancient human civili

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0)
docs = splitter.split_documents(documents=documents)

ids = ids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=ids)

['44a65c9e-8f0c-43d3-b973-dca5056995eb',
 'd7529746-e369-4c91-a71a-a21f451da177',
 '164bf692-4c2e-4063-b173-8ff9f293c2b6',
 '8e6c3cf4-1022-4257-beb5-a5e15aa5ea9a',
 'cc717eb7-2a7a-41c9-ab49-96bf8b5203af',
 'e6e4f368-203a-4833-bc22-3f939eacfba6',
 '271c91cc-7989-4d3d-9c23-2418876f4b05',
 '4ea7b7d0-7901-4022-bc13-89bff63faf43',
 'af32b0e9-1697-465b-a37d-fa45ee8c2a77',
 '9b92761e-001f-4e24-94bc-5659f2c6efb8',
 '207dd85d-1e87-4d87-bbb4-52636216a497',
 '81f89232-0c2c-4101-afea-87261ca4055b',
 'eae575eb-93c8-45dc-b2a4-5874d8833640',
 '5085d45d-a0c1-4b7d-b9d9-2f929a1eb2f5',
 '0198fe57-d0a3-40ce-983e-ffdd39ae7646',
 'c69a687b-79e7-4dac-b9f4-26f92fef8a56',
 '7978d96f-51ec-4a5d-b322-850537536a45',
 '90822fda-f749-4610-a037-c0e30a8faed8',
 '90db9873-fd9c-480f-a187-df1b37035aa4',
 'a2c0c31a-8a4c-40ba-bf80-ba2ece11e37b',
 '0236bd5f-4281-446d-9638-3c88337cf9a6',
 '5b2d1554-4efc-47bf-a93a-6b2bffb641c3',
 '0bc9d070-10e3-4aea-bcad-d1494364157f',
 '9db2f1f5-08f9-4a6e-9dbe-560aa21baf61',
 '37124057-849e-