### Initializing Pinecone Object

In [1]:
import getpass
import os
import time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) before reading the key.
load_dotenv()

# Resolve the key once; fall back to an interactive prompt when it is missing
# or empty, and mirror the entered value into the process environment.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ")
    os.environ["PINECONE_API_KEY"] = pinecone_api_key

pc = Pinecone(api_key=pinecone_api_key)

  from tqdm.autonotebook import tqdm


In [2]:
import time

index_name = "oncehuman-vector-store"  # change if desired

# Vector size the index is created with. It has to equal the size of the
# vectors written to the index (presumably the output size of the embedding
# model configured later in this notebook; confirm if that model changes).
EMBEDDING_DIMENSION = 768
# Upper bound on how long to wait for a freshly created index to report ready.
INDEX_READY_TIMEOUT_SECONDS = 300

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index is ready, but give up after the timeout instead of
    # blocking the notebook forever if provisioning never completes.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECONDS
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after "
                f"{INDEX_READY_TIMEOUT_SECONDS} seconds"
            )
        time.sleep(1)

index = pc.Index(index_name)

### Connecting to Embedding Endpoint

In [3]:
from langchain_huggingface.embeddings.huggingface_endpoint import HuggingFaceEndpointEmbeddings
from langchain_pinecone import PineconeVectorStore

# Embedding client for the sentence-transformers model named below.
embedding_endpoint = HuggingFaceEndpointEmbeddings(
    model='sentence-transformers/all-mpnet-base-v2',
)

# Vector store backed by the Pinecone index created above, using that embedder.
vector_store = PineconeVectorStore(
    embedding=embedding_endpoint,
    index=index,
)

### Loading Data from Postgres and the Wiki

In [4]:
import pandas as pd 
from sqlalchemy import create_engine
from datetime import datetime 

from urllib.parse import quote_plus

# Connection settings are read from the environment when available; the
# defaults keep the original local-development values so the notebook still
# runs unchanged against the local database.
USERNAME = os.getenv('POSTGRES_USER', 'reddit')
PASSWORD = os.getenv('POSTGRES_PASSWORD', 'reddit')
DB = os.getenv('POSTGRES_DB', 'reddit')
HOST = os.getenv('POSTGRES_HOST', 'localhost')
PORT = os.getenv('POSTGRES_PORT', '5432')

# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# real password cannot corrupt the connection URL.
engine = create_engine(
    f'postgresql://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}@{HOST}:{PORT}/{DB}'
)

# Only the documents created on the two listed days are loaded.
DOCS_QUERY = (
    'SELECT * FROM "reddit_docs" '
    "WHERE DATE_TRUNC('day', created_at) IN ('2024-10-14', '2024-10-15');"
)
POSTS_QUERY = 'SELECT * FROM "reddit_posts"'

doc_df = pd.read_sql_query(DOCS_QUERY, con=engine)
post_df = pd.read_sql_query(POSTS_QUERY, con=engine)

# Left join: every document row is kept and enriched with its post's columns
# where doc_id matches the post id. Column names present in both frames are
# suffixed _x (docs) / _y (posts) by pandas.
joined_df = pd.merge(doc_df, post_df, left_on='doc_id', right_on='id', how='left')


In [5]:
from langchain_core.documents import Document 

REDDIT_HOST = 'https://reddit.com'

# Metadata values are stored as plain strings: a formatted timestamp and an
# absolute URL built from the post's permalink.
joined_df['created_at_str'] = joined_df['created_at_x'].dt.strftime('%Y-%m-%d %H:%M:%S')
joined_df['source_url'] = REDDIT_HOST + joined_df['permalink']

# One Document per joined row. The row label is discarded on purpose: naming
# the loop variable `index` would overwrite the Pinecone index handle that an
# earlier cell stored under that name.
documents = [
    Document(
        page_content=row['document'],
        metadata={ 'created_at': row['created_at_str'], 'source': 'reddit', 'source_url': row['source_url'], 'id': row['doc_id'] },
    )
    for _, row in joined_df.iterrows()
]

documents[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'source_url': 'https://reddit.com/r/OnceHumanOfficial/comments/1g2ugvf/im_one_of_the_people_who_got_banned_recently_for/', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell o

In [6]:
# Report how many documents have been collected so far.
print(f"Total documents count: {len(documents)}")

Total documents count: 224


In [7]:
import pandas as pd

# Read the wiki pages from the JSON file (path is relative to the working
# directory) and preview the first five rows.
wiki_df = pd.read_json(path_or_buf='data/data.json')
wiki_df.head()

Unnamed: 0,date_scraped,title,source_url,text
0,2024-09-18 16:17:40,Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Main_Page,**Welcome to the Once Human Wiki**\[]\r\n=====...
1,2024-09-18 16:17:41,Once Human | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Once_Human,**BACKGROUND STORY**\[]\r\n-------------------...
2,2024-09-18 16:17:43,Wish Machine | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Wish_Machine,"> ""Behold, mortal, before you is the great wil..."
3,2024-09-18 16:17:44,Memetics | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Memetics,Memetics grant Formulas that players can use t...
4,2024-09-18 16:17:45,Boss & Monsters | Once Human Wiki | Fandom,https://once-human.fandom.com/wiki/Boss_%26_Mo...,---\r\n**NPCs**\[]\r\n===========\r\n[![Veroni...


In [8]:
from uuid import uuid4 

# Page text is prefixed with its title as a markdown heading; every page gets
# a freshly generated UUID as its document id.
wiki_df['document'] = '### ' + wiki_df['title'] + '\n\n' + wiki_df['text']
wiki_df['id'] = [str(uuid4()) for _ in range(wiki_df.shape[0])]

WIKI_SOURCE = 'OnceHuman Wiki'

# The row label is discarded on purpose: naming the loop variable `index`
# would overwrite the Pinecone index handle stored under that name.
wiki_documents = [
    Document(
        page_content=row['document'],
        metadata={ 'created_at': row['date_scraped'], 'source': WIKI_SOURCE, 'source_url': row['source_url'], 'id': row['id'] },
    )
    for _, row in wiki_df.iterrows()
]

# Remove wiki documents left over from a previous run of this cell before
# adding the fresh ones, so re-running the cell does not duplicate them.
documents = [doc for doc in documents if doc.metadata.get('source') != WIKI_SOURCE] + wiki_documents

documents[:5]

[Document(metadata={'created_at': '2024-10-14 05:40:35', 'source': 'reddit', 'source_url': 'https://reddit.com/r/OnceHumanOfficial/comments/1g2ugvf/im_one_of_the_people_who_got_banned_recently_for/', 'id': '1g2ugvf'}, page_content='### Post Content\nI\'m One Of The People Who Got Banned Recently For Unknown Reasons\n\nHey, how\'s it going? I hope its better for you than me in this game currently. You may have seen a few posts from folks on NA P\\_Clash-X0004 who\'s friends got permabanned for unknown reasons while driving around. Welp, I\'m one of those friends. My in-game name was Huenheim, and my ID is 150278471. \n\nI\'m mostly making this post so its clear and from one of the people it directly happened to. I want to assure you that I am NOT a cheater. I get that every cheater says that, and I have zero ways to "prove it", but its 100% the truth. I\'m an old Destiny 2 head, and trust me when I say you build up a pretty healthy intolerance for cheating after going through the hell o

In [9]:
# Report the combined number of documents after the wiki pages were added.
print(f"Total documents count: {len(documents)}")

Total documents count: 729


### Splitting Documents

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Pieces are capped at 2048 characters (roughly 384 words) and do not overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2048,
)
docs = splitter.split_documents(documents)
print(f"Total chunked doc count: {len(docs)}")

Total chunked doc count: 1711


### Partition the document list and load the data into Pinecone batch by batch

In [11]:
def split_into_chunks(lst, chunk_size):
    """Split a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        lst: Any sliceable sequence with a length (list, tuple, str, ...).
        chunk_size: Maximum number of items per slice; must be at least 1.

    Returns:
        A list of slices of ``lst`` in their original order. Every slice has
        ``chunk_size`` items except possibly the last one, which holds the
        remainder. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check a
            negative size would silently produce an empty result and drop
            every item.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]

In [12]:
from uuid import uuid4

import os 
import time
# NOTE(review): an empty CURL_CA_BUNDLE looks like the common workaround that
# makes HTTP clients skip TLS certificate verification. It weakens transport
# security; confirm it is still required and remove it if not.
os.environ['CURL_CA_BUNDLE'] = ''

# Number of documents sent to the vector store per request.
UPLOAD_BATCH_SIZE = 5
# Pause after every request, successful or not.
UPLOAD_PAUSE_SECONDS = 2

# One fresh UUID per chunked document, used as its vector id.
ids = [str(uuid4()) for _ in range(len(docs))]

chunked_docs = split_into_chunks(docs, UPLOAD_BATCH_SIZE)
chunked_ids = split_into_chunks(ids, UPLOAD_BATCH_SIZE)
total_chunks = len(chunked_docs)
current_chunk = 0
# 0-based positions (into chunked_docs) of the batches that failed to upload.
failed_chunks_index = []

# The loop variable is not called `index` so the Pinecone index handle stored
# under that name is left intact.
for chunk_position, (chunked_doc, chunked_id) in enumerate(zip(chunked_docs, chunked_ids)):
    # 1-based counter, used consistently in both the success and failure messages.
    current_chunk = chunk_position + 1
    try:
        vector_store.add_documents(documents=chunked_doc, ids=chunked_id)
    except Exception as exc:
        # Keep going on upload errors but record them and show the cause;
        # KeyboardInterrupt and SystemExit are not caught, so the run can be stopped.
        print(f"Failed to load chunk # {current_chunk}: {exc!r}")
        failed_chunks_index.append(chunk_position)
    else:
        print(f"Loading another chunk... ({current_chunk}/{total_chunks})")
    time.sleep(UPLOAD_PAUSE_SECONDS)

Loading another chunk... (1/343)
Loading another chunk... (2/343)
Loading another chunk... (3/343)
Loading another chunk... (4/343)
Loading another chunk... (5/343)
Loading another chunk... (6/343)
Loading another chunk... (7/343)
Loading another chunk... (8/343)
Loading another chunk... (9/343)
Loading another chunk... (10/343)
Loading another chunk... (11/343)
Loading another chunk... (12/343)
Loading another chunk... (13/343)
Loading another chunk... (14/343)
Loading another chunk... (15/343)
Loading another chunk... (16/343)
Loading another chunk... (17/343)
Loading another chunk... (18/343)
Loading another chunk... (19/343)
Loading another chunk... (20/343)
Loading another chunk... (21/343)
Loading another chunk... (22/343)
Loading another chunk... (23/343)
Loading another chunk... (24/343)
Loading another chunk... (25/343)
Loading another chunk... (26/343)
Loading another chunk... (27/343)
Loading another chunk... (28/343)
Loading another chunk... (29/343)
Loading another chunk..

In [13]:
# 0-based positions of the upload batches that raised an error in the loading
# loop; an empty list means no batch was recorded as failed.
failed_chunks_index

[]