In [1]:
import os
import time
import warnings
import weaviate
from weaviate.classes.init import Auth
from langchain_weaviate import WeaviateVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv, find_dotenv
from tqdm.notebook import tqdm
from pprint import pprint
import pandas as pd

In [3]:
# Locate a .env file and load its entries into the process environment;
# a later cell reads WEAVIATE_URL and WEAVIATE_API_KEY from os.environ
# (presumably defined in that .env file — confirm).
load_dotenv(find_dotenv())

True

In [4]:
# Embedder configuration.
# The device still defaults to 'cuda', but it can now be overridden through the
# EMBEDDING_DEVICE environment variable (e.g. 'cpu') so the notebook also runs
# on hosts without a GPU instead of failing when the model is loaded.
model = 'sentence-transformers/all-mpnet-base-v2'
embedder = HuggingFaceEmbeddings(
    model_name=model,
    model_kwargs={'device': os.getenv('EMBEDDING_DEVICE', 'cuda')},
)

In [6]:
# Load the cleaned stories CSV into a DataFrame; later cells read its
# 'title', 'url' and 'id' columns.
df = pd.read_csv('../data/story_cleaned.csv')

In [7]:
# Flatten the frame into one plain dict per row.
records = df.to_dict(orient='records')

# One Document per story: the title becomes the page content, while the URL
# and the story id are carried along as metadata.
documents = [
    Document(
        page_content=f"{story['title']}",
        metadata={'url': story['url'], 'hn_id': story['id']},
    )
    for story in records
]
print("Length of documents: ", len(documents))
print("Example of document: ", documents[0])

Length of documents:  1632537
Example of document:  page_content='Dead Duck Day marks that time a scientist witnessed gay duck necrophilia' metadata={'url': 'https://arstechnica.com/science/2023/06/dead-duck-day-marks-that-time-a-scientist-witnessed-gay-duck-necrophilia/', 'hn_id': 36204593}


In [8]:
def batch(chunks, n_max=10000):
    """Group documents into consecutive batches bounded by total text length.

    Documents are taken in order and accumulated until adding the next one
    would bring the running sum of ``len(page_content)`` to ``n_max`` or more;
    at that point the current batch is closed and that document starts a new
    one.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        n_max: Character budget per batch. Any batch holding more than one
            document totals strictly fewer than ``n_max`` characters.

    Returns:
        A list of non-empty lists of documents, in input order. A document
        whose own length is ``n_max`` or more ends up alone in its batch.
        Empty input yields an empty list.
    """
    batches = []
    current_batch = []
    count = 0

    for c in chunks:
        chunk_length = len(c.page_content)

        # Only close the running batch when it actually holds something:
        # without the `current_batch` guard, an oversized first document
        # would flush an empty list and produce a leading empty batch.
        if current_batch and count + chunk_length >= n_max:
            batches.append(current_batch)
            current_batch = [c]
            count = chunk_length
        else:
            current_batch.append(c)
            count += chunk_length

    # Flush whatever is left after the last document.
    if current_batch:
        batches.append(current_batch)

    return batches

# Split the documents into batches, each capped by a 100k-character budget
# measured on page_content length.
batches = batch(chunks=documents, n_max=100_000)
print("Number of batches: ", len(batches))

Number of batches:  824


In [9]:
# Best practice: store your credentials in environment variables.
# Indexing os.environ directly (rather than using .get) raises KeyError right
# away if either variable is missing.
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud, authenticating with the API key.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

# Only prints the readiness flag; execution continues even when it is False.
print(client.is_ready())

True


In [None]:
# Build the vector store from the first batch only; the remaining batches are
# added in later cells through `db.add_documents`.
# NOTE(review): no index_name is passed here, whereas the alternative below
# reads WEAVIATE_INDEX_NAME — presumably the library picks the collection name
# itself in this path; confirm before relying on re-attaching by name.
first_batch = batches[0]
db = WeaviateVectorStore.from_documents(first_batch, embedder, client=client)

# Alternative kept for reference: attach to an index that already exists
# instead of building the store from documents (use in place of the call above).
#db = WeaviateVectorStore(
#    client=client,
#    index_name=os.getenv("WEAVIATE_INDEX_NAME"),
#    text_key="text",
#    embedding=embedder
#)

In [11]:
# Everything after the first batch, which was already used to build `db`.
other_batches = batches[1:]

# Batches that could not be uploaded even after all attempts.
failed_batches = []

# Only the first 150 remaining batches are uploaded in this run.
for batch_docs in tqdm(other_batches[:150], desc="Adding batches to Weaviate Cloud"):
    # Three attempts per batch; the loop value is how many attempts remain
    # after the current one.
    for attempts_left in (2, 1, 0):
        try:
            # Silence library warnings for the duration of the upload call.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                db.add_documents(batch_docs)
        except Exception as e:
            print(f"Error: {e}")
            if attempts_left == 0:
                print("Failed to add documents after multiple retries.")
                failed_batches.append(batch_docs)
            else:
                print("Retrying...")
                time.sleep(5)  # wait for 5 seconds before retrying
        else:
            # Upload went through; move on to the next batch.
            break

print(f"Number of failed batches: {len(failed_batches)}")

Adding batches to Weaviate Cloud:   0%|          | 0/150 [00:00<?, ?it/s]

Number of failed batches: 0


In [12]:
# Retry pass over the batches that failed during the bulk upload.
#
# Each batch is taken off `failed_batches` exactly once per run of this cell.
# Batches that still fail are parked in `still_failing` and only put back once
# the loop is over. Previously they were appended straight back onto
# `failed_batches` inside the loop, so a batch that kept failing made the
# `while` loop spin forever. Re-run the cell to try the leftovers again.
still_failing = []

try:
    while failed_batches:
        batch_docs = failed_batches.pop()
        # The escape codes render the label in yellow on ANSI-capable outputs.
        print("\33[0;33mRemaining failed batches\33[0m: ", len(failed_batches))
        success = False
        retries = 3
        while not success and retries > 0:
            try:
                # Silence library warnings for the duration of the upload call.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    db.add_documents(batch_docs)
                success = True
            except Exception as e:
                print(f"Error: {e}")
                retries -= 1
                if retries > 0:
                    print("Retrying...")
                    time.sleep(5)  # wait for 5 seconds before retrying
                else:
                    print("Failed to add documents after multiple retries.")
                    still_failing.append(batch_docs)
finally:
    # Restore the persistent failures even if the cell is interrupted, so
    # they are not lost and a later run can pick them up.
    failed_batches.extend(still_failing)

print(f"Number of failed batches: {len(failed_batches)}")

In [13]:
if client.is_ready():
    res = await db.asimilarity_search("biology", k=500)
    print("Length of res: ", len(res))
    for r in res[:5]:
        pprint(r)
else:
    print("Weaviate is not ready yet.")

Length of res:  500
Document(metadata={'hn_id': 33605559.0, 'url': 'https://ithinkbiology.in'}, page_content='iThink Biology')
Document(metadata={'hn_id': 34985029.0, 'url': 'https://cell.substack.com/p/burrito-biology'}, page_content='Biology Is a Burrito')
Document(metadata={'hn_id': 34107280.0, 'url': 'https://www.quantamagazine.org/major-biology-discoveries-from-2022-20221221'}, page_content='The Year in Biology')
Document(metadata={'hn_id': 31721499.0, 'url': 'https://science.xyz//'}, page_content='Science')
Document(metadata={'hn_id': 31075753.0, 'url': 'https://www.nybooks.com/articles/2022/04/21/why-biology-is-not-destiny-genetic-lottery-kathryn-harden/'}, page_content='Why Biology Is Not Destiny')
