In [1]:
!pip install -U transformers
!pip install dspy-ai
!pip install weaviate-client
!pip install --upgrade llama-index weaviate-client

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m115.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.2
Collecting dspy-ai
  Downloading dspy_ai-2.4.13-py3-none-any.whl.metadata (39 kB)
Collecting backoff (from dspy-ai)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets (from dspy-ai)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
  Downloading o

In [7]:
import getpass
import os
import time

import weaviate
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from weaviate.embedded import EmbeddedOptions

# HuggingFace token: read it from the environment so the secret never lives in
# the notebook; fall back to a non-echoing prompt when the variable is unset.
hf_token = os.environ.get("HUGGINGFACE_API_KEY") or getpass.getpass("HuggingFace API token: ")
if not hf_token:
    raise ValueError("HuggingFace API token is empty; set HUGGINGFACE_API_KEY or enter it at the prompt")

# Connect to Weaviate client in embedded mode. The token is handed over twice:
# as an environment variable for the embedded server process and as a request
# header sent by this client.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={
            "HUGGINGFACE_API_KEY": hf_token,
            "QUERY_DEFAULTS_LIMIT": "25",
            "AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED": "true",
            "PERSISTENCE_DATA_PATH": "./data",
            "DEFAULT_VECTORIZER_MODULE": "text2vec-huggingface",
            "ENABLE_MODULES": "text2vec-huggingface",
        }
    ),
    additional_headers={
        "X-HuggingFace-Api-Key": hf_token
    }
)

index_name = "SolutionWriteups"

# Schema: one class named after index_name with a single text property, "content"
content_property = {"name": "content", "dataType": ["text"]}
schema = {"classes": [{"class": index_name, "properties": [content_property]}]}

# Start from a clean slate: drop the class if an earlier run left it behind,
# then create it from the schema above
if client.schema.exists(index_name):
    client.schema.delete_class(index_name)
client.schema.create(schema)

# Read the whole source text file, then split it into chunk nodes
with open('output_ocr.txt', encoding='utf-8') as ocr_file:
    text = ocr_file.read()

parser = SentenceSplitter(chunk_size=250, chunk_overlap=20)
document = Document(text=text)
nodes = parser.get_nodes_from_documents([document])

# Add nodes to Weaviate
batch_size = 100
max_retries = 5
retry_delay = 10
failed_nodes = []  # indices of nodes that could not be added after all retries

with client.batch as batch:
    batch.batch_size = batch_size
    for i, node in enumerate(nodes):
        properties = {
            "content": node.text
        }
        # Give every node a stable ID derived from its position and text.
        # NOTE(review): add_data_object may already have queued the object when
        # it raises (e.g. during an automatic flush); with a fixed UUID a retry
        # re-adds the same object instead of creating a second copy under a new
        # random ID.
        object_uuid = weaviate.util.generate_uuid5(f"{i}:{node.text}", index_name)

        for attempt in range(1, max_retries + 1):
            try:
                # Use the handle returned by the context manager consistently
                batch.add_data_object(properties, index_name, uuid=object_uuid)
                break
            except Exception as e:
                print(f"Error on node {i}: {str(e)}")
                if attempt < max_retries:
                    print(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`)
            print(f"Failed to insert node {i} after {max_retries} attempts")
            failed_nodes.append(i)

        if i % batch_size == 0 and i > 0:
            print(f"Inserted {i} nodes")

print(f"Attempted to insert all {len(nodes)} nodes")
if failed_nodes:
    # Make permanent failures visible instead of burying them in the log above
    print(f"{len(failed_nodes)} nodes could not be inserted: {failed_nodes}")

# Verify the insertion by asking Weaviate for the object count of the class
count = client.query.aggregate(index_name).with_meta_count().do()
class_aggregate = count['data']['Aggregate'][index_name][0]
print(f"Number of documents in Weaviate: {class_aggregate['meta']['count']}")

INFO:weaviate-client:embedded weaviate is already listening on port 8079
INFO:weaviate-client:Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
INFO:weaviate-client:Started /root/.cache/weaviate-embedded: process ID 5999


Inserted 100 nodes
Inserted 200 nodes
Inserted 300 nodes
Inserted 400 nodes
Inserted 500 nodes
Inserted 600 nodes
Inserted 700 nodes
Inserted 800 nodes
Inserted 900 nodes
Inserted 1000 nodes
Inserted 1100 nodes
Inserted 1200 nodes
Inserted 1300 nodes
Inserted 1400 nodes
Inserted 1500 nodes
Inserted 1600 nodes
Inserted 1700 nodes
Inserted 1800 nodes
Inserted 1900 nodes
Inserted 2000 nodes
Inserted 2100 nodes
Inserted 2200 nodes
Inserted 2300 nodes
Inserted 2400 nodes
Inserted 2500 nodes
Inserted 2600 nodes
Inserted 2700 nodes
Inserted 2800 nodes
Inserted 2900 nodes
Attempted to insert all 2945 nodes
Number of documents in Weaviate: 2945


In [11]:
def basic_search(query, limit=5):
    """Run a BM25 keyword search against the `index_name` class.

    Uses the module-level `client` and `index_name`.

    Args:
        query: Search text passed to the BM25 operator.
        limit: Maximum number of objects to return.

    Returns:
        A list of result dicts, each with a "content" key; an empty list when
        the response holds no entries for the class.

    Raises:
        RuntimeError: If the response contains an "errors" entry.
    """
    result = (
        client.query
        .get(index_name, ["content"])
        .with_bm25(query=query)
        .with_limit(limit)
        .do()
    )
    # A failed GraphQL query can come back as a normal response carrying
    # "errors"; surface the server's message instead of failing later with an
    # opaque KeyError on 'data'.
    if "errors" in result:
        raise RuntimeError(f"Weaviate search failed: {result['errors']}")
    # Normalise a null class entry to an empty list so callers can always iterate
    return result['data']['Get'][index_name] or []

# Example usage: run one sample query and print the text of every hit
search_results = basic_search("what is chemical engineering")
for hit in search_results:
    print(hit["content"])

from the University of Michigan. He was successively Vice President and Director of Research of the Flintkote company Dean of Engineering at the Polytechnic Institute of Brooklyn, and the re ja refolds Professor in Chemical Engineering at North caroling State university He served one term as President of the america Institute of Chemical engineers sultan c SMITH (B.Chem., Chem.E., core university is Professor Emeritus of Chemical Engineering at corneal university where he joined the faculty in 1946. He was Director of Continuing Engineering Education at corneal from 1965 to 1971, and Director of the School of Chemical Engineering from 1975 to 1983. He retired from act give teaching in 1986. Before joining the faculty at corneal he was employed as a chemo kcal engineer by evil upon de devours and cop He has served as a consultant on process development to do point america cyanamide and many other companies as well as government agencies He is a member of the america Chemical Society and