In [2]:

import os, sys
import pandas as pd

rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [5]:
from weaviate_helper import setup_weaviate_interface_async, setup_weaviate_interface

In [6]:
print(sys.path)

['/home/misge/Documents/Projects/tenx/Teammate/team-mate', '/home/misge/Documents/Projects/tenx/Teammate/team-mate/RAG', '/home/misge/miniconda3/lib/python311.zip', '/home/misge/miniconda3/lib/python3.11', '/home/misge/miniconda3/lib/python3.11/lib-dynload', '', '/home/misge/Documents/Projects/tenx/Teammate/team-mate/.venv/lib/python3.11/site-packages']


In [7]:
from weaviate_helper import setup_weaviate_interface_async
import asyncio

async def get_weaviate_interface():
    weaviate_interface = await setup_weaviate_interface_async()
    return weaviate_interface

async def main():
    weaviate_interface = setup_weaviate_interface()
    interface = await get_weaviate_interface()
    return interface 

interface = await main()

In [8]:
await interface.client.get_schema()

{'classes': [{'class': 'Document',
   'description': 'A document class to store documents used for knowledge base',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The title of the document',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'title',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': 'The entire content of the document',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'content',
     'tokenization': 'word'},
    {'dataType': ['int'],
     'description': 'The word count of the content',
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'wordCount'},
    {'dataType': ['text'],
     'description': 'The URL of the document',
     'indexFilterable': True,
 

In [9]:
class_info = {
    "class": "Job",
    "description": "Job postings for searching and filtering", 
    "properties": [
        {
            "name": "title",
            "dataType": ["text"],
            "description": "Title of the job posting",
            "vectorizer": "text2vec-contextionary"
        },
        {
            "name": "company",
            "dataType": ["text"],
            "description": "Name of the company",
            "vectorizer": "text2vec-contextionary"
        },
        {
            "name": "company_link",
            "dataType": ["string"],
            "description": "URL of the company website"
        },
        {
            "name": "place",
            "dataType": ["text"],
            "description": "Location of the job (city, country, remote)",
            "vectorizer": "text2vec-contextionary"
        },
        {
            "name": "date",
            "dataType": ["string"],
            "description": "Date the job was posted"
        },
        {
            "name": "apply_link",
            "dataType": ["string"],
            "description": "URL to apply for the job"
        },
        {
            "name": "description",
            "dataType": ["text"],
            "description": "Full job description (optional)",
            "vectorizer": "text2vec-contextionary"
        }
    ]
}


In [10]:
await interface.client.delete_class(class_name="Job")

In [11]:
await interface.client.create_class(class_info)

In [12]:
await interface.client.get_schema()

{'classes': [{'class': 'Document',
   'description': 'A document class to store documents used for knowledge base',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The title of the document',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'title',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': 'The entire content of the document',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'content',
     'tokenization': 'word'},
    {'dataType': ['int'],
     'description': 'The word count of the content',
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'wordCount'},
    {'dataType': ['text'],
     'description': 'The URL of the document',
     'indexFilterable': True,
 

In [13]:
import json

def validate_json(file_path):
    try:
        with open(file_path, 'r') as file:
            json.load(file)
        print("JSON schema is valid.")
    except json.JSONDecodeError as e:
        print(f"JSON schema is invalid: {e}")

validate_json("../weaviate_helper/schema.json")


JSON schema is valid.


In [14]:
import pandas as pd
import asyncio
from weaviate_helper import setup_weaviate_interface_async
import httpx
import json

async def get_weaviate_interface():
    weaviate_interface = await setup_weaviate_interface_async()
    return weaviate_interface

def validate_and_clean_data(row):
    if not isinstance(row['date'], str):
        row['date'] = str(row['date'])
    for field in ['title', 'company', 'company_link', 'place', 'date', 'apply_link', 'description']:
        if pd.isna(row[field]):
            row[field] = ""
    return row

async def populate_database(csv_file_path):
    df = pd.read_csv(csv_file_path)
    
    columns_to_keep = ['title', 'company', 'company_link', 'place', 'date', 'apply_link', 'description']
    df = df[columns_to_keep]
    
    df = df.apply(validate_and_clean_data, axis=1)
    interface = await get_weaviate_interface()
    
    objects = []
    for _, row in df.iterrows():
        new_object = {
            "title": row.title,
            "company": row.company,
            "company_link": row.company_link,
            "place": row.place,
            "date": row.date,
            "apply_link": row.apply_link,
            "description": row.description
        }
        objects.append(new_object)
    
    batch_size = 200
    
    for i in range(0, len(objects), batch_size):
        batch = objects[i:i+batch_size]
        try:
            success = interface.client.batch_create_objects(batch, class_name="Job")
            if success:
                print("Batch created successfully")
            else:
                print("Batch creation faild")
      
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
        except Exception as e:
            print(f"An error occurred while creating batch objects: {e}")

    print("Batch create objects succeeded")

csv_file_path = "all_nov_jobs.csv"

await populate_database(csv_file_path)




Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
Batch created successfully
B

  success = interface.client.batch_create_objects(batch, class_name="Job")
  await populate_database(csv_file_path)


In [47]:
import weaviate
import os

client = weaviate.Client("http://localhost:8080")  

response = (
    client.query
    .get("Job", ["title", "company", "company_link", "place", "date", "apply_link", "description"])
    .with_near_text({
        "concepts": ["Remote software engineer positions"]
    })
    .with_limit(10)
    .with_additional(["distance"])
    .do()
)

print(response)
# Print the results
# for result in response['data']['Get']['Job']:
#     print(f"Title: {result['title']}")
#     print(f"Company: {result['company']}")
#     print(f"Company Link: {result['company_link']}")
#     print(f"Place: {result['place']}")
#     print(f"Date: {result['date']}")
#     print(f"Apply Link: {result['apply_link']}")
#     print(f"Description: {result['description']}")
#     print(f"Distance: {result['_additional']['distance']}")
#     print("----")


{'errors': [{'locations': [{'column': 20, 'line': 1}], 'message': 'Unknown argument "nearText" on field "Job" of type "GetObjectsObj". Did you mean "nearVector" or "nearObject"?', 'path': None}]}


In [52]:
import weaviate
import os
from weaviate.classes.query import MetadataQuery

client = weaviate.Client("http://localhost:8080", "v1.23.7")

job_collection = client.collections.get("Job")

response = job_collection.generate.near_text(
    query="Remote software engineer positions",
    limit=10,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)


AttributeError: 'Client' object has no attribute 'collections'

In [54]:
import weaviate

# Connect to Weaviate with the specified version
client = weaviate.Client("http://localhost:8080", "v1.23.7")

# Specify the class (collection) you want to search
class_name = "Job"

# Perform the semantic search
response = (
    client.query
    .get(class_name, ["title", "company", "company_link", "place", "date", "apply_link", "description"])
    .with_near_text({
        "concepts": ["Remote software engineer positions"]
    })
    .with_limit(10)
    .do()
)

response
# Print the results
# for result in response["data"]["Get"][class_name]:
#     print(f"Title: {result['title']}, Company: {result['company']}, Place: {result['place']}")


            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            
                    authentication. Are you sure this is correct?


{'errors': [{'locations': [{'column': 20, 'line': 1}],
   'message': 'Unknown argument "nearText" on field "Job" of type "GetObjectsObj". Did you mean "nearVector" or "nearObject"?',
   'path': None}]}

In [57]:
import weaviate

# Connect to Weaviate with the specified version
client = weaviate.Client("http://localhost:8080", "v1.23.7")

# Specify the class (collection) you want to search
class_name = "Job"

# Perform the semantic search
response = (
    client.query
    .get(class_name, ["title", "company", "company_link", "place", "date", "apply_link", "description"])
    .with_near_vector({
        "vector": [0.5, 0.3, -0.2],  
        "certainty": 0.2
    })
    .with_limit(10)
    .do()
)

# Print the results
for result in response["data"]["Get"][class_name]:
    print(f"Title: {result['title']}, Company: {result['company']}, Place: {result['place']}")


In [36]:
import weaviate

print(weaviate.__version__)

4.6.4


In [37]:
query = "Remote software engineer positions"
results = await search_weaviate(query)

for result in results:
    print(f"Title: {result['title']}, Company: {result['company']}, Place: {result['place']}")

NameError: name 'search_weaviate' is not defined