In [5]:

import os, sys
import pandas as pd

rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [6]:
from weaviate_helper import setup_weaviate_interface_async, setup_weaviate_interface

In [7]:
class_info = {
    "class": "Job",
    "description": "Job postings for searching and filtering", 
    "properties": [
        {
            "name": "title",
            "dataType": ["text"],
            "description": "Title of the job posting"
        },
        {
            "name": "company",
            "dataType": ["text"],
            "description": "Name of the company"
        }
    ],
    "vectorizer": "text2vec-openai",
     "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False,
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        },
        "generative-openai": {
          "model": "gpt-3.5-turbo"
        }
    },
}

chunk_info = {
    "class": "JobDescriptionChunk",
    "description": "Chunks of job descriptions linked to job postings", 
    "properties": [
        {
            "name": "job_id",
            "dataType": ["uuid"],
            "description": "Reference to the job posting"
        },
        {
            "name": "content",
            "dataType": ["text"],
            "description": "Chunk of the job description"
        },
        {
            "name": "order",
            "dataType": ["int"],
            "description": "Order of the chunk in the full description"
        }
    ],
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False,
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        },
        "generative-openai": {
            "model": "gpt-3.5-turbo"
        }
    }
}


In [8]:
await interface.client.delete_class(class_name="Job")
await interface.client.create_class(class_info)
await interface.client.delete_class(class_name="JobDescriptionChunk")
await interface.client.create_class(chunk_info)

NameError: name 'interface' is not defined

In [9]:
import pandas as pd
from typing import List
import asyncio
from weaviate_helper import setup_weaviate_interface_async
import httpx
from uuid import uuid4

CHUNK_SIZE = 500  # Define the maximum length for each chunk

def validate_and_clean_data(row: pd.Series) -> pd.Series:
    for field in ['title', 'company', 'description']:
        if pd.isna(row[field]):
            row[field] = ""
    return row

def chunk_text(text: str, chunk_size: int) -> List[str]:
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

async def populate_database(csv_file_path: str) -> None:
    df = pd.read_csv(csv_file_path)
    columns_to_keep = ['title', 'company', 'description']
    df = df[columns_to_keep]
    df = df.apply(validate_and_clean_data, axis=1)
    
    inteface = await get_weaviate_interface()
    objects = []
    chunks = []
    for _, row in df.iterrows():
        job_id = str(uuid4())
        new_object = {
            "id": job_id,
            "title": row.title,
            "company": row.company
        }
        objects.append(new_object)
        
        description_chunks = chunk_text(row.description, CHUNK_SIZE)
        for order, chunk in enumerate(description_chunks):
            chunk_object = {
                "job_id": job_id,
                "content": chunk,
                "order": order
            }
            chunks.append(chunk_object)

    batch_size = 200
    
    for i in range(0, len(objects), batch_size):
        batch = objects[i:i+batch_size]
        try:
            success = await interface.client.batch_create_objects(batch, class_name="Job")
            if success:
                print(f"Job batch {i//batch_size + 1} created successfully")
            else:
                print(f"Job batch {i//batch_size + 1} creation failed")
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
        except Exception as e:
            print(f"An error occurred while creating job batch objects: {e}")

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        try:
            success = await interface.client.batch_create_objects(batch, class_name="JobDescriptionChunk")
            if success:
                print(f"Chunk batch {i//batch_size + 1} created successfully")
            else:
                print(f"Chunk batch {i//batch_size + 1} creation failed")
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
        except Exception as e:
            print(f"An error occurred while creating chunk batch objects: {e}")

    print("Batch create objects process completed")

csv_file_path = "data/all_nov_jobs.csv"

await populate_database(csv_file_path)


An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not defined
An error occurred while creating job batch objects: name 'interface' is not 