In [103]:

import os, sys
import pandas as pd

rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [104]:
df = pd.read_csv("data/all_nov_jobs.csv")

In [105]:
from weaviate_helper import setup_weaviate_interface_async, setup_weaviate_interface

In [106]:
class_info = {
    "class": "Job",
    "description": "Job postings for searching and filtering", 
    "properties": [
        {
            "name": "job_id",
            "dataType": ["uuid"],
            "description": "Reference to the job posting"
        },
        {
            "name": "title",
            "dataType": ["text"],
            "description": "Title of the job posting"
        },
        {
            "name": "company",
            "dataType": ["text"],
            "description": "Name of the company"
        }
    ],
    "vectorizer": "text2vec-openai",
     "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False,
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        },
        "generative-openai": {
          "model": "gpt-3.5-turbo"
        }
    },
}

chunk_info = {
    "class": "JobDescriptionChunk",
    "description": "Chunks of job descriptions linked to job postings", 
    "properties": [
        {
            "name": "job_id",
            "dataType": ["uuid"],
            "description": "Reference to the job posting"
        },
        {
            "name": "content",
            "dataType": ["text"],
            "description": "Chunk of the job description"
        },
        {
            "name": "order",
            "dataType": ["int"],
            "description": "Order of the chunk in the full description"
        }
    ],
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False,
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        },
        "generative-openai": {
            "model": "gpt-3.5-turbo"
        }
    }
}


In [107]:
async def get_weaviate_interface():
    return await setup_weaviate_interface()

interface = await get_weaviate_interface()

In [108]:
await interface.client.delete_class(class_name="Job")
await interface.client.create_class(class_info)
await interface.client.delete_class(class_name="JobDescriptionChunk")
await interface.client.create_class(chunk_info)

In [109]:
import pandas as pd
from typing import List
import asyncio
from weaviate_helper import setup_weaviate_interface_async
import httpx
from uuid import uuid4

CHUNK_SIZE = 500  

def validate_and_clean_data(row: pd.Series) -> pd.Series:
    for field in ['title', 'company', 'description']:
        if pd.isna(row[field]):
            row[field] = ""
    return row

def chunk_text(text: str, chunk_size: int) -> List[str]:
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

async def populate_database(csv_file_path: str) -> None:
    df = pd.read_csv(csv_file_path)
    columns_to_keep = ['title', 'company', 'description']
    df = df.head(100)
    df = df[columns_to_keep]
    df_cleaned = df.copy()

    df_cleaned.dropna(subset=['title', 'company', 'description'], inplace=True)
    df_cleaned = df_cleaned[(df_cleaned['title'].str.strip() != '') & 
                            (df_cleaned['company'].str.strip() != '') & 
                            (df_cleaned['description'].str.strip() != '')]

    df = df_cleaned.copy()
    interface = await get_weaviate_interface()
    objects = []
    chunks = []
    for _, row in df.iterrows():
        job_id = str(uuid4())
        new_object = {
            "job_id": job_id,
            "title": row.title,
            "company": row.company
        }
        objects.append(new_object)
        
        description_chunks = chunk_text(row.description, CHUNK_SIZE)
        for order, chunk in enumerate(description_chunks):
            chunk_object = {
                "job_id": job_id,
                "content": chunk,
                "order": order
            }
            chunks.append(chunk_object)

    batch_size = 50
    
    for i in range(0, len(objects), batch_size):
        batch = objects[i:i+batch_size]
        try:
            success = await interface.client.batch_create_objects(batch, class_name="Job")
            print("success:",response)
            if  success:
                print(f"Job batch {i//batch_size + 1} created successfully")
            else:
                print(f"Job batch {i//batch_size + 1} creation failed")
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
        except Exception as e:
            print(f"An error occurred while creating job batch objects: {e}")

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        try:
            success = await interface.client.batch_create_objects(batch, class_name="JobDescriptionChunk")
            print("response:", success)
            if success:
                print(f"Chunk batch {i//batch_size + 1} created successfully")
            else:
                print(f"Chunk batch {i//batch_size + 1} creation failed")
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
        except Exception as e:
            print(f"An error occurred while creating chunk batch objects: {e}")

    print("Batch create objects process completed")

csv_file_path = "data/all_nov_jobs.csv"

await populate_database(csv_file_path)


SyntaxError: expected 'except' or 'finally' block (1301142579.py, line 76)

In [72]:
import weaviate
import os
from weaviate.classes.query import MetadataQuery

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
print(OPENAI_API_KEY)
client = weaviate.Client(
    url="http://localhost:8080",
    additional_headers={
        "X-OpenAI-API-Key":OPENAI_API_KEY
    }
    )

sk-proj-kPyH2UGZaqI4sFaSzByLT3BlbkFJoT2GkSNS3adHdetqL6sH


            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [73]:
client.query.aggregate("Job").with_meta_count().do()

{'data': {'Aggregate': {'Job': [{'meta': {'count': 98}}]}}}

In [76]:
response = (
    client.query
    .get("Job", ["title", "company", "job_id"])
    .with_near_text({
        "concepts": "Find me remote software enineering rolls"
    })
    .with_limit(10)
    .with_additional(["distance", "id"])
    .do()
)

In [77]:
response

{'data': {'Get': {'Job': [{'_additional': {'distance': 0.138883,
      'id': '8cad4258-e7d2-4ee0-9134-337cf44d6984'},
     'company': 'Actalent',
     'job_id': 'aca85448-f6a5-4503-b1c5-9d26cf7b2e43',
     'title': 'Software Engineer- REMOTE'},
    {'_additional': {'distance': 0.138883,
      'id': 'c3a45129-d40c-441f-9899-6b3717903a73'},
     'company': 'Actalent',
     'job_id': '90da1d44-e398-4112-9c65-6fe2edf41e17',
     'title': 'Software Engineer- REMOTE'},
    {'_additional': {'distance': 0.14451176,
      'id': '281bd420-812b-4978-834b-e0bbc72c2ed9'},
     'company': 'Actalent',
     'job_id': '959437d4-433b-4271-8cdf-a7985c4f2e64',
     'title': 'AUTOSAR Software Engineer - REMOTE'},
    {'_additional': {'distance': 0.14451176,
      'id': 'bed495d4-66d8-4e48-9bb5-a34f302e8e7a'},
     'company': 'Actalent',
     'job_id': '93196327-c982-47c5-b93d-91c3bb909804',
     'title': 'AUTOSAR Software Engineer - REMOTE'},
    {'_additional': {'distance': 0.14451176,
      'id': 'b52e66

In [78]:
import re
response = (
    client.query
    .get("Job", ["title", "company", "job_id"])
    .with_near_text({
        "concepts": ["find software engineering jobs"]
    })
    .with_limit(10)
    .with_additional(["distance"])
)

job_results = response.do() 


for job in job_results['data']['Get']['Job']:
    job_id = job["job_id"]

    description_query = (
        client.query
        .get("JobDescriptionChunk", ["content"])
        .with_where({
            "path": ["job_id"],
            "operator": "Equal",
            "valueText": job_id
        })
    )

    description_response = description_query.do()  # Use .do() to execute the query
    descriptions = description_response['data']['Get']['JobDescriptionChunk']

    description_list = []
    for desc in descriptions:
        description_list.append(desc['content'].strip())
    description = ''.join(description_list)
    print(description)
    print("...................")

*Description:*

Embedded software development for a variety of projects primarily working with C++ o Looking for roughly 1+ years of software development (Primarily in embedded software or firmware) o Working on several projects at any given time. (2 main projects currently) o Will be working on a variety of motor controllers o Will be working with electromechanical medical devices o Parts of the week devoted to process documentation, design and risk analysis o Parts of week dedicated to hands on time with the development and testing o Opportunity to work hand’s on with the product, work with devices and fixtures
Skills:*


C++, linux, embedded software, Embedded c, Multithreading, Rtos, bare metal

Top Skills Details:*


C++,linux,embedded software

Additional Skills & Qualifications:*


2 years-experience (up to 5 or so) is preferred, but not required.

An internship or some other experience while in school would be fine.

C/C++ knowledge is required, but C++ is preferred over C.
Oth

In [80]:
import re

all_job_context = ""
for job in job_results['data']['Get']['Job']:
  job_id = job["job_id"]
  title = job.get("title", "") 
  company = job.get("company", "") 

  description_query = (
      client.query
      .get("JobDescriptionChunk", ["content"])
      .with_where({
          "path": ["job_id"],
          "operator": "Equal",
          "valueText": job_id
      })
  )

  description_response = description_query.do()
  descriptions = description_response['data']['Get']['JobDescriptionChunk']

  description_list = []
  for desc in descriptions:
      description_list.append(desc['content'].strip())
  description = ''.join(description_list)

  job_context = f"Title: {title}\nCompany: {company}\nDescription:\n{description}\n---\n"
  all_job_context += job_context



In [97]:
user_question = "find remote software engineering jogs"

In [98]:
clean_context = all_job_context.rstrip('---\n')  # Remove separator beforehand
final_prompt = "I have been provided with a set of job descriptions, each containing a title, company, and description. These descriptions are presented below:\n" + clean_context + "  **[End of Descriptions]**\n\nGiven these job descriptions, what job description best addresses the following question:\n" + user_question


In [99]:
final_prompt

"I have been provided with a set of job descriptions, each containing a title, company, and description. These descriptions are presented below:\nTitle: Software Test Engineer\nCompany: Actalent\nDescription:\n*Description:*\n\nEmbedded software development for a variety of projects primarily working with C++ o Looking for roughly 1+ years of software development (Primarily in embedded software or firmware) o Working on several projects at any given time. (2 main projects currently) o Will be working on a variety of motor controllers o Will be working with electromechanical medical devices o Parts of the week devoted to process documentation, design and risk analysis o Parts of week dedicated to hands on time with the development and testing o Opportunity to work hand’s on with the product, work with devices and fixtures\nSkills:*\n\n\nC++, linux, embedded software, Embedded c, Multithreading, Rtos, bare metal\n\nTop Skills Details:*\n\n\nC++,linux,embedded software\n\nAdditional Skill

In [100]:
from openai import OpenAI
client = OpenAI(api_key = OPENAI_API_KEY)

def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-3.5-turbo",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,
    top_logprobs=None,
) -> str:
    """Return the completion of the prompt.
    @parameter messages: list of dictionaries with keys 'role' and 'content'.
    @parameter model: the model to use for completion. Defaults to 'davinci'.
    @parameter max_tokens: max tokens to use for each prompt completion.
    @parameter temperature: the higher the temperature, the crazier the text
    @parameter stop: token at which text generation is stopped
    @parameter seed: random seed for text generation
    @parameter tools: list of tools to use for post-processing the output.
    @parameter logprobs: whether to return log probabilities of the output tokens or not.
    @returns completion: the completion of the prompt.
    """

    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion


In [101]:
response = get_completion(
    [{
        "role": "user",
        "content": final_prompt,
    }]
)

In [102]:
response.choices[0].message.content

"Title: Software Engineer- REMOTE\nCompany: Actalent\nDescription:\nDescription:* Actalent is seeking an Cybersecurity Software Engineer with advanced C/C++, Python, or Java programming experience to contribute to defense and industrial cybersecurity technologies.\n\n\nThe software will work to develop software that automates and protects embedded systems.\n\nThis position is responsible for hands-on development of Industrial Internet of Things (IIoT), networking, robotics and automation products that incorporate advanced embedded and wireless cybersecurity technologies. These technologies will extend the state-of-the-art in cybersecurity and have both Commercial and Defense applications Your knowledge of cybersecurity technologies and their vulnerabilities enables you to develop the software needed to protect systems from modern threats. You don't configure/manage the security of embedded devices, you design, develop, implement, and test their security solutions.\n\nSkills:* C++, C/C+