In [None]:
# Kor!
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# LangChain Models
from langchain_google_genai import ChatGoogleGenerativeAI

# Standard Helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime

# Text Helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md


# Project Helpers
import getpass
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# Load variables from the .env file located by find_dotenv() into os.environ.
# override=True lets values from .env replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

In [None]:
# Ask for the key interactively only when the environment does not already
# provide one; getpass keeps the typed value out of the notebook output.
if os.environ.get('GOOGLE_API_KEY') is None:
    os.environ['GOOGLE_API_KEY'] = getpass.getpass('Provide your Google API Key: ')

In [None]:
# Chat model shared by the extraction chain built in process_jobs_and_save.
# NOTE(review): temperature=0.9 is a high sampling temperature for structured
# extraction; a lower value may give more repeatable output — confirm intent.
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash-002', temperature=0.9)

In [9]:
def load_broad_tokens(file_path="Data/board_tokens.txt"):
    """Read board tokens from a text file, one token per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or a spacer line never produces an empty token (which
    would otherwise be used to build a request URL with no board name).

    Args:
        file_path: Path of the token file.

    Returns:
        A list of non-empty token strings in file order, or an empty list
        when the file does not exist (a message is printed in that case).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            stripped_lines = (line.strip() for line in file)
            return [token for token in stripped_lines if token]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return []

In [None]:
def pull_from_greenhouse(board_token, timeout=30):
    """Fetch the job listings published on a Greenhouse job board.

    Args:
        board_token: Board identifier inserted into the boards-api URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list stored under the ``jobs`` key of the JSON response. An empty
        list is returned (and a message printed) when the request fails, the
        response status is not successful, or the body is not the expected
        JSON object, so callers can always iterate over the result.
    """
    url = f'https://boards-api.greenhouse.io/v1/boards/{board_token}/jobs?content=true'

    try:
        # A timeout keeps one unresponsive board from hanging the whole run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"{board_token}: request failed ({exc})")
        return []

    status_code = response.status_code

    # Error responses (e.g. an unknown board) carry no 'jobs' payload.
    if not response.ok:
        print(f"{board_token}: {status_code}, no jobs returned")
        return []

    try:
        payload = response.json()
    except ValueError:
        print(f"{board_token}: {status_code}, response body is not valid JSON")
        return []

    jobs = payload.get('jobs', []) if isinstance(payload, dict) else []

    print (f"{board_token}: {status_code}, Found {len(jobs)} jobs")

    return jobs

In [None]:
# Kor schema passed to create_extraction_chain: it names the fields to pull out
# of a job posting and supplies worked examples (sample text paired with the
# values expected for it).
# NOTE(review): each example lists one single-key dict per attribute instead of
# one dict holding all six keys; together with many=True this presumably shows
# the model six separate objects per posting — confirm this is the intended
# output shape.
extractor = Object(
    id="job_description",
    description="""
        A detailed description of a job listing including core responsibilities, required skills, educational requirements, experience level, preferred qualifications, and compensation and benefits.
    """,
    # One free-text attribute per aspect of the posting.
    attributes=[
        Text(
            id="core_responsibilities",
            description="The main duties and tasks associated with the job"
        ),
        Text(
            id="required_skills",
            description="The skills that are necessary to perform the job"
        ),
        Text(
            id="educational_requirements",
            description="The educational background required for the job"
        ),
        Text(
            id="experience_level",
            description="The level of experience required for the job"
        ),
        Text(
            id="preferred_qualifications",
            description="Additional qualifications that are preferred but not required"
        ),
        Text(
            id="compensation_and_benefits",
            description="The compensation and benefits offered for the job"
        )
    ],
    # (input text, expected extraction) pairs.
    examples=[
        (
            "Manage a team of software engineers. Required Skills: Proficient in Python and Java. Educational Requirements: Bachelor's degree in Computer Science. Experience Level: 5+ years of experience. Preferred Qualifications: Experience with cloud computing. Compensation and Benefits: Competitive salary and health benefits.",
            [
                {"core_responsibilities": "Manage a team of software engineers"},
                {"required_skills": "Proficient in Python and Java"},
                {"educational_requirements": "Bachelor's degree in Computer Science"},
                {"experience_level": "5+ years of experience"},
                {"preferred_qualifications": "Experience with cloud computing"},
                {"compensation_and_benefits": "Competitive salary and health benefits"}
            ]
        ),
        (
            "Develop and maintain web applications. Required Skills: Knowledge of HTML, CSS, and JavaScript. Educational Requirements: Associate's degree in Information Technology. Experience Level: 2+ years of experience. Preferred Qualifications: Familiarity with React.js. Compensation and Benefits: Annual bonus and retirement plan.",
            [
                {"core_responsibilities": "Develop and maintain web applications"},
                {"required_skills": "Knowledge of HTML, CSS, and JavaScript"},
                {"educational_requirements": "Associate's degree in Information Technology"},
                {"experience_level": "2+ years of experience"},
                {"preferred_qualifications": "Familiarity with React.js"},
                {"compensation_and_benefits": "Annual bonus and retirement plan"}
            ]
        )
    ],
    many=True,
)

In [None]:
def process_job_description(job):
    """Return the text of a job's ``content`` field.

    Args:
        job: Mapping with a ``'content'`` entry holding the posting body.

    Returns:
        The string produced by parsing ``job['content']`` with BeautifulSoup's
        ``html.parser`` and calling ``get_text()`` on the result.
    """
    parsed_content = BeautifulSoup(job['content'], 'html.parser')
    description_text = parsed_content.get_text()
    return description_text

In [None]:
def process_jobs_and_save(tokens, extractor, output_file="data.json"):
    """Fetch jobs for each board token, extract structured fields, and save them as JSON.

    For every job returned by ``pull_from_greenhouse`` the description is
    produced by ``process_job_description``, converted with ``markdownify`` and
    run through a Kor extraction chain built on the notebook-level ``llm``.

    Args:
        tokens: Iterable of board tokens to fetch.
        extractor: Kor schema node handed to ``create_extraction_chain``.
        output_file: Path of the JSON file to write.

    Returns:
        None. The collected records are written to ``output_file`` as a JSON
        list of objects with ``"description"`` and ``"extraction"`` keys.
    """
    # The chain depends only on llm and extractor, so build it once instead of
    # rebuilding it for every job.
    chain = create_extraction_chain(llm, extractor, input_formatter="triple_quotes")

    all_jobs_data = []

    for token in tokens:
        # The fetch helper may return None (e.g. when the request failed);
        # treat that as "no jobs" instead of crashing on iteration.
        jobs = pull_from_greenhouse(token) or []

        for job in jobs:
            description = process_job_description(job)
            text = md(description)

            extraction = chain.invoke(input=text)["data"]

            all_jobs_data.append({
                "description": description,
                "extraction": extraction
            })

    with open(output_file, "w") as json_file:
        json.dump(all_jobs_data, json_file, indent=4)

In [None]:
# Board tokens to process; load_broad_tokens returns an empty list when the
# file is missing.
tokens = load_broad_tokens(file_path="Data/board_tokens.txt")

In [None]:
# Positional split of the token list: first 50% for training, the next 30%
# for validation, the remaining 20% for testing (file order, no shuffling).
total_tokens = len(tokens)
split_1, split_2 = int(total_tokens * 0.5), int(total_tokens * 0.8)

tokens_train, tokens_val, tokens_test = (
    tokens[:split_1],
    tokens[split_1:split_2],
    tokens[split_2:],
)

In [None]:
# Build the training set: fetch, extract and write Data/train.json.
process_jobs_and_save(tokens_train, extractor, output_file="Data/train.json")

In [None]:
# Build the validation set: fetch, extract and write Data/val.json.
process_jobs_and_save(tokens_val, extractor, output_file="Data/val.json")

In [None]:
# Build the test set: fetch, extract and write Data/test.json.
process_jobs_and_save(tokens_test, extractor, output_file="Data/test.json")