In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [37]:
from unstructured.partition.html import partition_html
from rich import inspect, print
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import tiktoken

from pydantic import BaseModel, Field, computed_field
from typing import List, Optional

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from io import BytesIO
import time
import sys
import os

# Project checkout that contains ``src/applybot``. The machine-specific
# location remains the default; set APPLYBOT_ROOT to run the notebook from a
# different checkout without editing this cell.
PROJECT_ROOT = os.environ.get("APPLYBOT_ROOT", "/mnt/arrakis/sietch/Resume/")
SRC_DIR = os.path.join(PROJECT_ROOT, "src")

# Relative paths used later in the notebook (e.g. the bookmarks file) are
# resolved against the project root.
os.chdir(PROJECT_ROOT)

# Register the source directory only once: re-running this cell must not keep
# growing sys.path, and an absolute entry keeps working even if the working
# directory changes later.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


from applybot import utils

# Client object returned by the project helper; what it connects to is
# defined in applybot.utils.
client = utils.get_client()

In [41]:
class JobDescription(BaseModel):
    """Job description extracted from a job posting."""

    # NOTE: pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so they are runtime text rather than
    # plain documentation — change them deliberately.
    title: str = Field(..., description="Title of the job posting.")
    company: str = Field(..., description="Company of the job posting.")
    location: str = Field(..., description="Location of the job posting.")
    pay_range: str = Field(..., description="Pay range of the job posting.")
    description: str = Field(..., description="Description of the job posting.")
    roles: List[str] = Field(..., description="Roles the job posting will entail.")
    qualifications: List[str] = Field(..., description="Qualifications required for the job posting.")
    stand_outs: Optional[List[str]] = Field(None, description="Qualities that would make an application stand out.")

    @computed_field
    @property
    def full_text(self) -> str:
        """Full text of the job description"""
        # Renders the posting as Markdown: a title line, location / pay range,
        # then "Description", "Roles" and "Requirements" sections. A
        # "Stand outs" section is appended only when there is at least one
        # stand-out, so a missing or empty list no longer leaves a dangling
        # heading. The result starts and ends with a newline.
        #
        # The explicit @property makes the attribute-style access
        # (``jd.full_text``) visible to IDEs and type checkers instead of
        # relying on computed_field wrapping the function implicitly.

        def bullet_list(items: List[str]) -> str:
            # One "- item " line per entry; each bullet ends with a single
            # trailing space (existing output format).
            return '\n'.join(f"- {item} " for item in items)

        sections = [
            f"# {self.title} at {self.company}",
            f"**Location:** {self.location}\n**Pay range:** {self.pay_range}",
            f"## Description\n\n{self.description}",
            f"## Roles\n\n{bullet_list(self.roles)}",
            f"## Requirements\n\n{bullet_list(self.qualifications)}",
        ]
        if self.stand_outs:
            sections.append(f"## Stand outs\n\n{bullet_list(self.stand_outs)}")

        # Sections are separated by one blank line.
        return "\n" + "\n\n".join(sections) + "\n"

    def __str__(self) -> str:
        # Short human-readable label, e.g. for logs and progress output.
        return f"'{self.title}' at {self.company}"


class MaybeJobDescription(BaseModel):
    # Envelope for one extraction attempt: either `result` holds the parsed
    # posting, or `error` / `message` explain why there is none.
    # (Documented with comments on purpose: pydantic would copy a class
    # docstring into the model's JSON schema.)
    result: Optional[JobDescription] = Field(
        default=None,
        description="Job description extracted from a job posting.",
    )
    error: Optional[str] = Field(
        default=None,
        description="Error message if the job description could not be extracted or if the text was not a job posting.",
    )
    message: Optional[str] = Field(
        default=None,
        description="A helpful message to the user.",
    )

    def __bool__(self) -> bool:
        # Truthy exactly when a job description was extracted.
        has_result = self.result is not None
        return has_result

In [4]:
# Read the bookmarked URLs, one per line. Surrounding whitespace (including
# the newline that readlines() would keep) is stripped so each entry is a
# clean URL. Lines are not filtered, so list positions still match the file's
# line order.
with open("src/applybot/jobs_bookmarks.txt") as f:
    urls = [line.strip() for line in f]

In [5]:
# Paths to the chromedriver executable and the Brave browser binary on this
# machine.
DRIVER="/usr/local/chromedriver"
BINARY="/usr/bin/brave-browser"

# TODO: headless mode
# Start a Chrome WebDriver session that launches the Brave binary set above.
opts = Options()
opts.binary_location = BINARY
# NOTE(review): DRIVER is never passed to Service(), so the chromedriver path
# above is currently unused — confirm whether Service(executable_path=DRIVER)
# was intended.
service = Service()
driver = webdriver.Chrome(service=service, options=opts)
# Implicit wait of 10 (seconds in Selenium's API) for this session.
driver.implicitly_wait(10)

In [16]:
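# Disabled scratch loop: printed an encoded length for every bookmarked page.
# NOTE: `encoding` is not defined in any visible cell (presumably a tiktoken
# encoding) — create it before re-enabling this loop.
# for url in tqdm(urls):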
#     driver.get(url)

#     soup = BeautifulSoup(driver.page_source, 'html.parser')
#     text = soup.text.encode('utf-8').decode('utf-8')
    
#     print(len(encoding.encode(text)))





In [34]:
# Load one bookmarked posting. The entry is stripped because lines read from
# the bookmarks file may still carry their trailing newline.
url = urls[7].strip()
driver.get(url)
# Short fixed pause after navigation — presumably to let client-side
# rendering finish before the page source is read.
time.sleep(1.5)


# Parse the rendered page and take the document's text.
soup = BeautifulSoup(driver.page_source, 'lxml')
# Collapse the text onto a single line: trim every line, drop those that are
# empty or whitespace-only, and join the rest with two spaces. (The previous
# encode/decode UTF-8 round trip returned the same string and was removed.)
text = '  '.join(
    stripped for stripped in map(str.strip, soup.text.split("\n")) if stripped
)

In [42]:
# Hand the flattened page text to the project's extraction helper, asking for
# a MaybeJobDescription (passed as `cls`).
# NOTE(review): presumably this sends the prompt and text to the client
# created earlier and returns a `cls` instance — confirm in applybot.utils.
JD = utils.extract_object(
    prompt="Extract the job description from the text below.",
    text=text,
    cls=MaybeJobDescription,
)

In [43]:
# Show the rendered posting, or the reason why there is none. `result` is
# optional and stays None when no job description was extracted, so
# dereferencing it unconditionally would raise AttributeError.
if JD.result is not None:
    print(JD.result.full_text)
else:
    print(JD.error or JD.message or "No job description was extracted.")