In [2]:
import json
import os
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:

# Read settings from a .env file (via python-dotenv), then pick up the API key
# from the process environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Chat model used by every completion request in this notebook.
MODEL = 'gpt-4o-mini'
# Pass the key explicitly so the value read above is the one the client uses.
# When api_key is None the client falls back to OPENAI_API_KEY itself, so
# behaviour is unchanged if the variable is missing.
openai = OpenAI(api_key=api_key)

In [4]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """A fetched web page reduced to its title, visible text and link targets.

    Attributes set by ``__init__``:
        url: The address that was requested.
        body: Raw response bytes.
        title: Text of the ``<title>`` element, or "No title found".
        text: Text of ``<body>`` with script/style/img/input elements removed,
            or "" when the document has no body.
        links: Every non-empty ``href`` found on an ``<a>`` element, unmodified
            (so relative links stay relative).
    """

    def __init__(self, url, timeout=30):
        """Download ``url`` with an HTTP GET and parse the response as HTML.

        Args:
            url: Address to fetch.
            timeout: Seconds handed to ``requests.get`` as its timeout.

        Raises:
            Any exception raised by ``requests.get`` propagates unchanged.
        """
        self.url = url
        # Without a timeout, requests can wait indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or wraps nested tags; fall back
        # to the placeholder so get_contents() never prints "None".
        title = soup.title.string if soup.title else None
        self.title = title if title is not None else "No title found"
        if soup.body:
            # Drop elements that contribute no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        # Skip anchors with a missing or empty href.
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Fetch the LinkedIn working-student listing once when this cell runs (this
# performs a network request). `web` is not referenced by the later cells
# shown here, which build their own Website objects.
web = Website("https://www.linkedin.com/jobs/working-student-jobs-germany/")

In [6]:
# System prompt for the link-selection call: tells the model what to pick and
# shows the JSON shape that the calling code later reads ("links" -> list of
# objects with "type" and "url").
link_system_prompt = (
    "You are provided with a list of links for a job on a webpage. "
    "You are able to decide which of the links would be most relevant to the field of Data engineering or Artificial Intelligence, "
    "such as links for working student jobs in those fields.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
# The example must itself be valid JSON (comma, not colon, between the "type"
# and "url" members), otherwise the model is shown a malformed template.
link_system_prompt += """
{
    "links": [
        {"type": "Artificial Intelligence", "url": "https://full.url/goes/here/about"},
        {"type": "Data Engineering", "url": "https://another.full.url/careers"}
    ]
}
"""

In [17]:
def get_links_user_prompt(website):
    """Build the user message that asks the model to pick relevant links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    sections = [
        f"Here is the list of links on the job website of {website.url} - ",
        "please decide which of these are relevant web links that are visible on the company website for a job in the field of Data Engineering or Artificial Intelligence\n",
        "please remove the links which are of no use such as privacy and agreements.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(sections)

In [15]:
def get_links(url):
    """Ask the model which links on the page at ``url`` are relevant.

    Fetches the page, sends its links to the chat model with the
    link-selection system prompt, and requests a JSON-object response.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply decoded with ``json.loads``.
    """
    page = Website(url)
    complete = openai.chat.completions.create
    completion = complete(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(page)},
        ],
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [40]:
def get_all_details(url):
    """Collect the landing page text plus the text of each model-selected link.

    Args:
        url: Address of the job listing page.

    Returns:
        A single string: a "Landing page:" section followed by one section per
        selected link, each headed by the link's "type".

    Raises:
        Exceptions from fetching any page or from ``get_links`` propagate.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    # The model's JSON may omit "links"; treat that as "nothing selected"
    # instead of failing with KeyError.
    for link in links.get("links", []):
        result += f"\n\n{link['type']}\n"
        # The page's links may be relative; urljoin resolves them against the
        # landing page and returns absolute URLs unchanged.
        result += Website(urljoin(url, link["url"])).get_contents()
    return result

In [41]:
# System prompt for the final summarisation call. Built from adjacent string
# literals so each sentence boundary keeps its separating space (the earlier
# backslash continuation produced "markdown.Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant job openings from a job website "
    "and creates a short description about the company, a job description and requirements for prospective employees. "
    "Respond in markdown. "
    "Include details of jobs and requirements if you have the information."
)

In [47]:
def get_brochure_user_prompt(url, max_chars=5_000):
    """Build the user message for the job-summary request.

    Args:
        url: Address of the job listing page to summarise.
        max_chars: Upper bound on the length of the returned prompt. Pass
            ``None`` to disable truncation. Defaults to 5,000 characters.

    Returns:
        The instruction text followed by the scraped page contents, cut to
        at most ``max_chars`` characters.
    """
    user_prompt = "You are looking at a job website\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a description of each job and their relevant content in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so None means "no limit".
    return user_prompt[:max_chars]

In [50]:
def create_jobs(url):
    """Generate a job summary for ``url`` and render it as Markdown.

    Sends the scraped page contents to the chat model together with the
    summary system prompt, then displays the reply in the notebook.

    Args:
        url: Address of the job listing page.

    Returns:
        None. The result is shown via ``display``.
    """
    complete = openai.chat.completions.create
    completion = complete(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(url)},
        ],
    )
    markdown_text = completion.choices[0].message.content
    display(Markdown(markdown_text))

In [None]:
# Run the whole pipeline for the LinkedIn working-student listing: scrape the
# page, let the model choose relevant links, scrape those, and display the
# generated Markdown summary. The "Found links:" line in the output is printed
# by get_all_details.
create_jobs("https://www.linkedin.com/jobs/working-student-jobs-germany/")

Found links: {'links': [{'type': 'Data Engineering', 'url': 'https://de.linkedin.com/jobs/view/data-analytics-working-student-at-beat81-4131023280?position=21&pageNum=0&refId=lONQAnJxkKO%2F6yPdGGnnyw%3D%3D&trackingId=6vcC95RNMzSoyUkjyOHmcw%3D%3D'}, {'type': 'Artificial Intelligence', 'url': 'https://de.linkedin.com/jobs/view/machine-learning-working-student-hybrid-at-dida-4015219243?position=15&pageNum=0&refId=lONQAnJxkKO%2F6yPdGGnnyw%3D%3D&trackingId=I9hJH93N62xKdpSYWQClWA%3D%3D'}, {'type': 'Data Engineering', 'url': 'https://de.linkedin.com/jobs/view/working-student-data-science-f-m-d-at-ocell-4126026022?position=26&pageNum=0&refId=lONQAnJxkKO%2F6yPdGGnnyw%3D%3D&trackingId=1mlEmxEef3p%2FV7M6UPAfMQ%3D%3D'}, {'type': 'Artificial Intelligence', 'url': 'https://de.linkedin.com/jobs/view/working-student-internship-at-software-defined-automation-3837962677?position=53&pageNum=0&refId=lONQAnJxkKO%2F6yPdGGnnyw%3D%3D&trackingId=aW7520pmXgVkfKLJV4PfuQ%3D%3D'}]}
