In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv
import os
import time
from datetime import datetime, timedelta
import pandas as pd
import sqlite3
import json

In [3]:
load_dotenv()
user = os.getenv("UID")
password = os.getenv("PASSWORD")

#JOB TYPE: 
# Graduate 
# Internsip 
# Temporary 
# Summer
job_type = "Internship"

<h2>Logging into the Job Portal</h2>

In [4]:
driver = webdriver.Chrome()
driver.get("https://www.cedars.hku.hk/netjobs")
main_page = driver.current_window_handle

# Click on Student Login
student_login = driver.find_element(By.XPATH, "//a[text()='HKU Student']")
student_login.click()

#Input email and log in
email_input = WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.ID, "email"))
)
login_button = driver.find_element(By.ID, "login_btn")

email_input.send_keys(user)
login_button.click()

#Input password
password_input = WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.ID, "passwordInput"))
)
sign_in_button = driver.find_element(By.ID, "submitButton")
password_input.send_keys(password)
sign_in_button.click()

#Trust Page
continue_button = WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.ID, "idSIButton9"))
)
continue_button.click()
time.sleep(3)
#Stay Singed in page
stay_button = WebDriverWait(driver, timeout=10).until(
    EC.presence_of_element_located((By.XPATH, "//input[@type='submit' and @id='idSIButton9']"))
)
stay_button.click()



<h2>Job Portal Page</h2>

**Navigate to Internship Page**

In [5]:
check_box = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.XPATH, "//input[@type='checkbox']"))
)
check_box.click()
agree_btn = WebDriverWait(driver,10).until(
    EC.element_to_be_clickable((By.ID, "btn-agree"))
)
agree_btn.click()

In [6]:
time.sleep(1)
internship_button = driver.find_element(By.XPATH, f"//a[text()='{job_type} (']")
internship_button.click()

In [7]:

yesterday = datetime.now() - timedelta(1)
yesterday = datetime.strftime(yesterday, '%Y-%m-%d')
yesterday

'2025-07-02'

In [8]:
def clean_data(data):
    # data: list of strings corresponding to job details
    for i in range(len(data)):
        data[i] = data[i].split(':\n')
    return data
        

In [9]:
def get_data(driver):
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))

    for window_handle in driver.window_handles:
        if window_handle != main_page:
            driver.switch_to.window(window_handle)
            break
    
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@id='content']"))
    )
    extract_data = driver.find_elements(By.XPATH, "//div[@id='content']//div[contains(@class, 'crow')]")
    data = [i.text for i in extract_data if i.text.strip()]
    
    driver.close()
    driver.switch_to.window(main_page)
    return data

In [10]:
details = []
jobs = driver.find_elements(By.XPATH, "//table[@id='search_jobs']/tbody/tr")

for i, job in enumerate(jobs):
    job.click()
    data = clean_data(get_data(driver))
    details.append(data)

driver.close()
    

In [11]:

def format_json(job):
    job_json = {}   
    for item in job:
        if len(item) >= 2:  
            key = item[0]
            value = item[1]
            job_json[key] = value
    return json.dumps(job_json)  

In [13]:
details[2]

[['Job ID', 'G2404501'],
 ['Posting Date', '2025-06-26'],
 ['Employment Type', 'Internship'],
 ['Employment Mode', 'Part-time'],
 ['Company Name (Eng)', 'EternityX Marketing Technology Limited'],
 ['Company Name (Chi)', '恆力數宇科技有限公司'],
 ['Nature of Business', 'Advertising / Public Relations / Marketing'],
 ['Origin of Company Ownership', 'Hong Kong'],
 ['No. of Employees', '50-100 employees'],
 ['Position Offered', 'AI-driven Operations Internship (MarTech industry)'],
 ['No. of Vacancies', '1'],
 ['Job Nature', 'IT / Programming'],
 ['Job Description',
  'Overview\nWe are seeking university students interested in entering the marketing technology field with an AI background. You will support the Operations Manager with day-to-day tasks in enhancing business efficiency and workflow automation.\n \nJob Responsibilities',
  'Conduct market research and effectively communicate business findings and insights\nAssist the Operations Manager in designing, developing and testing automation work

**Setting up the Database**

Data types: NULL INTEGER REAL TEXT BLOB

In [3]:
#Set up a connection
conn = sqlite3.connect('jobs.db')
cursor = conn.cursor()
create_table_query = """
CREATE TABLE IF NOT EXISTS jobs (
    JobID TEXT PRIMARY KEY,
    PostingDate TEXT,
    EmploymentType TEXT,
    EmplymentMode TEXT,
    CompanyNameEng TEXT,
    CompanyNameChi TEXT, 
    NatureOfBusiness TEXT,
    OriginOfCompanyOwnership TEXT,
    NoOfEmployees TEXT,
    PositionOffered TEXT,
    NoOfVacancies TEXT,
    JobNature TEXT,
    JobDescription TEXT,
    BasicSalary TEXT,
    WorkLocatin TEXT, 
    WorkMode TEXT,
    FieldsOfStudyRequired TEXT,
    LevelOfAward TEXT,
    NoOfWorkingHoursPerDay INTEGER,
    NoOfWorkingHoursPerWeek INTEGER,
    ClosingDataForApplication TEXT,
    ApplicationShouldBeSubmittedVia TEXT,
    ApplicationDocumentsRequired TEXT
);
"""

cursor.execute(create_table_query)

<sqlite3.Cursor at 0x1daca6f8640>

In [12]:


import nbformat

# Load the notebook
with open('get_jobs.ipynb') as f:
    notebook = nbformat.read(f, as_version=4)

# Extract code cells
code_cells = [cell.source for cell in notebook.cells if cell.cell_type == 'code']

# Write to a .py file
with open('output_script.py', 'w') as f:
    for cell in code_cells:
        f.write(cell + '\n\n')
