In [49]:
import pandas as pd
import numpy as np
import os

- Load saved data from Phase 1

In [50]:
# Load the jobs table saved in Phase 1: the first CSV column is used as the index
# and 'Date Posted' is parsed into datetimes.
df = pd.read_csv('JobMatch - Saved Linked-in Jobs.csv', index_col=0, parse_dates=['Date Posted'])
# Preview the first rows.
df.head()

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted,URL
0,Data Scientist,Gartner,Barcelona (On-site),Actively recruiting,1d ago,2024-01-19 22:10:46.046425,https://www.linkedin.com/jobs/view/3809116486/...
1,Professional Data Analyst,Volvo Group,Wrocław,Actively recruiting,4d ago,2024-01-16 22:10:46.046425,https://www.linkedin.com/jobs/view/3803021338/...
2,Junior Data Analyst with Python,Unilever,Warsaw (On-site),Actively recruiting,5d ago,2024-01-15 22:10:46.046425,https://www.linkedin.com/jobs/view/3805491270/...
3,Data Analyst in ESG Team,ING Hubs Poland,Warsaw Metropolitan Area (Hybrid),,5d ago,2024-01-15 22:10:46.046425,https://www.linkedin.com/jobs/view/3805428308/...
4,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,2d ago,2024-01-18 22:10:46.046425,https://www.linkedin.com/jobs/view/3809397054/...


- Read Linked-in login details

In [51]:
# Read the LinkedIn credentials from a local text file whose first line is
# "Username: <value>" and whose second line is "Password: <value>".
# The context manager guarantees the file handle is closed once both lines are read
# (the name `login_txt` stays bound, now to a closed file).
with open('LinkedIn - Logging Details.txt', 'r') as login_txt:
    MY_USERNAME = login_txt.readline().replace('Username: ', '').replace('\n', '')
    # Remove the line break from the password too, exactly as for the username;
    # otherwise a trailing '\n' would be typed into the login form with the password.
    MY_PASSWORD = login_txt.readline().replace('Password: ', '').replace('\n', '')

## Phase 2: Quantifying jobs fit with LLMs

### 2.1 Get job descriptions from each link in the database

In [52]:
from bs4 import BeautifulSoup
import requests
df.URL[1]

'https://www.linkedin.com/jobs/view/3803021338/?refId=61655319-bf6d-4c8f-8a35-eb979d5982e5&trackingId=a8uI0BBSQz260lAdlP%2BEQA%3D%3D&trk=flagship3_job_home_savedjobs'

In [8]:
from job_match_utils import *

# Build the JobMatch helper with the credentials read earlier.
# NOTE(review): JobMatch is presumably provided by `from job_match_utils import *` — confirm.
li_match = JobMatch(username = MY_USERNAME,
                    password = MY_PASSWORD)

li_match.login()

# Open a single job posting in the helper's browser to prototype the parsing on.
li_match.driver.get('https://www.linkedin.com/jobs/view/3785362967/?refId=0b4f4200-cf2a-40c5-ba62-e57d19a3d7f0&trackingId=BYuIlq5PROKjl1ELImutkA%3D%3D&trk=flagship3_job_home_savedjobs')

# Parse the rendered page HTML with BeautifulSoup.
page_source = li_match.driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

def get_job_details(job_soup):
    """Extract the headline fields of a job posting from its parsed page.

    Args:
        job_soup: parsed page object exposing ``find(name, class_=...)`` whose
            result exposes ``get_text()`` (e.g. a BeautifulSoup document).

    Returns:
        dict with the keys 'Position', 'Company', 'Location', 'Posted' and 'Status'.
        Fields that the posting does not provide (typically 'Status') are ``None``
        instead of raising ``IndexError``.
    """
    # Local import: no visible cell of the notebook imports `re`.
    import re

    # The description line holds its fields separated by " · ".
    job_details = job_soup.find('div', class_="job-details-jobs-unified-top-card__primary-description-without-tagline mb2").get_text().replace('\n', '')
    job_details = re.split(r'\s·\s', job_details)

    job_position = job_soup.find('div', class_="display-flex justify-space-between flex-wrap").get_text().replace('\n', '').strip()

    # Some postings carry no status (or fewer fields still): pad with None so the
    # positional unpacking below always succeeds; extra trailing fields are ignored.
    out = ([job_position] + job_details + [None] * 5)[:5]
    position, company, location, posted, status = out

    return {'Position' : position,
            'Company' : company,
            'Location' : location,
            'Posted' : posted.replace('Posted ', '') if posted is not None else None,
            'Status' : status}

get_job_details(soup)

In [53]:
# Start a Chrome session; `driver` is the module-level browser that get_job_desc uses.
# NOTE(review): `webdriver` and `By` are not imported in any visible cell — presumably
# they arrive through `from job_match_utils import *`; confirm or import them explicitly.
driver = webdriver.Chrome()
# Navigate to the LinkedIn login page
driver.get('https://www.linkedin.com/login')

# Enter your email address and password
driver.find_element(By.ID, 'username').send_keys(MY_USERNAME)
driver.find_element(By.ID, 'password').send_keys(MY_PASSWORD)

# Submit the login form
driver.find_element(By.CSS_SELECTOR, '.login__form_action_container button').click()

# Open the saved job with index label 1 to check that the logged-in session works.
JOBS_URL = df.URL[1]
driver.get(JOBS_URL)

- Get the job description

In [54]:
import time
def get_job_desc(url, wait_seconds=1):
    """Fetch a job posting with the module-level `driver` and return its description.

    Args:
        url: address of the job posting to open.
        wait_seconds: pause after navigation, before the page source is read,
            to give the page time to render. Defaults to 1 second.

    Returns:
        Text of the ``<div id="job-details">`` element with newlines removed.

    Raises:
        ValueError: if the page has no ``job-details`` element, so the failing URL
            is reported instead of an opaque AttributeError on ``None``.
    """
    # Navigate to job url
    driver.get(url)
    # Give the page time to render before grabbing its source
    time.sleep(wait_seconds)
    # Parse the HTML using Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    details = soup.find('div', id = 'job-details')
    if details is None:
        raise ValueError(f"No 'job-details' element found on page: {url}")
    # Return job description
    return details.get_text().replace('\n', '')

In [55]:
from tqdm import tqdm

# Scrape the description of every saved job, in DataFrame order, showing a progress bar.
job_desc = []
job_desc.extend(get_job_desc(url) for url in tqdm(df.URL))

100%|████████████████████████████████████████████████████████████████████████████████| 103/103 [05:46<00:00,  3.36s/it]


In [56]:
df['Job Description'] = job_desc

- Let's see the description of last job offer

In [None]:
df.tail(1)['Job Description'].values[0]

### 2.2 Load baseline document (CV) and fetch its text

In [58]:
import PyPDF2

def read_pdf(file_name):
    """Return the text of all pages of a PDF, concatenated in page order.

    Args:
        file_name: path of the PDF file to read.

    Returns:
        A single string with the extracted text of each page appended one after
        another, with no separator inserted between pages.
    """
    # Binary mode is required by the PDF reader; extraction happens while the
    # file is still open.
    with open(file_name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]

    return ''.join(page_texts)

In [None]:
# Build the baseline text from the CV: split the extracted text into lines and drop the last 6.
# NOTE(review): the 6 is tied to this particular PDF's layout — re-check if the CV file changes.
baseline = read_pdf('J_Miklaszewski - CV.pdf').split('\n')[:-6] # get rid of the policy statement
baseline = ' '.join(baseline) # ensure the whitespace between inputs
baseline

### 2.3 Text quantification - LLM embeddings

In [60]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

> Load the BERT LLM Model

In [43]:
# Load the pretrained 'bert-base-uncased' tokenizer and model; both are used as
# module-level globals by encode_embeddings.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

tokenizer_config.json: 100%|████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 2.43kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 43.8kB/s]
vocab.txt: 100%|████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 3.44MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 6.59MB/s]
model.safetensors: 100%|████████████████████████████████████████████████████████████| 440M/440M [00:38<00:00, 11.5MB/s]


> Use BERT to vectorize the text and return its embeddings

In [44]:
# Function to encode a document into a vector
def encode_embeddings(document):
    """Embed text with the module-level `tokenizer` and `model`.

    The token vectors of the model's last hidden state are averaged into one
    vector per input text (mean pooling).

    Args:
        document: a text, or a list of texts, accepted by `tokenizer`.

    Returns:
        numpy array with one row per input text and one column per hidden unit.

    Notes:
        ``truncation=True`` cuts each text at the tokenizer's maximum length, so
        only the beginning of a long document contributes to its embedding.
    """
    inputs = tokenizer(document, return_tensors='pt', truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only. When several texts of different lengths are
    # encoded together, the shorter ones are padded and a plain mean over the
    # sequence axis would mix the padding vectors into their embeddings.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.detach().numpy()

In [70]:
encode_embeddings(baseline).flatten().shape

(768,)

### 2.4 Goodness of fit - similarity between two texts

> We will now use the embeddings to compute the similarity between two documents

- To measure how close two texts are, we will start with cosine similarity (1 - cosine distance), which measures the angle between two vectors in a multidimensional space

In [71]:
def cosine_similarity(doc1, doc2):
    """Cosine similarity between the embeddings of two texts.

    Each text is embedded with `encode_embeddings` and flattened to a 1-D vector;
    the result is one minus the cosine distance between the two vectors.
    """
    first_vec, second_vec = [encode_embeddings(text).flatten() for text in (doc1, doc2)]
    distance = cosine(first_vec, second_vec)
    return 1 - distance

In [62]:
doc1 = baseline
doc2 = df.tail(1)['Job Description'].values[0]

In [73]:
cosine_similarity(doc2, doc1)

0.7888648509979248

In [74]:
def get_cosine_similarity(row):
    """Row-wise helper: similarity between the module-level `baseline` text and
    the row's 'Job Description' entry, as computed by `cosine_similarity`."""
    job_text = row['Job Description']
    return cosine_similarity(baseline, job_text)

In [76]:
df['Cos_similarity'] = df.apply(get_cosine_similarity, axis=1)

- We would like to complement this measure with a probability-based dissimilarity metric, such as KL divergence or its symmetric alternative, the Jensen-Shannon distance
- Because this would require extracting low-dimensional features from our text embeddings with PCA, or running a Latent Dirichlet Allocation analysis of document topics, and then arbitrarily deciding the number of topics to map onto a lower-dimensional plane, we will keep the cosine similarity measure for now and check whether the results are meaningful

In [88]:
from scipy.special import kl_div
from scipy.spatial.distance import jensenshannon

#### Sort the results by similarity

In [93]:
df.sort_values(by='Cos_similarity', ascending=False)

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted,URL,Job Description,Cos_similarity
13,Senior Data Scientist,EY,Warsaw (On-site),Actively recruiting,1w ago,NaT,https://www.linkedin.com/jobs/view/3791931828/...,About the job Let us...,0.926469
17,Senior Data Scientist,EY,Wrocław (On-site),Be an early applicant,1w ago,NaT,https://www.linkedin.com/jobs/view/3791932692/...,About the job Let us...,0.926469
39,"Senior Data Scientist, Delivery",Bolt,Berlin,Actively recruiting,1w ago,NaT,https://www.linkedin.com/jobs/view/3789363462/...,About the job We are...,0.925513
23,Data Scientist,Aimsun,Barcelona (Hybrid),,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3804071816/...,About the job Locati...,0.924985
88,Data Scientist,Winged IT,Warsaw (Remote),Actively recruiting,1mo ago,NaT,https://www.linkedin.com/jobs/view/3782331765/...,About the job Employ...,0.923561
...,...,...,...,...,...,...,...,...,...
44,Data Scientist,SDG Group España,Spain (Remote),,1w ago,NaT,https://www.linkedin.com/jobs/view/3750112164/...,About the job ¡Hola ...,0.652834
4,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,2d ago,2024-01-18 22:10:46.046425,https://www.linkedin.com/jobs/view/3809397054/...,About the job Bei Ba...,0.636589
63,Data Scientist ML/AI,Swedish Pensions Agency,Stockholm (Hybrid),,2w ago,NaT,https://www.linkedin.com/jobs/view/3797818312/...,About the job Pensio...,0.618433
26,Quantitative Researcher,The Cocktail,Madrid (Hybrid),,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3774729027/...,About the job Así so...,0.595127


- Now we need to add the methods for dissimilarity measurements to the JobMatch pipeline
- Once that's done, we will also add:

> LDA / PCA analysis of the baseline (CV) document

> Jensen-Shannon divergence analysis between the baseline document and description of each linked-in job