# Phase 0: Testing

### Set up ChromeDriver with selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

### Navigate ChromeDriver to LinkedIn log-in page

In [2]:
driver = webdriver.Chrome()
# Navigate to the LinkedIn login page
driver.get('https://www.linkedin.com/login')

### Log-in to LinkedIn with account credentials

- Read Log-in details from external txt file 

In [5]:
login_txt = open('LinkedIn - Logging Details.txt', 'r')
MY_USERNAME = login_txt.readline().replace('Username: ', '').replace('\n', '')
MY_PASSWORD = login_txt.readline().replace('Password: ', '')

In [4]:
# Enter your email address and password
driver.find_element(By.ID, 'username').send_keys(MY_USERNAME)
driver.find_element(By.ID, 'password').send_keys(MY_PASSWORD)

In [5]:
# Submit the login form
driver.find_element(By.CSS_SELECTOR, '.login__form_action_container button').click()

### Navigate to saved LinkedIn jobs

In [11]:
JOBS_URL = 'https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED'
#driver.get(JOBS_URL)

### Use BeautifulSoup to get source content of jobs page

In [13]:
from bs4 import BeautifulSoup

In [8]:
# Get the page source
page_source = driver.page_source
# Parse the HTML using Beautiful Soup
soup = BeautifulSoup(page_source, 'html.parser')

- Close ChromeDriver once done scrapping

In [9]:
driver.quit()

- Fetch all information on saved jobs

In [10]:
soup.find_all('li', {'class': 'reusable-search__result-container'})[0]

<li class="reusable-search__result-container">
<!-- --><!-- -->
<div class="LPKKpHLTVmvhcAuVeGRdqmzEouqLVTRJFMpI" data-chameleon-result-urn="urn:li:fsd_jobPosting:3803021338" data-view-name="search-entity-result-universal-template">
<div class="linked-area flex-1 cursor-pointer">
<div class="QObBLEvMEqbGOrNtcUZdRvpvJcjOQKhljNvA">
<div class="entity-result__universal-image">
<div class="display-flex align-items-center">
<!-- -->
<a aria-hidden="true" class="app-aware-link scale-down" data-test-app-aware-link="" href="https://www.linkedin.com/jobs/view/3803021338/?refId=0b4f4200-cf2a-40c5-ba62-e57d19a3d7f0&amp;trackingId=1FiBZR7WQJ%2BusIHRlJ27%2FQ%3D%3D&amp;trk=flagship3_job_home_savedjobs" tabindex="-1">
<div class="ivm-image-view-model">
<div class="ivm-view-attr__img-wrapper display-flex">
<!-- --> <img alt="Volvo Group" class="ivm-view-attr__img--centered EntityPhoto-square-3 evi-image lazy-image ember-view" height="48" id="ember33" loading="lazy" src="https://media.licdn.com/dms/ima

- Now, it would be good to write a function that takes an instance of job and returns the following:

> Job Title (Position)
>
> Job Company
>
> Job Location
>
> Job Status (actively recruiting, etc)
>
> Posted Date (we need to convert this to proper timestamp from string [eg. Posted 12h ago])

In [14]:
import re

def jobmatch_get_job_details(job_html):

    # Fetch only text from the job html code
    job_text = job_html.get_text()
    # Replace '\n' tag with ';' so it's easier to return instances of proper text (; is not expected to be used anywere in job parameters)
    job_text = job_text.replace('\n', ';')
    # Compile regex pattern to filter out '-' symbols
    pattern = r'(?<=;)([^;]+)(?=;)'
    job_pars = re.findall(pattern, job_text)
    # Remove reduntant whitespaves left after treating bs4.Tag
    out = [i for i in job_pars if not i.isspace()]

    # Sometimes jobs don't have info on status, so we'll have to handle that as well
    if len(out) == 5:
        return {'Position' : out[0], 'Company' : out[1], 'Location' : out[2], 'Status' : out[3], 'Posted' : out[4].replace('Posted ', '')}
    else:
        return {'Position' : out[0], 'Company' : out[1], 'Location' : out[2], 'Status' : 'N/A', 'Posted' : out[3].replace('Posted ', '')}

In [12]:
job = soup.find_all('li', {'class': 'reusable-search__result-container'})[5]
jobmatch_get_job_details(job)

{'Position': 'ML Engineer / Data Scientist',
 'Company': 'Zurich Insurance',
 'Location': 'Cracow (Hybrid)',
 'Status': ' Actively recruiting',
 'Posted': '2w ago'}

- Now, let's collect this info for all companies

In [None]:
import pandas as pd
job_details_table = [jobmatch_get_job_details(i) for i in soup.find_all('li', {'class': 'reusable-search__result-container'})]
job_details_df = pd.DataFrame.from_dict(job_details_table)
job_details_df

- Since we will use this table in future, let's convert the posted column to proper timestamp

In [14]:
from datetime import datetime, timedelta

def convert_time_lag(row):
    # Compile pattern to search for 'Nd/h ago' patter in Posted column
    match = re.match(r'(\d+)([dh])\s+ago', row['Posted'])
    # If matched
    if match:
        # Split the results to two groups - number of days/hours and time unit
        value, unit = int(match.group(1)), match.group(2)
        # If number of days specified
        if unit == 'd':
            return pd.to_datetime(datetime.now() - timedelta(days=value))
        # If number of hours specified
        elif unit == 'h':
            return pd.to_datetime(datetime.now() - timedelta(hours=value))
    # If the pattern wasn't matched at all return NaN
    return pd.NaT

- We probably won't need intra-day date precision, but will leave it for now in case of further project development

In [15]:
job_details_df['Date Posted'] = job_details_df.apply(convert_time_lag, axis=1)
job_details_df

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted
0,Professional Data Analyst,Volvo Group,Wrocław,Actively recruiting,3d ago,2024-01-17 11:44:26.308786
1,Junior Data Analyst with Python,Unilever,Warsaw (On-site),Actively recruiting,4d ago,2024-01-16 11:44:26.317035
2,Data Analyst in ESG Team,ING Hubs Poland,Warsaw Metropolitan Area (Hybrid),,4d ago,2024-01-16 11:44:26.318015
3,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,1d ago,2024-01-19 11:44:26.318015
4,Model Validator Quantitative Analyst MRMC US,UBS,Cracow (On-site),,2d ago,2024-01-18 11:44:26.318015
5,ML Engineer / Data Scientist,Zurich Insurance,Cracow (Hybrid),Actively recruiting,2w ago,NaT
6,Consultant (Senior) for Data Analytics & Data ...,BearingPoint,"Prague, Czechia (Hybrid)",,Actively recruiting,NaT
7,Data Scientist - Generative AI,IBM,Prague (On-site),Actively recruiting,3d ago,2024-01-17 11:44:26.319021
8,Data Scientist - 100% Remoto,Walters People,Barcelona (Remote),Actively recruiting,4d ago,2024-01-16 11:44:26.319021
9,Planning Data Scientist - Consultant,Accenture España,Barcelona,Actively recruiting,4d ago,2024-01-16 11:44:26.319021


- Now that we have all useful job details, it would be nice to have their direct links as well

In [16]:
job.find('a', {'class': 'app-aware-link scale-down'}, href=True)['href']

'https://www.linkedin.com/jobs/view/3785362967/?refId=0b4f4200-cf2a-40c5-ba62-e57d19a3d7f0&trackingId=BYuIlq5PROKjl1ELImutkA%3D%3D&trk=flagship3_job_home_savedjobs'

- Seems that it should be enough to just fetch first hyperlink

In [17]:
def jobmatch_get_job_link(job_html):
    return job_html.find('a', {'class': 'app-aware-link scale-down'}, href=True)['href']

In [18]:
job_details_df['Hyperlink'] = [jobmatch_get_job_link(i) for i in soup.find_all('li', {'class': 'reusable-search__result-container'})]
job_details_df

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted,Hyperlink
0,Professional Data Analyst,Volvo Group,Wrocław,Actively recruiting,3d ago,2024-01-17 11:44:26.308786,https://www.linkedin.com/jobs/view/3803021338/...
1,Junior Data Analyst with Python,Unilever,Warsaw (On-site),Actively recruiting,4d ago,2024-01-16 11:44:26.317035,https://www.linkedin.com/jobs/view/3805491270/...
2,Data Analyst in ESG Team,ING Hubs Poland,Warsaw Metropolitan Area (Hybrid),,4d ago,2024-01-16 11:44:26.318015,https://www.linkedin.com/jobs/view/3805428308/...
3,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,1d ago,2024-01-19 11:44:26.318015,https://www.linkedin.com/jobs/view/3809397054/...
4,Model Validator Quantitative Analyst MRMC US,UBS,Cracow (On-site),,2d ago,2024-01-18 11:44:26.318015,https://www.linkedin.com/jobs/view/3808472757/...
5,ML Engineer / Data Scientist,Zurich Insurance,Cracow (Hybrid),Actively recruiting,2w ago,NaT,https://www.linkedin.com/jobs/view/3785362967/...
6,Consultant (Senior) for Data Analytics & Data ...,BearingPoint,"Prague, Czechia (Hybrid)",,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3715499136/...
7,Data Scientist - Generative AI,IBM,Prague (On-site),Actively recruiting,3d ago,2024-01-17 11:44:26.319021,https://www.linkedin.com/jobs/view/3803082749/...
8,Data Scientist - 100% Remoto,Walters People,Barcelona (Remote),Actively recruiting,4d ago,2024-01-16 11:44:26.319021,https://www.linkedin.com/jobs/view/3805442679/...
9,Planning Data Scientist - Consultant,Accenture España,Barcelona,Actively recruiting,4d ago,2024-01-16 11:44:26.319021,https://www.linkedin.com/jobs/view/3775612118/...


## Putting it all together

> Read all jobs on current page

In [19]:
job_page = soup.find_all('li', {'class': 'reusable-search__result-container'})

> Compile DataFrame with all required entry for job on current page

In [20]:
def jobmatch_page_tab(page):

    # Read basic details of each job on current page
    job_details = [jobmatch_get_job_details(i) for i in page]
    job_details = pd.DataFrame.from_dict(job_details)

    # Convert posted time lag to TimeStamp
    job_details['Date Posted'] = job_details.apply(convert_time_lag, axis=1)

    # Attach page URLs
    job_details['URL'] = [jobmatch_get_job_link(i) for i in page]

    return job_details

In [21]:
jobmatch_page_tab(job_page)

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted,URL
0,Professional Data Analyst,Volvo Group,Wrocław,Actively recruiting,3d ago,2024-01-17 11:44:57.473810,https://www.linkedin.com/jobs/view/3803021338/...
1,Junior Data Analyst with Python,Unilever,Warsaw (On-site),Actively recruiting,4d ago,2024-01-16 11:44:57.475804,https://www.linkedin.com/jobs/view/3805491270/...
2,Data Analyst in ESG Team,ING Hubs Poland,Warsaw Metropolitan Area (Hybrid),,4d ago,2024-01-16 11:44:57.475804,https://www.linkedin.com/jobs/view/3805428308/...
3,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,1d ago,2024-01-19 11:44:57.476798,https://www.linkedin.com/jobs/view/3809397054/...
4,Model Validator Quantitative Analyst MRMC US,UBS,Cracow (On-site),,2d ago,2024-01-18 11:44:57.477795,https://www.linkedin.com/jobs/view/3808472757/...
5,ML Engineer / Data Scientist,Zurich Insurance,Cracow (Hybrid),Actively recruiting,2w ago,NaT,https://www.linkedin.com/jobs/view/3785362967/...
6,Consultant (Senior) for Data Analytics & Data ...,BearingPoint,"Prague, Czechia (Hybrid)",,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3715499136/...
7,Data Scientist - Generative AI,IBM,Prague (On-site),Actively recruiting,3d ago,2024-01-17 11:44:57.477795,https://www.linkedin.com/jobs/view/3803082749/...
8,Data Scientist - 100% Remoto,Walters People,Barcelona (Remote),Actively recruiting,4d ago,2024-01-16 11:44:57.478832,https://www.linkedin.com/jobs/view/3805442679/...
9,Planning Data Scientist - Consultant,Accenture España,Barcelona,Actively recruiting,4d ago,2024-01-16 11:44:57.479829,https://www.linkedin.com/jobs/view/3775612118/...


- Now that we have a framework to compile a simple table from one job page, we will run the function on all pages of saved jobs
- It looks like LinkeIn organizes their URLs in such a way that it's enough to add '&start=10' to go to next set of jobs from 'SAVED'

## Reading number of saved pages to scan

### FIXME - Fix reading the numebr of pages (read just the max index int in the CSS object)

- The following HTML object contains all of numbers (and links) to indexes of job pages

In [41]:
pages_number_obj = soup.find('ul', {'class' : 'artdeco-pagination__pages artdeco-pagination__pages--number'})

- If there are more than 10 pages, linked in will squeeze them with "...", therefore we will just read the last page number

In [45]:
last_page_str = str(pages_number_obj.find_all('li')[-1].find('span')) # Find the number of last saved jobs page
last_page_idx = int(re.findall(r'\d+', last_page_str)[0]) # Read the number from <span> tag and convert it to number
print(f'Number of saved jobs pages found: {last_page_idx}')

Number of saved jobs pages found: 11


- Now we have to create a simple mechanism to compile URL for each job page

In [46]:
saved_jobs_pages = []

for i in range(last_page_idx):
    p_url = JOBS_URL + f'&start={i*10}'
    print(p_url)
    saved_jobs_pages.append(p_url)

https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=0
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=10
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=20
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=30
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=40
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=50
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=60
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=70
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=80
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=90
https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED&start=100


- Since each of URL is working fine (including the first 'reduntant' one) we will just use this to compile full table of saved jobs

- This concludes Tesing Phase, we can move on with 'production-ready' code for organizing the jobs in one table

# Phase 1: Dataset of saved linked-in jobs

## Constructing full job dataset from all saved pages

In [30]:
import time
from tqdm import tqdm
class JobMatch:
    # Define global link to the saved jobs
    JOBS_URL = 'https://www.linkedin.com/my-items/saved-jobs/?cardType=SAVED'

    def __init__(self, username, password):

        self.username = username
        self.password = password


    def login(self):

        print('JobMatch: Logging to linked-in, please wait...')
    
        self.driver = webdriver.Chrome()
        time.sleep(2)
        # Navigate to the LinkedIn login page
        self.driver.get('https://www.linkedin.com/login')
        # Wait unti lthe page loads
        time.sleep(5)
        # Enter your email address and password
        self.driver.find_element(By.ID, 'username').send_keys(self.username)
        time.sleep(1)
        self.driver.find_element(By.ID, 'password').send_keys(self.password)
        time.sleep(1)

        # Submit the login form
        self.driver.find_element(By.CSS_SELECTOR, '.login__form_action_container button').click()

        ###TODO: Please add handling of the incorect log-in scenario
        print('JobMatch: Log-in completed!')

    def get_page_soup(self, page_url):

        self.driver.get(page_url)
        # Wait until the page is loaded properly
        time.sleep(5)
        # Get the page source
        page_source = self.driver.page_source
        # Parse the HTML using Beautiful Soup
        soup = BeautifulSoup(page_source, 'html.parser')

        return soup

    def get_n_job_pages(self):
        
        # Get the soup source of tha base page of saved jobs
        base_soup = self.get_page_soup(JOBS_URL)
        # Scan the source for pages index placeholder
        pages_idx_obj = base_soup.find('ul', {'class' : 'artdeco-pagination__pages artdeco-pagination__pages--number'})
        last_page_str = str(pages_idx_obj.find_all('li')[-1].find('span')) # Find the number of last saved jobs page
        # Check how many jobs pages are there
        n_job_pages = int(re.findall(r'\d+', last_page_str)[0])

        return int(n_job_pages)

    def get_saved_pages_urls(self):

        n_job_pages = self.get_n_job_pages()

        pages_urls = []

        for i in range(n_job_pages):
            p_url = JOBS_URL + f'&start={i*10}'
            pages_urls.append(p_url)

        return pages_urls

    def get_job_details(self, job_soup):
        job_details = job_soup.find('div', class_="job-details-jobs-unified-top-card__primary-description-without-tagline mb2").get_text().replace('\n', '')
        job_details = re.split(r'\s·\s', job_details)

        job_position = job_soup.find('div', class_="display-flex justify-space-between flex-wrap").get_text().replace('\n', '').strip()
        out = [job_position] + job_details
        # Sometimes jobs don't have info on status, so we'll have to handle that as well
        
        return {'Position' : out[0], 'Company' : out[1], 'Location' : out[2], 'Posted' : out[3].replace('Posted ', ''), 'Status' : out[4]}

    def convert_time_lag(self, row):
        # Compile pattern to search for 'Nd/h ago' patter in Posted column
        match = re.match(r'(\d+)([dhwmo]{1,2})\sago', row['Posted'])
        # If matched
        if match:
            # Split the results to two groups - number of days/hours and time unit
            value, unit = int(match.group(1)), match.group(2)
            # If number of days specified
            if unit == 'd':
                return pd.to_datetime(datetime.now() - timedelta(days=value))
            # If number of hours specified
            elif unit == 'h':
                return pd.to_datetime(datetime.now() - timedelta(hours=value))
            # If number of weeks given
            elif unit == 'w':
                return pd.to_datetime(datetime.now() - timedelta(days=(value*7)))
            # If number of months given
            elif unit == 'mo':
                return pd.to_datetime(datetime.now() - timedelta(days=(value*30)))
        # If the pattern wasn't matched at all return NaN
        return pd.NaT

    def get_job_link(self, job_html):
        return job_html.find('a', {'class': 'app-aware-link scale-down'}, href=True)['href']

    def jobmatch_page_tab(self, page):

        # Read basic details of each job on current page
        job_details = [self.get_job_details(i) for i in page]
        job_details = pd.DataFrame.from_dict(job_details)

        # Convert posted time lag to TimeStamp
        job_details['Date Posted'] = job_details.apply(self.convert_time_lag, axis=1)
    
        # Attach page URLs
        job_details['URL'] = [self.get_job_link(i) for i in page]

        return job_details

    def compile_jobs_dataframe(self):
        print('JobMatch: Compiling DataFrame with all saved Linke-in jobs, please wait..')

        frames = []

        saved_pages_urls = self.get_saved_pages_urls()
        # For each pages of saved jobs in linkedin
        for page_url in tqdm(saved_pages_urls, desc='Process'):
            #print(f'Scanning job page no.{page_idx}..')
            page_soup = self.get_page_soup(page_url)
            job_page = page_soup.find_all('li', {'class': 'reusable-search__result-container'})
            # Get the full data table for current page
            frames.append(self.jobmatch_page_tab(job_page))

        # Exit chrome session once all done
        self.driver.quit()

        print('JobMatch: DataFrame completed!')
        # Concatenate the tables to one table and return it
        return pd.concat(frames).reset_index(drop=True)

> Step 1: Initialize JobMatch

In [31]:
li_match = JobMatch(username = MY_USERNAME,
                   password = MY_PASSWORD)

> Step 2: Logging in

In [32]:
li_match.login()

JobMatch: Logging to linked-in, please wait...
JobMatch: Log-in completed!


> Step 3: Construct table of characteristics for every job saved

In [None]:
jobs_df = li_match.compile_jobs_dataframe()

In [51]:
jobs_df

Unnamed: 0,Position,Company,Location,Status,Posted,Date Posted,URL
0,Data Scientist,Gartner,Barcelona (On-site),Actively recruiting,1d ago,2024-01-19 22:00:05.670473,https://www.linkedin.com/jobs/view/3809116486/...
1,Professional Data Analyst,Volvo Group,Wrocław,Actively recruiting,4d ago,2024-01-16 22:00:05.670473,https://www.linkedin.com/jobs/view/3803021338/...
2,Junior Data Analyst with Python,Unilever,Warsaw (On-site),Actively recruiting,5d ago,2024-01-15 22:00:05.670473,https://www.linkedin.com/jobs/view/3805491270/...
3,Data Analyst in ESG Team,ING Hubs Poland,Warsaw Metropolitan Area (Hybrid),,5d ago,2024-01-15 22:00:05.670473,https://www.linkedin.com/jobs/view/3805428308/...
4,Machine Learning Scientist (m/w/d),Bayer,Berlin (On-site),Actively recruiting,2d ago,2024-01-18 22:00:05.670473,https://www.linkedin.com/jobs/view/3809397054/...
...,...,...,...,...,...,...,...
98,Applied LLM Researcher,Creative Chaos,Poland (Remote),,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3736915128/...
99,Senior Data Scientist (Advanced Analytics),SoftServe,Poland (Remote),Actively recruiting,10h ago,2024-01-20 12:01:20.161897,https://www.linkedin.com/jobs/view/3731852302/...
100,Data Scientist (Risk),Revolut,Poland (Remote),Actively recruiting,1w ago,NaT,https://www.linkedin.com/jobs/view/3723326089/...
101,Consultant Quant,AWALEE CONSULTING by Canopee Group,Paris (Hybrid),,Actively recruiting,NaT,https://www.linkedin.com/jobs/view/3725416038/...


- Let's save above table in csv file for later

In [107]:
jobs_df.to_csv('JobMatch - Saved Linked-in Jobs.csv')

- Phase 1 is completed, now we have a nice table with all information required per each saved job
- Now we can move to phase 2, quantifying how fell job fit candidate CV by using one of LLMs available 