## Scraping jobs listing from multiple sites over multiple pages and save them to separate .csv files first, then try to combine them

In [15]:
import requests
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, date
import re
import pandas as pd
import pickle

### Specify the search term

In [16]:
search_term = 'python'

In [17]:
page = 1
urls = [
    'https://www.careerjet.pl/{search_term}-praca.html?radius=0&p={page}&sort=date',
    'https://it.pracuj.pl/praca/{search_term};kw?sc=0&pn={page}'
]

In [18]:
sites = [
    'careerjet',
    'pracuj'
]

### Model for listings

In [19]:
class Listing(BaseModel):
    title: str
    link: str
    company: str = None
    location: str = None
    salary: str = None
    description: str = None
    date_added: str | datetime = None
    skills: List[str] = None
    search_term: str = 'python'
    date_searched: date = date.today()

### Data extraction functions

#### careerjet

In [26]:
def extract_careerjet(soup):
    page_objects = []
    listings = soup.find('ul', attrs={'class': 'jobs'})
    listings_arr = listings.find_all('article', attrs={'class': 'job'})
    for listing in listings_arr:
        try:
            listing_link = listing.get('data-url')
            link = f'careerjet.pl{listing_link}'
        except:
            link = ''
            
        try:
            title = listing.header.h2.getText().strip()
        except:
            continue

        try:
            company = listing.find('p', attrs={'class': 'company'}).getText()
        except:
            company = ''

        try:
            location = listing.find('ul', attrs={'class':'location'}).li.getText().strip()
        except:
            location = ''

        try:
            description = listing.find('div', attrs={'class':'desc'}).getText().strip()
        except:
            description = ''
        
        obj = Listing(title = title, link = link, company = company,
                 description = description, location = location)
        page_objects.append(obj)
    print(f'careerjet page {page} objects: ', len(page_objects))
    return page_objects

#### pracuj.pl

In [21]:
# regexes
pracuj_date = re.compile(r'Opublikowana: (.*)')

In [22]:
def extract_pracuj(soup):
    page_objects = []
    listings = soup.find('div', attrs={'data-test': 'section-offers'})
    listings_arr = listings.find_all(class_='c1fljezf')
    
    for listing in listings_arr:
        try:
            listing_link = listing.find('a', attrs={'data-test': 'link-offer'}).get('href')
        except:
            listing_link = ''
        
        try:
            listing_title = listing.find('h2', attrs={'data-test': 'offer-title'}).a.getText()
        except:
            listing_title = ''
    
        try:
            listing_date = listing.find('p', attrs={'data-test':'text-added'}).getText()
            listing_date = pracuj_date.match(listing_date)[0]
        except:
            listing_date = ''
    
        try:
            listing_skills = []
            skills = listing.find_all('span', attrs={'data-test': 'technologies-item'})
            for skill in skills:
                skill = skill.getText()
                listing_skills.append(skill)
        except:
            listing_skills = []
    
        try:
            listing_company = listing.find('a', attrs={'data-test':'text-company-name'}).getText()
        except:
            listing_company = ''
    
        try:
            listing_location = listing.find('h5', attrs={'data-test':'text-regon'}).getText()
        except:
            listing_location = ''
    
        try:
            listing_description = listing.find('span', attrs={'class':'t126uk2l'}).getText()
        except:
            listing_description = ''
    
        try:
            listing_salary = listing.find('span', attrs={'data-test':'offer-salary'}).getText()
        except:
            listing_salary = ''
            
        obj = Listing(title = listing_title, link = listing_link,
                      date_added = listing_date, skills = listing_skills,
                     company = listing_company, location = listing_location,
                     description = listing_description, salary  = listing_salary)
        page_objects.append(obj)
        
    print(f'pracuj.pl page {page} objects: ', len(page_objects))
    return page_objects   

In [27]:
extractors = [
    extract_careerjet,
    extract_pracuj
]

### Looping through the first 5 pages, within looping through the urls / providers

In [24]:
no_pages = 5
today = date.today()
print(today)

2023-11-25


In [28]:
listings_objects = []
for url_index, url in enumerate(urls):
    site_listings = []
    for page in range(1,no_pages+1):
        page_url = url.format(search_term=search_term, page=page)
        page_response = requests.get(page_url)
        page_html = page_response.text
        page_soup = bs(page_html, 'html.parser')
        page_objects = extractors[url_index](page_soup)
        site_listings.extend(page_objects)
    # save listings for one site only
    site_listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
    site_listings_pd.to_csv(f'{sites[url_index]}_{search_term}_{no_pages}pages_{today}.csv')
    listings_objects.extend(site_listings)
print('all listings: ', len(listings_objects))

careerjet page 1 objects:  20
careerjet page 2 objects:  20
careerjet page 3 objects:  20
careerjet page 4 objects:  20
careerjet page 5 objects:  20
pracuj.pl page 1 objects:  50
pracuj.pl page 2 objects:  50
pracuj.pl page 3 objects:  50
pracuj.pl page 4 objects:  50
pracuj.pl page 5 objects:  50
all listings:  350


C:\Users\User\AppData\Local\Temp\ipykernel_20812\2036401872.py:12: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  site_listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [29]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                                         title  \
0  Climate Risk - Regulatory Reporting Analyst   
1                            analityk big data   
2                       Software Test Engineer   
3                       Senior DevOps Engineer   
4              Senior Test Automation Engineer   

                                                link    company  \
0  careerjet.pl/jobad/plf5c63585be33eed77ba668c07...  Citigroup   
1  careerjet.pl/jobad/pla642499eec94e311324e64bbe...  Santander   
2  careerjet.pl/jobad/pl798a9ca8cf8affde11c0eaa00...     Harman   
3  careerjet.pl/jobad/pl3ee93a7b0e2e7e038e538e5e2...     Luxoft   
4  careerjet.pl/jobad/pl6c1316b98726ba2a26939895d...     Luxoft   

                location salary  \
0  Warszawa, mazowieckie   None   
1  Warszawa, mazowieckie   None   
2          Łódź, łódzkie   None   
3  Wrocław, dolnośląskie   None   
4  Warszawa, mazowieckie   None   

                                         description date_added skills  \
0  As a suc

C:\Users\User\AppData\Local\Temp\ipykernel_20812\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [30]:
listings_pd.columns

Index(['title', 'link', 'company', 'location', 'salary', 'description',
       'date_added', 'skills', 'search_term', 'date_searched'],
      dtype='object')

In [34]:
# listings_pd.to_csv(f'2pages_career_pracuj.csv')
sites_string = '_'.join(sites)
# print(companies)
listings_pd.to_csv(f'{sites_string}_{search_term}_{today}.csv')

### Fast pandas operations

In [48]:
df = pd.read_csv('2pages_career_pracuj.csv', index_col=0)
df.head()

Unnamed: 0,title,link,company,location,description,date_added,skills
0,Software Development Engineer in Test,careerjet.pl/jobad/plbaec24941a296374129f0a069...,,,,,
1,Software Development Engineer in Test,careerjet.pl/jobad/pl0aebd21ce203844c2227dd949...,,,,,
2,Software Development Engineer in Test,careerjet.pl/jobad/plc9f07a5262f4157450ec5fe06...,,,,,
3,Software Development Engineer in Test,careerjet.pl/jobad/pl8ca043620f35a2cb25fd68afb...,,,,,
4,Software Development Engineer in Test,careerjet.pl/jobad/pl6063d8bd71361d945b2d89f9e...,,,,,
