## Scraping jobs listing from multiple sites over multiple pages and save them to separate .csv files first, then try to combine them

In [17]:
import requests
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime
import re
import pandas as pd
from datetime import datetime

### Specify the search term

In [2]:
search_term = 'python'

In [14]:
page = 1
urls = [
    'https://www.careerjet.pl/{search_term}-praca.html?radius=0&p={page}&sort=date',
    'https://it.pracuj.pl/praca/{search_term};kw?sc=0&pn={page}'
]

### Model for listings

In [36]:
class Listing(BaseModel):
    title: str
    link: str
    company: str = None
    location: str = None
    salary: int = None
    description: str = None
    date_added: str | datetime = None
    skills: List[str] = None

### Data extraction functions

In [39]:
def extract_careerjet(soup):
    page_objects = []
    listings = soup.find('ul', attrs={'class': 'jobs'})
    listings_arr = listings.find_all('article', attrs={'class': 'job'})
    for listing in listings_arr:
        listing_link = listing.get('data-url')
        link = f'careerjet.pl{listing_link}'

        title = listings.header.h2.getText().strip()

        obj = Listing(title = title, link = link)
        page_objects.append(obj)
    print(f'careerjet page {page} objects: ', len(page_objects))
    return page_objects

In [40]:
def extract_pracuj(soup):
    page_objects = []
    listings = soup.find('div', attrs={'data-test': 'section-offers'})
    listings_arr = listings.find_all(class_='c1fljezf')
    
    for listing in listings_arr:
        try:
            listing_link = listing.find('a', attrs={'data-test': 'link-offer'}).get('href')
        except:
            listing_link = ''

        try:
            listing_title = listing.find('h2', attrs={'data-test': 'offer-title'}).a.getText()
        except:
            listing_title = ''
            
        obj = Listing(title = listing_title, link = listing_link)
        page_objects.append(obj)
        
    print(f'pracuj.pl page {page} objects: ', len(page_objects))
    return page_objects   

In [41]:
extractors = [
    extract_careerjet,
    extract_pracuj
]

### Looping through the first 5 pages, within looping through the urls / providers

In [42]:
listings_objects = []
for url_index, url in enumerate(urls):
    for page in range(1,2+1):
        page_url = url.format(search_term=search_term, page=page)
        page_response = requests.get(page_url)
        page_html = page_response.text
        page_soup = bs(page_html, 'html.parser')
        page_objects = extractors[url_index](page_soup)
        listings_objects.extend(page_objects)
print('all listings: ', len(listings_objects))

careerjet page 1 objects:  20
careerjet page 2 objects:  20
pracuj.pl page 1 objects:  50
pracuj.pl page 2 objects:  50
all listings:  140


In [43]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                                   title  \
0  Software Development Engineer in Test   
1  Software Development Engineer in Test   
2  Software Development Engineer in Test   
3  Software Development Engineer in Test   
4  Software Development Engineer in Test   

                                                link company location  \
0  careerjet.pl/jobad/plbaec24941a296374129f0a069...    None     None   
1  careerjet.pl/jobad/pl0aebd21ce203844c2227dd949...    None     None   
2  careerjet.pl/jobad/plc9f07a5262f4157450ec5fe06...    None     None   
3  careerjet.pl/jobad/pl8ca043620f35a2cb25fd68afb...    None     None   
4  careerjet.pl/jobad/pl6063d8bd71361d945b2d89f9e...    None     None   

  description date_added skills  
0        None       None   None  
1        None       None   None  
2        None       None   None  
3        None       None   None  
4        None       None   None  


C:\Users\User\AppData\Local\Temp\ipykernel_12588\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [44]:
listings_pd.columns

Index(['title', 'link', 'company', 'location', 'description', 'date_added',
       'skills'],
      dtype='object')

In [46]:
listings_pd.to_csv(f'2pages_career_pracuj.csv')

### Fast pandas operations

In [48]:
df = pd.read_csv('2pages_career_pracuj.csv', index_col=0)
df.head()

Unnamed: 0,title,link,company,location,description,date_added,skills
0,Software Development Engineer in Test,careerjet.pl/jobad/plbaec24941a296374129f0a069...,,,,,
1,Software Development Engineer in Test,careerjet.pl/jobad/pl0aebd21ce203844c2227dd949...,,,,,
2,Software Development Engineer in Test,careerjet.pl/jobad/plc9f07a5262f4157450ec5fe06...,,,,,
3,Software Development Engineer in Test,careerjet.pl/jobad/pl8ca043620f35a2cb25fd68afb...,,,,,
4,Software Development Engineer in Test,careerjet.pl/jobad/pl6063d8bd71361d945b2d89f9e...,,,,,
