## Scraping jobs listing from multiple sites over multiple pages and save them to separate .csv files first, then try to combine them

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, date
import re
import pandas as pd
import pickle
import os

### Specify the search term

In [2]:
search_term = 'python'

In [3]:
page = 1
urls = [
    'https://www.careerjet.pl/{search_term}-praca.html?radius=0&p={page}&sort=date',
    'https://it.pracuj.pl/praca/{search_term};kw?sc=0&pn={page}'
]

In [4]:
sites = [
    'careerjet',
    'pracuj'
]

### Model for listings

In [5]:
class Listing(BaseModel):
    title: str
    link: str
    company: str = None
    location: str = None
    salary: str = None
    description: str = None
    date_added: str | datetime = None
    skills: List[str] = None
    search_term: str = 'python'
    date_searched: date = date.today()

### Data extraction functions

#### careerjet

In [6]:
def extract_careerjet(soup):
    page_objects = []
    try:
        listings = soup.find('ul', attrs={'class': 'jobs'})
        listings_arr = listings.find_all('article', attrs={'class': 'job'})
    except:
        return page_objects
    for listing in listings_arr:
        try:
            listing_link = listing.get('data-url')
            link = f'careerjet.pl{listing_link}'
        except:
            link = ''
            
        try:
            title = listing.header.h2.getText().strip()
        except:
            continue

        try:
            company = listing.find('p', attrs={'class': 'company'}).getText()
        except:
            company = ''

        try:
            location = listing.find('ul', attrs={'class':'location'}).li.getText().strip()
        except:
            location = ''

        try:
            description = listing.find('div', attrs={'class':'desc'}).getText().strip()
        except:
            description = ''
        
        obj = Listing(title = title, link = link, company = company,
                 description = description, location = location)
        page_objects.append(obj)
    print(f'careerjet page {page} objects: ', len(page_objects))
    return page_objects

#### pracuj.pl

In [7]:
# regexes
pracuj_date = re.compile(r'Opublikowana: (.*)')

In [8]:
def extract_pracuj(soup):
    page_objects = []
    try:
        listings = soup.find('div', attrs={'data-test': 'section-offers'})
        listings_arr = listings.find_all(class_='c1fljezf')
    except:
        return page_objects
    
    for listing in listings_arr:
        try:
            listing_link = listing.find('a', attrs={'data-test': 'link-offer'}).get('href')
        except:
            listing_link = ''
        
        try:
            listing_title = listing.find('h2', attrs={'data-test': 'offer-title'}).a.getText()
        except:
            listing_title = ''
    
        try:
            listing_date = listing.find('p', attrs={'data-test':'text-added'}).getText()
            listing_date = pracuj_date.match(listing_date)[0]
        except:
            listing_date = ''
    
        try:
            listing_skills = []
            skills = listing.find_all('span', attrs={'data-test': 'technologies-item'})
            for skill in skills:
                skill = skill.getText()
                listing_skills.append(skill)
        except:
            listing_skills = []
    
        try:
            listing_company = listing.find('a', attrs={'data-test':'text-company-name'}).getText()
        except:
            listing_company = ''
    
        try:
            listing_location = listing.find('h5', attrs={'data-test':'text-regon'}).getText()
        except:
            listing_location = ''
    
        try:
            listing_description = listing.find('span', attrs={'class':'t126uk2l'}).getText()
        except:
            listing_description = ''
    
        try:
            listing_salary = listing.find('span', attrs={'data-test':'offer-salary'}).getText()
        except:
            listing_salary = ''
            
        obj = Listing(title = listing_title, link = listing_link,
                      date_added = listing_date, skills = listing_skills,
                     company = listing_company, location = listing_location,
                     description = listing_description, salary  = listing_salary)
        page_objects.append(obj)
        
    print(f'pracuj.pl page {page} objects: ', len(page_objects))
    return page_objects   

In [9]:
extractors = [
    extract_careerjet,
    extract_pracuj
]

### Create appropriate folders for saving data
search_term folder -> date_folder -> combined/individual/fluff -> files

In [10]:
import os
today = date.today()
print(today)
path = r'../scraping_results/' + search_term + '/' + str(today)
if not os.path.exists(path):
    os.makedirs(path)
path += '/'

2023-11-28


### Looping through the first 5 pages, within looping through the urls / providers

In [11]:
no_pages = 2


In [12]:
listings_objects = []
for url_index, url in enumerate(urls):
    site_listings = []
    for page in range(1,no_pages+1):
        page_url = url.format(search_term=search_term, page=page)
        page_response = requests.get(page_url)
        page_html = page_response.text
        page_soup = bs(page_html, 'html.parser')
        page_objects = extractors[url_index](page_soup)
        site_listings.extend(page_objects)
    # save listings for one site only
    site_listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
    path_ind = path + r'individual'
    if not os.path.exists(path_ind):
        os.makedirs(path_ind)
    path_ind += '/'
    site_listings_pd.to_csv(path_ind + f'{sites[url_index]}_{search_term}_{no_pages}pages_{today}.csv')
    listings_objects.extend(site_listings)
print('all listings: ', len(listings_objects))

careerjet page 1 objects:  20
careerjet page 2 objects:  20
pracuj.pl page 1 objects:  50
pracuj.pl page 2 objects:  50
all listings:  140


C:\Users\User\AppData\Local\Temp\ipykernel_14992\1180034899.py:12: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  site_listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [13]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                                        title  \
0               Monitoring Specialist (m/f/d)   
1                             DevOps Engineer   
2  Support Technical Account Manager Cracow 2   
3  Support Technical Account Manager Cracow 4   
4             Senior Test Automation Engineer   

                                                link               company  \
0  careerjet.pl/jobad/pldc3276bab0934fae6b8ed4a63...                Damovo   
1  careerjet.pl/jobad/plf6bbc4809d8132eb7cb141aa7...  Check Point Software   
2  careerjet.pl/jobad/plc11b328054f74b124c5bcce38...                         
3  careerjet.pl/jobad/pl7b01a6ee86fab684983acb9ec...                         
4  careerjet.pl/jobad/pl69754da3dc6d16cc5d7a88bf3...                  Hays   

                location salary  \
0  Warszawa, mazowieckie   None   
1  Warszawa, mazowieckie   None   
2    Kraków, małopolskie   None   
3    Kraków, małopolskie   None   
4    Kraków, małopolskie   None   

                         

C:\Users\User\AppData\Local\Temp\ipykernel_14992\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [14]:
listings_pd.columns

Index(['title', 'link', 'company', 'location', 'salary', 'description',
       'date_added', 'skills', 'search_term', 'date_searched'],
      dtype='object')

In [15]:
sites_string = '_'.join(sites)
path_comb = path + 'combined'
if not os.path.exists(path_comb):
    os.makedirs(path_comb)
path_comb += '/'
listings_pd.to_csv(path_comb + f'{sites_string}_{search_term}_{today}.csv')

In [16]:
# str(today)

In [17]:
print(path_ind)
print(path_comb)

./scraping_results/python/2023-11-28/individual/
./scraping_results/python/2023-11-28/combined/
