# Scraping latest offers from careerjet.pl

In [19]:
import requests
import json
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from datetime import datetime, date
from typing import List, Optional
import pickle

In [2]:
search_term = 'python'

### Get listings from a single page

In [3]:
url = f'https://www.careerjet.pl/{search_term}-praca.html?radius=0&p=1&sort=date'

In [4]:
response = requests.get(url)
html = response.text
soup = bs(html, 'html.parser')

### Pickle the response to limit future calls in case of testing

In [21]:
with open ('careerjet_python_soup.pkl', 'wb') as outp:
    pickle.dump(soup, outp, pickle.HIGHEST_PROTOCOL)

#### Getting the response from pickle

In [23]:
with open ('careerjet_python_soup.pkl', 'rb') as inp:
    soup = pickle.load(inp)
    print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="pl">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="telephone=no" name="format-detection"/>
  <link crossorigin="" href="https://static.careerjet.org" rel="preconnect"/>
  <link href="https://static.careerjet.org" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.google-analytics.com" rel="preconnect"/>
  <link href="//www.google-analytics.com" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.googletagmanager.com" rel="preconnect"/>
  <link href="//www.googletagmanager.com" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.google.com" rel="preconnect"/>
  <link href="//www.google.com" rel="dns-prefetch"/>
  <link crossorigin="" href="https://cvimg.careerjet.net" rel="preconnect"/>
  <link href="https://cvimg.careerjet.net" rel="dns-prefetch"/>
  <link crossorigin="" href="https://logoimg.careerjet.net" rel="preconnect"/>
  <link href="https://logoimg.careerjet.net" rel=

In [5]:
print(soup.prettify())
with open ("careerjet1.html", "w", encoding='utf-8') as file:
    file.write(str(soup.prettify()))

<!DOCTYPE html>
<html dir="ltr" lang="pl">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="telephone=no" name="format-detection"/>
  <link crossorigin="" href="https://static.careerjet.org" rel="preconnect"/>
  <link href="https://static.careerjet.org" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.google-analytics.com" rel="preconnect"/>
  <link href="//www.google-analytics.com" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.googletagmanager.com" rel="preconnect"/>
  <link href="//www.googletagmanager.com" rel="dns-prefetch"/>
  <link crossorigin="" href="//www.google.com" rel="preconnect"/>
  <link href="//www.google.com" rel="dns-prefetch"/>
  <link crossorigin="" href="https://cvimg.careerjet.net" rel="preconnect"/>
  <link href="https://cvimg.careerjet.net" rel="dns-prefetch"/>
  <link crossorigin="" href="https://logoimg.careerjet.net" rel="preconnect"/>
  <link href="https://logoimg.careerjet.net" rel=

In [6]:
listings = soup.find('ul', attrs={'class': 'jobs'})
listings

<ul class="jobs" data-page="1">
<li>
<article class="job clicky" data-as="0" data-o="0" data-url="/jobad/plf5c63585be33eed77ba668c07d0b2574">
<header>
<h2>
<a href="/jobad/plf5c63585be33eed77ba668c07d0b2574" title="Climate Risk - Regulatory Reporting Analyst">
        Climate Risk - Regulatory Reporting Analyst
      </a>
</h2>
</header>
<ul class="actions notclicky">
<li>
<form accept-charset="utf-8" action="/jobad/plf5c63585be33eed77ba668c07d0b2574/save" autocomplete="off" class="save-job submit-xhr" id="save-job-plf5c63585be33eed77ba668c07d0b2574" method="post">
<input name="csrf_token" type="hidden" value="7deb113d564528069f3f5dba56f64eaf94ff5e5e"/>
<input name="xht" type="hidden" value="#save-job-plf5c63585be33eed77ba668c07d0b2574"/>
<input name="mode" type="hidden" value="1"/>
<button class="" data-cjutd="f42c0a592446654ab2342360f636d808be37f13afddce631c1208d600596c6e21faa884165f96b3de76740cdaad815f513004a9bce76e57ecbb3ccf0cb5dd950e91363665f047702b2b3706d0895a2ed63dd8633c56ced3da

### Basic data (title, link) extraction

In [11]:
class Listing(BaseModel):
    title: str
    link: str
    company: str = None
    location: str = None
    salary: str = None
    description: str = None
    date_added: str | datetime = None
    skills: List[str] = None
    search_term: str = 'python'
    date_searched: date = date.today()

In [12]:
listings_arr = listings.find_all('article', attrs={'class': 'job'})
listings_arr[:5]

[<article class="job clicky" data-as="0" data-o="0" data-url="/jobad/plf5c63585be33eed77ba668c07d0b2574">
 <header>
 <h2>
 <a href="/jobad/plf5c63585be33eed77ba668c07d0b2574" title="Climate Risk - Regulatory Reporting Analyst">
         Climate Risk - Regulatory Reporting Analyst
       </a>
 </h2>
 </header>
 <ul class="actions notclicky">
 <li>
 <form accept-charset="utf-8" action="/jobad/plf5c63585be33eed77ba668c07d0b2574/save" autocomplete="off" class="save-job submit-xhr" id="save-job-plf5c63585be33eed77ba668c07d0b2574" method="post">
 <input name="csrf_token" type="hidden" value="7deb113d564528069f3f5dba56f64eaf94ff5e5e"/>
 <input name="xht" type="hidden" value="#save-job-plf5c63585be33eed77ba668c07d0b2574"/>
 <input name="mode" type="hidden" value="1"/>
 <button class="" data-cjutd="f42c0a592446654ab2342360f636d808be37f13afddce631c1208d600596c6e21faa884165f96b3de76740cdaad815f513004a9bce76e57ecbb3ccf0cb5dd950e91363665f047702b2b3706d0895a2ed63dd8633c56ced3dae56cc182c21b2bad2e2401

In [17]:
listing_link = listings_arr[0].get('data-url')
link = f'careerjet.pl{listing_link}'
link

'careerjet.pl/jobad/plbaec24941a296374129f0a069cb19df4'

In [19]:
listings_arr[0].header.h2.getText().strip()

'Software Development Engineer in Test'

In [13]:
test_listing = listings_arr[0]
test_title = test_listing.header.h2.getText().strip()
print('title: ', test_title)
test_company  = test_listing.find('p', attrs={'class': 'company'}).getText()
print('company: ', test_company)
test_location = test_listing.find('ul', attrs={'class':'location'}).li.getText().strip()
print('location: ', test_location)
test_description = test_listing.find('div', attrs={'class':'desc'}).getText().strip()
print('desc: ', test_description)

title:  Climate Risk - Regulatory Reporting Analyst
company:  Citigroup
location:  Warszawa, mazowieckie
desc:  As a successful Candidate you will be part of Stress Testing and Risk Analytics team.  The role offers exposure to the enterprise-wide and regional level Climate risk stress testin…


In [17]:
listings_objects = []
for listing in listings_arr:
    
    listing_link = listing.get('data-url')
    link = f'careerjet.pl{listing_link}'

    title = listing.header.h2.getText().strip()

    company = listing.find('p', attrs={'class': 'company'}).getText()

    location = listing.find('ul', attrs={'class':'location'}).li.getText().strip()

    description = listing.find('div', attrs={'class':'desc'}).getText().strip()

    obj = Listing(title = title, link = link, company = company,
                 description = description, location = location)
    listings_objects.append(obj)
print(listings_objects[:3])

[Listing(title='Climate Risk - Regulatory Reporting Analyst', link='careerjet.pl/jobad/plf5c63585be33eed77ba668c07d0b2574', company='Citigroup', location='Warszawa, mazowieckie', salary=None, description='As a successful Candidate you will be part of Stress Testing and Risk Analytics team.  The role offers exposure to the enterprise-wide and regional level Climate risk stress testin…', date_added=None, skills=None, search_term='python', date_searched=datetime.date(2023, 11, 25)), Listing(title='analityk big data', link='careerjet.pl/jobad/pla642499eec94e311324e64bbee684b29', company='Santander', location='Warszawa, mazowieckie', salary=None, description='analityk big data   Country: Poland   Czujesz, że masz MOC, którą chcesz podzielić się ze światem? Nasz bank to idealnie miejsce do tego. MOC rozwoju, MOC zespołowości, MOC benefit…', date_added=None, skills=None, search_term='python', date_searched=datetime.date(2023, 11, 25)), Listing(title='Software Test Engineer', link='careerjet.p

In [18]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                                         title  \
0  Climate Risk - Regulatory Reporting Analyst   
1                            analityk big data   
2                       Software Test Engineer   
3                       Senior DevOps Engineer   
4              Senior Test Automation Engineer   

                                                link    company  \
0  careerjet.pl/jobad/plf5c63585be33eed77ba668c07...  Citigroup   
1  careerjet.pl/jobad/pla642499eec94e311324e64bbe...  Santander   
2  careerjet.pl/jobad/pl798a9ca8cf8affde11c0eaa00...     Harman   
3  careerjet.pl/jobad/pl3ee93a7b0e2e7e038e538e5e2...     Luxoft   
4  careerjet.pl/jobad/pl6c1316b98726ba2a26939895d...     Luxoft   

                location salary  \
0  Warszawa, mazowieckie   None   
1  Warszawa, mazowieckie   None   
2          Łódź, łódzkie   None   
3  Wrocław, dolnośląskie   None   
4  Warszawa, mazowieckie   None   

                                         description date_added skills  \
0  As a suc

C:\Users\User\AppData\Local\Temp\ipykernel_13088\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [16]:
# listings_pd.to_csv(f'career_p1.csv')
listings_pd.columns

Index(['title', 'link', 'company', 'location', 'salary', 'description',
       'date_added', 'skills', 'search_term', 'date_searched'],
      dtype='object')

### Looping through pages 1-5

In [39]:
page_objects = []
for page in range(1,5):
    page_url = f'https://www.careerjet.pl/{search_term}-praca.html?radius=0&p={page}&sort=date'
    page_response = requests.get(page_url)
    page_html = page_response.text
    page_soup = bs(html, 'html.parser')
    
    page_listings = page_soup.find('ul', attrs={'class': 'jobs'})
    page_listings_arr = page_listings.find_all('article', attrs={'class': 'job'})
    
    for listing in page_listings_arr:
    
        listing_link = listing.get('data-url')
        link = f'careerjet.pl{listing_link}'

        title = listings.header.h2.getText().strip()

        obj = Listing(title = title, link = link)
        page_objects.append(obj)
print('page objects: ', len(page_objects))

NameError: name 'p' is not defined

In [38]:
page_objects[0]

NameError: name 'page_objects' is not defined

In [None]:
page_df = pd.json_normalize(page_objects)
page_df.to_csv(f'careerjet_python_5pages.csv')

In [None]:
page_df.head()