# Scraping latest offers from pracuj.pl

### Imports

In [41]:
import requests
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime
import re
import pandas as pd

### Specify search term

In [2]:
search_term = 'python'
url = f'https://it.pracuj.pl/praca/python;kw?sc=0&pn=1'

### Get the response

In [3]:
response = requests.get(url)
html = response.text
soup = bs(html, 'html.parser')

In [10]:
print(soup)

<!DOCTYPE html>
<html lang="pl"><head><meta charset="utf-8"/><link href="https://listing-it.gpcdn.pl" rel="preconnect"/><link href="https://massachusetts.pracuj.pl" rel="preconnect"/><title>Praca python – Pracuj.pl</title><meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwisie it.pracuj.pl" name="description"/><meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/><meta content="index,follow,all,noarchive" name="robots"/><meta content="app-id=386774884" name="apple-itunes-app"/><meta content="app-id=pl.pracuj.android.jobsearcher" name="google-play-app"/><meta content="144844052206605" property="fb:app_id"/><link href="https://it.pracuj.pl/praca" rel="canonical"/><meta content="pl_PL" property="og:locale"/><meta content="article" property="og:type"/><meta content="Praca python – Pracuj.pl" property="og:title"/><meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwisie it.pracuj.pl" property="og:description"/><meta con

In [4]:
print(soup.prettify())
with open ("pracuj1.html", "w", encoding='utf-8') as file:
    file.write(str(soup.prettify()))

<!DOCTYPE html>
<html lang="pl">
 <head>
  <meta charset="utf-8"/>
  <link href="https://listing-it.gpcdn.pl" rel="preconnect"/>
  <link href="https://massachusetts.pracuj.pl" rel="preconnect"/>
  <title>
   Praca python – Pracuj.pl
  </title>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwisie it.pracuj.pl" name="description"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="index,follow,all,noarchive" name="robots"/>
  <meta content="app-id=386774884" name="apple-itunes-app"/>
  <meta content="app-id=pl.pracuj.android.jobsearcher" name="google-play-app"/>
  <meta content="144844052206605" property="fb:app_id"/>
  <link href="https://it.pracuj.pl/praca" rel="canonical"/>
  <meta content="pl_PL" property="og:locale"/>
  <meta content="article" property="og:type"/>
  <meta content="Praca python – Pracuj.pl" property="og:title"/>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwi

### Get the postings container

In [40]:
listings = soup.find('div', attrs={'data-test': 'section-offers'})
# listings
with open ("pracuj_listings.txt", "w", encoding="utf-8") as file:
    file.write(str(listings))

In [6]:
listings_arr = listings.find_all(class_='c1fljezf')
# listings_arr[:5]

[<div class="c1fljezf"><a class="core_n194fgoq" data-test="link-offer" href="https://www.pracuj.pl/praca/software-quality-engineer-krakow-aleja-jana-pawla-ii-43,oferta,1002949341"></a><div class="c1s2myew"><div class="i35ayzj"><div class="tiles_cz8pc9w"><a class="core_btsqgu core_n194fgoq" data-test="link-company-profile" href="https://pracodawcy.pracuj.pl/company/20001636"><picture><img alt="TechnipFMC" class="core_ia9ocxs" height="65" loading="lazy" src="https://logos.gpcdn.pl/loga-firm/20001636/2c580000-43a8-f403-50c4-08d6dec4e958_280x280.png" width="65"/></picture></a></div></div><div class="c13gi8t"><div class="b1blp41s"><div class="c1wygkax"><span class="tiles_spqrqpy core_c1s7lqrd" data-test="add-to-favourites"><span aria-hidden="true" class="core_ig18o8w size-medium position-center"><svg fill="none" viewbox="0 0 20 20"><path d="M9.605 1.271a.419.419 0 0 1 .783 0l2.12 6.008h5.984a.418.418 0 0 1 .272.737l-5 4.146 2.092 6.287a.419.419 0 0 1-.644.47l-5.216-3.827-5.219 3.827a.419.41

In [39]:
test_listing = listings_arr[6]
print(test_listing)
pracuj_date = re.compile(r'Opublikowana: (.*)')
listing_date = test_listing.find('p', attrs={'data-test':'text-added'}).getText()
print(listing_date)
listing_date = pracuj_date.match(listing_date)
print(listing_date)

<div class="c1fljezf"><div class="c1s2myew"><div class="i35ayzj"><div class="tiles_cz8pc9w"><a class="core_btsqgu core_n194fgoq" data-test="link-company-profile" href="https://pracodawcy.pracuj.pl/company/1074019482"><picture><img alt="Convista Poland" class="core_ia9ocxs" height="65" loading="lazy" src="https://logos.gpcdn.pl/loga-firm/1074019482/9b030000-5dac-0015-4319-08dad158b078_280x280.png" width="65"/></picture></a></div></div><div class="c13gi8t"><div class="b1blp41s"><div class="c1wygkax"><h2 class="tiles_b1yuv00i" data-test="offer-title">DevOps Engineer</h2><span class="s1jki39v" data-test="offer-salary">12 000–33 600 zł / mies. (zal. od umowy)</span><div class="hide-on-mobile tiles_cegq0mb" data-test="section-company"><div class="tiles_cz8pc9w"><a class="core_btsqgu core_n194fgoq" data-test="link-company-profile" href="https://pracodawcy.pracuj.pl/company/1074019482"><picture><img alt="Convista Poland" class="core_ia9ocxs" height="65" loading="lazy" src="https://logos.gpcdn.

### Pydantic class for extracted objects

In [45]:
class Listing(BaseModel):
    title: str
    link: str
    date: str
    skills: List[str] = []

### Extracting the data

In [25]:
# regexes
pracuj_date = re.compile(r'Opublikowana: (.*)')

In [46]:
listings_objects = []

for listing in listings_arr:
    try:
        listing_link = listing.find('a', attrs={'data-test': 'link-offer'}).get('href')
    except:
        listing_link = ''
    
    try:
        listing_title = listing.find('h2', attrs={'data-test': 'offer-title'}).a.getText()
    except:
        listing_title = ''

    try:
        listing_date = listing.find('p', attrs={'data-test':'text-added'}).getText()
        listing_date = pracuj_date.match(listing_date)[0]
    except:
        listing_date = ''

    try:
        listing_skills = []
        skills = listing.find_all('span', attrs={'data-test': 'technologies-item'})
        for skill in skills:
            skill = skill.getText()
            listing_skills.append(skill)
    except:
        listing_skills = []
    
    obj = Listing(title = listing_title, link = listing_link,
                  date=listing_date, skills = listing_skills)
    listings_objects.append(obj)

print(listings_objects[:3])

[Listing(title='Software Quality Engineer', link='https://www.pracuj.pl/praca/software-quality-engineer-krakow-aleja-jana-pawla-ii-43,oferta,1002949341', date='', skills=['Python', 'Git', 'Jira']), Listing(title='Embedded Software Engineer', link='https://www.pracuj.pl/praca/embedded-software-engineer-krakow-aleja-jana-pawla-ii-43,oferta,1002949336', date='', skills=['Python', 'Git', 'C++', 'Linux']), Listing(title='Inżynier Automatyk-Programista', link='https://www.pracuj.pl/praca/inzynier-automatyk-programista-orzesze,oferta,1002995077', date='', skills=[])]


In [47]:
listings_objects[0]

Listing(title='Software Quality Engineer', link='https://www.pracuj.pl/praca/software-quality-engineer-krakow-aleja-jana-pawla-ii-43,oferta,1002949341', date='', skills=['Python', 'Git', 'Jira'])

In [48]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                                    title  \
0               Software Quality Engineer   
1              Embedded Software Engineer   
2          Inżynier Automatyk-Programista   
3  Senior Data Modeler / Business Analyst   
4        Artificial Intelligence Engineer   

                                                link date  \
0  https://www.pracuj.pl/praca/software-quality-e...        
1  https://www.pracuj.pl/praca/embedded-software-...        
2  https://www.pracuj.pl/praca/inzynier-automatyk...        
3  https://www.pracuj.pl/praca/senior-data-modele...        
4  https://www.pracuj.pl/praca/artificial-intelli...        

                            skills  
0              [Python, Git, Jira]  
1        [Python, Git, C++, Linux]  
2                               []  
3  [SQL, noSQL, Python, SAP/4hana]  
4      [Python, Azure DevOps, Git]  


C:\Users\User\AppData\Local\Temp\ipykernel_19876\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [22]:
# pracuj_df = pd.json_normalize(listings_objects)
# pracuj_df.to_csv(f'pracuj_python_1page.csv')

In [50]:
listings_pd.columns

Index(['title', 'link', 'date', 'skills'], dtype='object')

In [None]:
# listings_pd[['']]