# Scraping latest offers from pracuj.pl

### Imports

In [24]:
import requests
from bs4 import BeautifulSoup as bs
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, date
import re
import pandas as pd
import pickle

### Specify search term

In [2]:
search_term = 'python'
url = f'https://it.pracuj.pl/praca/python;kw?sc=0&pn=1'

### Get the response

In [3]:
response = requests.get(url)
html = response.text
soup = bs(html, 'html.parser')

### Pickle the response to limit future calls in case of testing

In [25]:
with open ('pracuj_python_soup.pkl', 'wb') as outp:
    pickle.dump(soup, outp, pickle.HIGHEST_PROTOCOL)

#### Getting the response from pickle

In [27]:
with open ('pracuj_python_soup.pkl', 'rb') as inp:
    soup = pickle.load(inp)
    print(soup.prettify())

<!DOCTYPE html>
<html lang="pl">
 <head>
  <meta charset="utf-8"/>
  <link href="https://listing-it.gpcdn.pl" rel="preconnect"/>
  <link href="https://massachusetts.pracuj.pl" rel="preconnect"/>
  <title>
   Praca python – Pracuj.pl
  </title>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwisie it.pracuj.pl" name="description"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="index,follow,all,noarchive" name="robots"/>
  <meta content="app-id=386774884" name="apple-itunes-app"/>
  <meta content="app-id=pl.pracuj.android.jobsearcher" name="google-play-app"/>
  <meta content="144844052206605" property="fb:app_id"/>
  <link href="https://it.pracuj.pl/praca" rel="canonical"/>
  <meta content="pl_PL" property="og:locale"/>
  <meta content="article" property="og:type"/>
  <meta content="Praca python – Pracuj.pl" property="og:title"/>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwi

In [28]:
print(soup.prettify())
with open ("pracuj1.html", "w", encoding='utf-8') as file:
    file.write(str(soup.prettify()))

<!DOCTYPE html>
<html lang="pl">
 <head>
  <meta charset="utf-8"/>
  <link href="https://listing-it.gpcdn.pl" rel="preconnect"/>
  <link href="https://massachusetts.pracuj.pl" rel="preconnect"/>
  <title>
   Praca python – Pracuj.pl
  </title>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwisie it.pracuj.pl" name="description"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="index,follow,all,noarchive" name="robots"/>
  <meta content="app-id=386774884" name="apple-itunes-app"/>
  <meta content="app-id=pl.pracuj.android.jobsearcher" name="google-play-app"/>
  <meta content="144844052206605" property="fb:app_id"/>
  <link href="https://it.pracuj.pl/praca" rel="canonical"/>
  <meta content="pl_PL" property="og:locale"/>
  <meta content="article" property="og:type"/>
  <meta content="Praca python – Pracuj.pl" property="og:title"/>
  <meta content="Szukasz pracy w kategorii IT? Sprawdź oferty pracy w serwi

### Get the postings container

In [6]:
listings = soup.find('div', attrs={'data-test': 'section-offers'})
# listings
with open ("pracuj_listings.txt", "w", encoding="utf-8") as file:
    file.write(str(listings))

In [7]:
listings_arr = listings.find_all(class_='c1fljezf')
# listings_arr[:5]

In [8]:
test_listing = listings_arr[6]
print(test_listing)
pracuj_date = re.compile(r'Opublikowana: (.*)')
listing_date = test_listing.find('p', attrs={'data-test':'text-added'}).getText()
print(listing_date)
listing_date = pracuj_date.match(listing_date)
print(listing_date)

<div class="c1fljezf"><a class="core_n194fgoq" data-test="link-offer" href="https://www.pracuj.pl/praca/junior-excel-python-modeller-warszawa,oferta,1002961681"></a><div class="c1s2myew"><div class="i35ayzj"><div class="tiles_cz8pc9w"><a class="core_btsqgu core_n194fgoq" data-test="link-company-profile" href="https://pracodawcy.pracuj.pl/company/1074070985"><picture><img alt="Standard Chartered Bank" class="core_ia9ocxs" height="65" loading="lazy" src="https://logos.gpcdn.pl/loga-firm/1074070985/2c580000-43a8-f403-566e-08da0b4bf9e9_280x280.png" width="65"/></picture></a></div></div><div class="c13gi8t"><div class="b1blp41s"><div class="c1wygkax"><div class="tiles_jvbgvfy"><span class="super core_j1iz21zj" data-test="text-super-offer" style="--j1iz21zj-0:none">Superoferta</span></div><span class="tiles_spqrqpy core_c1s7lqrd" data-test="add-to-favourites"><span aria-hidden="true" class="core_ig18o8w size-medium position-center"><svg fill="none" viewbox="0 0 20 20"><path d="M9.605 1.271a.

### Pydantic class for extracted objects

In [15]:
class Listing(BaseModel):
    title: str
    link: str
    company: str = None
    location: str = None
    salary: str = None
    description: str = None
    date_added: str | datetime = None
    skills: List[str] = None
    search_term: str = 'python'
    date_searched: date = date.today()

### Extracting the data

In [13]:
# regexes
pracuj_date = re.compile(r'Opublikowana: (.*)')
pracuj_salary = re.compile(r'')

In [20]:
listings_objects = []

for listing in listings_arr:
    try:
        listing_link = listing.find('a', attrs={'data-test': 'link-offer'}).get('href')
    except:
        listing_link = ''
    
    try:
        listing_title = listing.find('h2', attrs={'data-test': 'offer-title'}).a.getText()
    except:
        listing_title = ''

    try:
        listing_date = listing.find('p', attrs={'data-test':'text-added'}).getText()
        listing_date = pracuj_date.match(listing_date)[0]
    except:
        listing_date = ''

    try:
        listing_skills = []
        skills = listing.find_all('span', attrs={'data-test': 'technologies-item'})
        for skill in skills:
            skill = skill.getText()
            listing_skills.append(skill)
    except:
        listing_skills = []

    try:
        listing_company = listing.find('a', attrs={'data-test':'text-company-name'}).getText()
    except:
        listing_company = ''

    try:
        listing_location = listing.find('h5', attrs={'data-test':'text-regon'}).getText()
    except:
        listing_location = ''

    try:
        listing_description = listing.find('span', attrs={'class':'t126uk2l'}).getText()
    except:
        listing_description = ''

    try:
        listing_salary = listing.find('span', attrs={'data-test':'offer-salary'}).getText()
    except:
        listing_salary = ''
        
    obj = Listing(title = listing_title, link = listing_link,
                  date_added = listing_date, skills = listing_skills,
                 company = listing_company, location = listing_location,
                 description = listing_description, salary  = listing_salary)
    listings_objects.append(obj)

print(listings_objects[:3])

[Listing(title='Senior React / UI Developer', link='https://www.pracuj.pl/praca/senior-react-ui-developer-warszawa,oferta,1002998094', company='', location='', salary='26\xa0880–30\xa0240\xa0zł netto (+\xa0VAT)\xa0/ mies.', description='We are looking for the Senior React Developer with the UI experience., , You will join a small, new, and self-sufficient team, who will be focused on developing an iterative concept through to production and help launch a new greenfield product...', date_added='', skills=['React.js', 'JavaScript', 'TypeScript', 'HTML', 'Dagre', 'ReactFlow', 'Chart.js', 'Plotly.js'], search_term='python', date_searched=datetime.date(2023, 11, 25)), Listing(title='Senior IT Analyst', link='https://www.pracuj.pl/praca/senior-it-analyst-warszawa-dobra-40,oferta,1002986808', company='', location='', salary='', description='If you are a team player who is passionate about solving problems, making things well explained and documented, someone who can use data to answer questio

In [17]:
listings_objects[0]

Listing(title='Senior React / UI Developer', link='https://www.pracuj.pl/praca/senior-react-ui-developer-warszawa,oferta,1002998094', company='', location='', salary='26\xa0880–30\xa0240\xa0zł netto (+\xa0VAT)\xa0/ mies.', description='We are looking for the Senior React Developer with the UI experience., , You will join a small, new, and self-sufficient team, who will be focused on developing an iterative concept through to production and help launch a new greenfield product...', date_added=None, skills=['React.js', 'JavaScript', 'TypeScript', 'HTML', 'Dagre', 'ReactFlow', 'Chart.js', 'Plotly.js'], search_term='python', date_searched=datetime.date(2023, 11, 25))

In [21]:
listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])
print(listings_pd.head())

                               title  \
0        Senior React / UI Developer   
1                  Senior IT Analyst   
2                    DevOps Engineer   
3                  Junior IT Analyst   
4  Data Engineer for Voice Assistant   

                                                link company location  \
0  https://www.pracuj.pl/praca/senior-react-ui-de...                    
1  https://www.pracuj.pl/praca/senior-it-analyst-...                    
2  https://www.pracuj.pl/praca/devops-engineer-wa...                    
3  https://www.pracuj.pl/praca/junior-it-analyst-...                    
4  https://www.pracuj.pl/praca/data-engineer-for-...                    

                                   salary  \
0  26 880–30 240 zł netto (+ VAT) / mies.   
1                                           
2                                           
3                                           
4                                           

                                         descript

C:\Users\User\AppData\Local\Temp\ipykernel_18396\3227278432.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  listings_pd = pd.DataFrame([obj.dict() for obj in listings_objects])


In [22]:
# pracuj_df = pd.json_normalize(listings_objects)
# pracuj_df.to_csv(f'pracuj_python_1page.csv')

In [22]:
listings_pd.columns

Index(['title', 'link', 'company', 'location', 'salary', 'description',
       'date_added', 'skills', 'search_term', 'date_searched'],
      dtype='object')

In [None]:
# listings_pd[['']]