In [29]:
import os
import re
from json import loads
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from langdetect import detect
from requests import get
from sqlalchemy import create_engine, text

In [30]:
# Connect to the database and load the two source tables.

dotenv_path = os.path.join("..", "scraping", ".env")
load_dotenv(dotenv_path)

# Fail early with a readable message; otherwise an unset variable ends up in
# the URL as the literal string "None" and the error only appears on connect.
_db_settings = {
    name: os.getenv(name)
    for name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        f"Missing database settings: {', '.join(_missing_settings)} "
        f"(looked in {dotenv_path} and the process environment)"
    )

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password cannot be mistaken for URL delimiters.
DATABASE_URL = (
    f"postgresql://{quote_plus(_db_settings['DB_USER'])}:{quote_plus(_db_settings['DB_PASSWORD'])}"
    f"@{_db_settings['DB_HOST']}:{_db_settings['DB_PORT']}/{_db_settings['DB_NAME']}"
)

engine = create_engine(DATABASE_URL)

# Job postings that have a description; `original_df` stays untouched as a reference copy.
query = "SELECT * FROM job_postings WHERE description IS NOT NULL;"
original_df = pd.read_sql(query, engine)
df = original_df.copy()

# Source rows (URL, posting date, ...) for exactly those postings.
query2 =  """
    SELECT job_sources.*
    FROM job_sources 
    INNER JOIN job_postings AS j 
    ON job_sources.job_id = j.job_id
    WHERE j.description IS NOT NULL;
"""
original_df2 = pd.read_sql(query2, engine)
df2 = original_df2.copy()

In [31]:
# Preview the first rows of the job_postings frame.
df.head()

Unnamed: 0,id,job_id,title,location,salary,experience_level,job_type,employment_type,description,responsibilities,requirements,skills,benefits,company,description_criteria,language,description_en
0,6538,4158196588,Rituals - Wholesale Intern (Warsaw),"Warsaw, Mazowieckie, Poland (On-site)",,,,,"W dziale Wholesale & Travel Retail dbamy o to,...",,,,,Rituals Cosmetics Polska,Seniority level Internship Employment type Int...,pl,"""In the Wholesale section. ""As well as externa..."
1,348291,4188786744,Senior GenAI ML Engineer (Python / TensorFlow ...,Poland (Remote),,,,,Dla naszego klienta 1dea poszukujemy osoby na ...,,,,,inhire.io,Seniority level Mid-Senior level Employment ty...,pl,"""For our client 1Dea we are looking for a pers..."
2,298352,4188003130,Opiekun Magazynu JYSK Szczecin Auchan Kołbasko...,"Kołbaskowo, Zachodniopomorskie, Poland (On-site)",,,,,Opis firmyOpis oferty pracy Czy jesteś gotow...,,,,,JYSK,Seniority level Associate Employment type Full...,pl,"""Description of the company's jobs. Are you re..."
3,298373,4188215321,Niemcy Ślusarz/ od 14E netto/h,"Zagranica, Podkarpackie, Poland (On-site)",,,,,Agencja Pracy EURO WELT (KRAZ 12809) to certyf...,,,,,Agencja Pośrednictwa Pracy Servis HR,Seniority level Entry level Employment type Fu...,pl,"""EURO WELT Employment Agency (KRAZ 12809) is a..."
4,298409,4185293278,Główny Księgowy/Główna Księgowa,"Cracow, Małopolskie, Poland (Hybrid)",,,,,REKRUTACJA NA STANOWISKO:Główny_a Księgowy_a ...,,,,,Stowarzyszenie WIOSNA,Seniority level Mid-Senior level Employment ty...,pl,"""Recruitment for the position: Główny_A accoun..."


In [32]:
# Preview the first rows of the job_sources frame.
df2.head()

Unnamed: 0,id,job_id,source,job_url,date_posted,is_active
0,1,4158353862,LinkedIn,https://www.linkedin.com/jobs/view/4158353862,2025-02-21,True
1,2,4138407547,LinkedIn,https://www.linkedin.com/jobs/view/4138407547,2025-02-21,True
2,3,4144364769,LinkedIn,https://www.linkedin.com/jobs/view/4144364769,2025-02-21,True
3,4,4083706042,LinkedIn,https://www.linkedin.com/jobs/view/4083706042,2025-02-21,True
4,5,4154765012,LinkedIn,https://www.linkedin.com/jobs/view/4154765012,2025-02-21,True


In [33]:
def detect_language(text):
    """Return the language code `detect` reports for `text`, or "unknown".

    Args:
        text: The text to classify. Missing, non-string or blank values are
            answered with "unknown" without calling the detector.

    Returns:
        Whatever `detect` returns for the text (this notebook later compares
        it against "pl"), or the string "unknown" when detection fails.
    """
    # Nothing to detect in missing / non-text / whitespace-only values.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Detection is best-effort, so failures are mapped to a sentinel.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit through, so a long run can be stopped.
        return "unknown"

In [34]:
def update_description(text):
    """Insert ", " wherever a lowercase letter is directly followed by an uppercase one.

    Such boundaries appear where two pieces of text were concatenated without
    a separator (e.g. "firmyOpis"). Both character classes cover the full set
    of Polish diacritics, including ń / Ń.

    NOTE(review): this is a heuristic - it also splits intentional camel-case
    words such as "JavaScript" into "Java, Script".

    Args:
        text: The description string to repair.

    Returns:
        The text with ", " inserted at every lowercase/uppercase boundary.
    """
    sentences = re.split(r'(?<=[a-ząćęłńóśźż])(?=[A-ZĄĆĘŁŃÓŚŹŻ])', text)
    return ", ".join(sentences)

# Repair missing separators in every description.
# NOTE(review): the translation cell further down re-reads `df` from the
# database, which discards this change - confirm whether that is intended.
df["description"] = df["description"].apply(update_description)

In [35]:


def split_text(text, max_length=450):
    """Split `text` into chunks of at most `max_length` characters.

    The text is cut after the delimiters ".", "," and ";" so that every
    delimiter stays attached to the clause it ends, and the original spacing
    inside a chunk is preserved. A clause that is longer than `max_length` on
    its own is broken at whitespace, and a single token that is still too long
    is sliced, so the length bound always holds.

    Args:
        text: The text to split. Empty or missing text yields an empty list.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty chunks, each stripped of surrounding whitespace.

    Raises:
        ValueError: If `max_length` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not text:
        return []

    # Step 1: cut into pieces that are individually small enough.
    pieces = []
    for clause in re.findall(r'[^.,;]*[.,;]|[^.,;]+', text):
        if len(clause) <= max_length:
            pieces.append(clause)
            continue
        # Oversized clause: fall back to whitespace-delimited tokens, slicing
        # any token that is by itself longer than the limit.
        for token in re.findall(r'\S+\s*|\s+', clause):
            pieces.extend(
                token[start:start + max_length]
                for start in range(0, len(token), max_length)
            )

    # Step 2: greedily pack consecutive pieces into chunks.
    chunks = []
    current = ""
    for piece in pieces:
        candidate = current + piece
        if current.strip() and len(candidate.strip()) > max_length:
            chunks.append(current.strip())
            current = piece
        else:
            current = candidate
    if current.strip():
        chunks.append(current.strip())

    return chunks


def translate_text(text):
    """Translate `text` from Polish to English using the free Google endpoint.

    The text is sent as a properly encoded query parameter, so characters such
    as "&", "#" or "+" no longer cut the query short, and it is not wrapped in
    literal quotes. All translated segments found under the first element of
    the response are concatenated instead of returning only the first one.

    Args:
        text: The text to translate. Missing or blank input is returned as is
            without contacting the service.

    Returns:
        The translated text, or the original `text` when the request fails or
        the response carries no translation.
    """
    if text is None or not str(text).strip():
        return text
    try:
        response = get(
            "https://translate.googleapis.com/translate_a/single",
            params={"client": "gtx", "dt": "t", "sl": "pl", "tl": "en", "q": str(text)},
            timeout=10,  # never let one stalled request hang the whole run
        )
        response.raise_for_status()
        translated_json = loads(response.text)
        segments = translated_json[0] if translated_json else None
        if not segments:
            return text
        # Each segment is expected to start with its translated text.
        translated = "".join(segment[0] for segment in segments if segment and segment[0])
        return translated or text
    except Exception as e:
        # Best-effort: report the problem and fall back to the untranslated text.
        print(f"Error translating: {e}")
        return text

def translate_full_text(text):
    """Translate a text of any length by translating it chunk by chunk.

    The text is cut with `split_text`, every chunk goes through
    `translate_text`, and the results are joined with a single space. No
    punctuation is inserted between chunks, because a chunk boundary is not
    necessarily a sentence boundary.

    Args:
        text: The text to translate.

    Returns:
        The translated chunks joined by spaces; an empty string when there is
        nothing to translate.
    """
    translated_chunks = [translate_text(chunk) for chunk in split_text(text)]
    return " ".join(
        piece.strip() for piece in translated_chunks if piece and piece.strip()
    )

In [36]:
from tqdm import tqdm
from sqlalchemy.exc import OperationalError
import time
import sys
import math
tqdm.pandas()

# The translation backlog is divided into at most this many batches; each batch
# is saved before the next one starts, so an interrupted run keeps its progress.
TRANSLATION_BATCHES = 25
# Pause (seconds) before each batch and after a failed database write.
PAUSE_SECONDS = 3

# Make sure the two columns written below exist; IF NOT EXISTS keeps the cell re-runnable.
with engine.begin() as connection:
    connection.execute(text("ALTER TABLE job_postings ADD COLUMN IF NOT EXISTS description_en TEXT;"))
    connection.execute(text("ALTER TABLE job_postings ADD COLUMN IF NOT EXISTS language VARCHAR(10);"))

# Reload so the frame contains the (possibly just created) columns.
# NOTE(review): this replaces the `df` built in earlier cells, so the
# update_description() pass applied there does not reach the text translated
# below - confirm whether that is intended.
df = pd.read_sql("SELECT * FROM job_postings WHERE description IS NOT NULL;", engine)
df["language"] = df["language"].fillna('')


# --- Language detection -------------------------------------------------------
missing_language_rows = df[df["language"] == ""]
if not missing_language_rows.empty:
    print(f"Detecting language for {len(missing_language_rows)} descriptions...")
    detected = missing_language_rows["description"].apply(detect_language)
    df.loc[detected.index, "language"] = detected

    # Persist only the rows detected just now; every other row was read from
    # the database with its language already stored.
    language_updates = [
        {"language": language, "job_id": job_id}
        for job_id, language in zip(missing_language_rows["job_id"].tolist(), detected.tolist())
    ]
    with engine.begin() as conn:
        conn.execute(
            text("UPDATE job_postings SET language = :language WHERE job_id = :job_id"),
            language_updates,
        )

# --- Translation --------------------------------------------------------------
df["description_en"] = df.get("description_en", pd.Series([None]*len(df)))

df_to_translate = df[(df["language"] == "pl") & (df["description_en"].isnull())].copy()
total = len(df_to_translate)
print(f"Total Polish descriptions to translate: {total}")

# Skip only when there is truly nothing to do; a small backlog is still translated.
skip = total == 0
if skip:
    print("No Polish descriptions to translate.")

if not skip:
    chunk_size = math.ceil(total / TRANSLATION_BATCHES)
    # With a small backlog there are fewer batches than TRANSLATION_BATCHES.
    n_chunks = math.ceil(total / chunk_size)

    for chunk_no, i in enumerate(range(0, total, chunk_size), start=1):
        chunk = df_to_translate.iloc[i:i+chunk_size].copy()
        print(f"\n🔸 Translating chunk {chunk_no}/{n_chunks}")
        time.sleep(PAUSE_SECONDS)
        chunk["translated_description"] = chunk["description"].progress_apply(translate_full_text)

        # Keep the in-memory frame in sync so later cells see the new
        # translations without another reload.
        df.loc[chunk.index, "description_en"] = chunk["translated_description"]

        # Save to DB, one transaction per row so a single failure does not
        # roll back the rest of the batch.
        for _, row in chunk.iterrows():
            try:
                with engine.begin() as conn:
                    conn.execute(
                        text("UPDATE job_postings SET description_en = :desc_en WHERE job_id = :job_id"),
                        {"desc_en": row["translated_description"], "job_id": row["job_id"]}
                    )
            except OperationalError as e:
                # NOTE(review): the failed row is reported but not retried.
                print(f"OperationalError for job_id={row['job_id']}: {e}")
                time.sleep(PAUSE_SECONDS)

        print(f"Chunk {chunk_no} saved to database.")


Total Polish descriptions to translate: 0
No Polish descriptions to translate.


In [37]:
# Prefer the English translation and fall back to the original description
# where no translation exists. Vectorised, and well-defined on an empty frame.
df["final_description"] = df["description_en"].fillna(df["description"])

In [38]:
# Preview the frame with the new final_description column.
df.head()

Unnamed: 0,id,job_id,title,location,salary,experience_level,job_type,employment_type,description,responsibilities,requirements,skills,benefits,company,description_criteria,language,description_en,final_description
0,6538,4158196588,Rituals - Wholesale Intern (Warsaw),"Warsaw, Mazowieckie, Poland (On-site)",,,,,"W dziale Wholesale & Travel Retail dbamy o to,...",,,,,Rituals Cosmetics Polska,Seniority level Internship Employment type Int...,pl,"""In the Wholesale section. ""As well as externa...","""In the Wholesale section. ""As well as externa..."
1,348291,4188786744,Senior GenAI ML Engineer (Python / TensorFlow ...,Poland (Remote),,,,,Dla naszego klienta 1dea poszukujemy osoby na ...,,,,,inhire.io,Seniority level Mid-Senior level Employment ty...,pl,"""For our client 1Dea we are looking for a pers...","""For our client 1Dea we are looking for a pers..."
2,298352,4188003130,Opiekun Magazynu JYSK Szczecin Auchan Kołbasko...,"Kołbaskowo, Zachodniopomorskie, Poland (On-site)",,,,,Opis firmyOpis oferty pracy Czy jesteś gotow...,,,,,JYSK,Seniority level Associate Employment type Full...,pl,"""Description of the company's jobs. Are you re...","""Description of the company's jobs. Are you re..."
3,298373,4188215321,Niemcy Ślusarz/ od 14E netto/h,"Zagranica, Podkarpackie, Poland (On-site)",,,,,Agencja Pracy EURO WELT (KRAZ 12809) to certyf...,,,,,Agencja Pośrednictwa Pracy Servis HR,Seniority level Entry level Employment type Fu...,pl,"""EURO WELT Employment Agency (KRAZ 12809) is a...","""EURO WELT Employment Agency (KRAZ 12809) is a..."
4,298409,4185293278,Główny Księgowy/Główna Księgowa,"Cracow, Małopolskie, Poland (Hybrid)",,,,,REKRUTACJA NA STANOWISKO:Główny_a Księgowy_a ...,,,,,Stowarzyszenie WIOSNA,Seniority level Mid-Senior level Employment ty...,pl,"""Recruitment for the position: Główny_A accoun...","""Recruitment for the position: Główny_A accoun..."


In [39]:
# Show the first final_description in full (the table preview truncates it).
df["final_description"].iloc[0]

'"In the Wholesale section. "As well as external partners. Wholesale Intern will help the team in the areas of: data management, communication, analyzes and support of Back Office. If you want to learn more about sales both off - and online,". "This is an ideal internship for you! Your tasks: data management and their analysis based on reports in the Excel program - the basis of your activities arartly preparing files with data - tracking inputs and going out to the customer, updating basic data and support for planning inquiries. Ordering marketing materials from agencies for 7 markets.". "Taking care of the database - collecting information from Account Managers and Marketing Department. Creating monthly marketing reports. Support in management and orders management, daily control/marketing control on our partners\' pages. Updating documents related to wholesale sales. Inspire knowledge and skills! Your profile: You have a ground knowledge and skills in the field of Excel programs . 

In [40]:
# Index labels of the rows whose description_criteria is missing.
df[df["description_criteria"].isnull()].index


Index([2675, 3415, 9103, 9104, 17038, 23273, 23585, 24119, 27201, 28037, 28144,
       28585],
      dtype='int64')

In [41]:
# Row at position 4913 before the rows without description_criteria are
# dropped; the next cell shows the same position afterwards.
df.iloc[4913]


id                                                                  43895
job_id                                                         4173783845
title                                               Tester Automatyzujący
location                                                  Poland (Remote)
salary                                                               None
experience_level                                                     None
job_type                                                             None
employment_type                                                      None
description              ➡️ Jesteśmy częścią grupyALTEN– wiodącego, eu...
responsibilities                                                     None
requirements                                                         None
skills                                                               None
benefits                                                             None
company                               

In [42]:
# Remove the rows that have no description_criteria, then look at position
# 4913 again to see which row occupies it now.
rows_without_criteria = df.index[df["description_criteria"].isnull()]
df = df.drop(index=rows_without_criteria)
df.iloc[4913]

id                                                                  16098
job_id                                                         4146989283
title                                 Middle Front End Engineer (Angular)
location                            Warsaw, Mazowieckie, Poland (On-site)
salary                                                               None
experience_level                                                     None
job_type                                                             None
employment_type                                                      None
description              About Us: Voyagu is a travel-tech startup hel...
responsibilities                                                     None
requirements                                                         None
skills                                                               None
benefits                                                             None
company                               

In [43]:
# Extracting job type
def extract_job_type(text):
    """Classify the work arrangement mentioned in `text`.

    The check is a case-insensitive substring search with a fixed priority:
    "remote" wins over "hybrid", which wins over "on-site" / "office".

    NOTE(review): because these are plain substring matches, phrases such as
    "Back Office" count as On-site when no other keyword is present.

    Args:
        text: A location or description string. Missing or non-string values
            (None, NaN) yield None instead of raising.

    Returns:
        "Remote", "Hybrid", "On-site", or None when no keyword is found.
    """
    if not isinstance(text, str):
        return None
    lowered = text.lower()
    if "remote" in lowered:
        return "Remote"
    if "hybrid" in lowered:
        return "Hybrid"
    if "on-site" in lowered or "office" in lowered:
        return "On-site"
    return None


# Derive job_type from the location string and show how the values are distributed.
df["job_type"] = df["location"].apply(extract_job_type)
df["job_type"].value_counts()

job_type
Hybrid     9553
Remote     8981
On-site    8612
Name: count, dtype: int64

In [44]:
# How many rows still lack a job_type, and what the description text would
# yield for exactly those rows.
print(df["job_type"].isnull().sum())

df.loc[df["job_type"].isnull(), "final_description"].apply(extract_job_type).value_counts()


2395


final_description
Remote     768
Hybrid     535
On-site    475
Name: count, dtype: int64

In [45]:
# Fill the remaining gaps in job_type with the value extracted from the
# description, then report how many rows are still unclassified.
job_type_from_description = df["final_description"].apply(extract_job_type)
df["job_type"] = df["job_type"].fillna(job_type_from_description)
print(df["job_type"].isnull().sum())


617


In [46]:
def extract_employment_type(text):
    """List the employment types mentioned in `text`.

    Looks for the keywords "contract", "full-time", "part-time" and "b2b"
    (case-insensitive substring search) and reports the matching labels in
    that fixed order.

    Args:
        text: The string to scan.

    Returns:
        The matching labels joined by ", ", or None when none is present.
    """
    lowered = text.lower()
    keyword_labels = (
        ("contract", "Contract"),
        ("full-time", "Full-time"),
        ("part-time", "Part-time"),
        ("b2b", "B2B"),
    )
    found = [label for keyword, label in keyword_labels if keyword in lowered]
    if not found:
        return None
    return ", ".join(found)

# Derive employment_type from description_criteria and show how the values are distributed.
df["employment_type"] = df["description_criteria"].apply(extract_employment_type)
df["employment_type"].value_counts()


employment_type
Full-time              25685
Contract                2515
Part-time                412
Contract, Full-time        2
Name: count, dtype: int64

In [47]:
# How many rows still lack an employment_type, and what the description text
# would yield for exactly those rows.
print(df["employment_type"].isnull().sum())

df.loc[df["employment_type"].isnull(), "final_description"].apply(extract_employment_type).value_counts()

927


final_description
Contract                          176
Contract, Full-time                43
Full-time                          33
Contract, B2B                      14
Part-time                           8
B2B                                 7
Contract, Part-time                 6
Full-time, Part-time                6
Contract, Full-time, Part-time      5
Name: count, dtype: int64

In [48]:
# Fill the remaining gaps in employment_type with the value extracted from the
# description, then report how many rows are still unclassified.
employment_type_from_description = df["final_description"].apply(extract_employment_type)
df["employment_type"] = df["employment_type"].fillna(employment_type_from_description)

print(df["employment_type"].isnull().sum())

629


In [49]:
# Spot-check the first five rows whose language is "pl".
df[df["language"]=="pl"].head(5)

Unnamed: 0,id,job_id,title,location,salary,experience_level,job_type,employment_type,description,responsibilities,requirements,skills,benefits,company,description_criteria,language,description_en,final_description
0,6538,4158196588,Rituals - Wholesale Intern (Warsaw),"Warsaw, Mazowieckie, Poland (On-site)",,,On-site,,"W dziale Wholesale & Travel Retail dbamy o to,...",,,,,Rituals Cosmetics Polska,Seniority level Internship Employment type Int...,pl,"""In the Wholesale section. ""As well as externa...","""In the Wholesale section. ""As well as externa..."
1,348291,4188786744,Senior GenAI ML Engineer (Python / TensorFlow ...,Poland (Remote),,,Remote,Full-time,Dla naszego klienta 1dea poszukujemy osoby na ...,,,,,inhire.io,Seniority level Mid-Senior level Employment ty...,pl,"""For our client 1Dea we are looking for a pers...","""For our client 1Dea we are looking for a pers..."
2,298352,4188003130,Opiekun Magazynu JYSK Szczecin Auchan Kołbasko...,"Kołbaskowo, Zachodniopomorskie, Poland (On-site)",,,On-site,Full-time,Opis firmyOpis oferty pracy Czy jesteś gotow...,,,,,JYSK,Seniority level Associate Employment type Full...,pl,"""Description of the company's jobs. Are you re...","""Description of the company's jobs. Are you re..."
3,298373,4188215321,Niemcy Ślusarz/ od 14E netto/h,"Zagranica, Podkarpackie, Poland (On-site)",,,On-site,Full-time,Agencja Pracy EURO WELT (KRAZ 12809) to certyf...,,,,,Agencja Pośrednictwa Pracy Servis HR,Seniority level Entry level Employment type Fu...,pl,"""EURO WELT Employment Agency (KRAZ 12809) is a...","""EURO WELT Employment Agency (KRAZ 12809) is a..."
4,298409,4185293278,Główny Księgowy/Główna Księgowa,"Cracow, Małopolskie, Poland (Hybrid)",,,Hybrid,Full-time,REKRUTACJA NA STANOWISKO:Główny_a Księgowy_a ...,,,,,Stowarzyszenie WIOSNA,Seniority level Mid-Senior level Employment ty...,pl,"""Recruitment for the position: Główny_A accoun...","""Recruitment for the position: Główny_A accoun..."


In [50]:
# Count the rows in which job_type AND employment_type are both missing
# (rows lacking only one of the two are not counted).
missing_job_type_and_employment_type = df[df["job_type"].isnull() & df["employment_type"].isnull()]
print(f"Number of jobs without both job_type and employment_type: {len(missing_job_type_and_employment_type)}")


Number of jobs without both job_type and employment_type: 3


In [51]:
def clean_description(text):
    """Remove every double-quote character from `text` and trim surrounding whitespace.

    Args:
        text: The description string to clean.

    Returns:
        The text without '"' characters and without leading/trailing whitespace.
    """
    return text.replace('"', '').strip()

# Strip the double quotes and surrounding whitespace from every final description.
df["final_description"] = df["final_description"].apply(clean_description)

In [52]:
# final_description now carries the text, so the two source columns can go.
df = df.drop(columns=["description", "description_en"])
df.columns

Index(['id', 'job_id', 'title', 'location', 'salary', 'experience_level',
       'job_type', 'employment_type', 'responsibilities', 'requirements',
       'skills', 'benefits', 'company', 'description_criteria', 'language',
       'final_description'],
      dtype='object')