In [1]:
import math
import os
import re
import time
from json import loads

import pandas as pd
from dotenv import load_dotenv
from langdetect import detect
from requests import get
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.exc import OperationalError
from tqdm import tqdm

In [2]:
# connect to the database

# Credentials live in the scraper's .env file, one directory up.
dotenv_path = os.path.join("..", "scraping", ".env")
load_dotenv(dotenv_path)

# Fail fast with a readable message instead of silently building a URL that
# contains the literal string "None" for an unset variable.
# DB_PASSWORD is intentionally not required (it may legitimately be empty).
_missing_env = [
    name for name in ("DB_USER", "DB_HOST", "DB_PORT", "DB_NAME") if not os.getenv(name)
]
if _missing_env:
    raise RuntimeError(
        f"Missing database settings in {dotenv_path}: {', '.join(_missing_env)}"
    )

# URL.create takes each component separately, so special characters in the
# user name or password (e.g. '@', '/', ':') cannot corrupt the connection URL
# the way plain string interpolation does.
# Note: DATABASE_URL is a sqlalchemy URL object, which create_engine accepts.
DATABASE_URL = URL.create(
    drivername="postgresql",
    username=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT")),
    database=os.getenv("DB_NAME"),
)

engine = create_engine(DATABASE_URL)

# Job postings that have a description; work on a copy so the rows as loaded
# stay available in original_df.
query = "SELECT * FROM job_postings WHERE description IS NOT NULL;"
original_df = pd.read_sql(query, engine)
df = original_df.copy()

# Source rows (job_sources) for exactly those postings.
query2 =  """
    SELECT job_sources.*
    FROM job_sources 
    INNER JOIN job_postings AS j 
    ON job_sources.job_id = j.job_id
    WHERE j.description IS NOT NULL;
"""
original_df2 = pd.read_sql(query2, engine)
df2 = original_df2.copy()

In [3]:
# Preview the first five job postings loaded from the database.
df.head(n=5)

Unnamed: 0,id,job_id,title,location,salary,experience_level,job_type,employment_type,description,responsibilities,requirements,skills,benefits,company,description_criteria,description_en
0,14048,4115475043,Senior AI Engineer proficient in Swift,"Warsaw, Mazowieckie, Poland (On-site)",,,,,We are seeking a highly motivated and experien...,,,,,"AIDA projektai, MB",Seniority level Mid-Senior level Employment ty...,
1,18412,4130485010,Node.js Engineer,"Warsaw, Mazowieckie, Poland (Hybrid)",,,,,Ring Publishing: (http://ringpublishing.com) ...,,,,,Ringier Axel Springer Polska,Seniority level Not Applicable Employment type...,Ring Publishing: (http: // Ringpublishing com)...
2,8,4158353862,Junior Data Scientist (Business Operations),"Warsaw, Mazowieckie, Poland (On-site)",,,,,Job Description: The Business Operations Team...,,,,,Wolt,Seniority level Mid-Senior level Employment ty...,
3,14872,4056210173,Senior Data Integration Engineer (Databricks),Poland (Remote),,,,,We are looking for a Senior Data Integration E...,,,,,EPAM Systems,Seniority level Mid-Senior level Employment ty...,
4,8155,4112977117,Lider/ka zespołu Google Ads,"Poznań, Wielkopolskie, Poland (On-site)",,,,,Widoczni - miejsce dla ludzi nastawionych na ...,,,,,widoczni,Seniority level Executive Employment type Full...,Visible - a place for people focused on succes...


In [4]:
# Preview the first five job_sources rows loaded from the database.
df2.head(n=5)

Unnamed: 0,id,job_id,source,job_url,date_posted,is_active
0,1,4158353862,LinkedIn,https://www.linkedin.com/jobs/view/4158353862,2025-02-21,True
1,2,4138407547,LinkedIn,https://www.linkedin.com/jobs/view/4138407547,2025-02-21,True
2,3,4144364769,LinkedIn,https://www.linkedin.com/jobs/view/4144364769,2025-02-21,True
3,4,4083706042,LinkedIn,https://www.linkedin.com/jobs/view/4083706042,2025-02-21,True
4,5,4154765012,LinkedIn,https://www.linkedin.com/jobs/view/4154765012,2025-02-21,True


In [5]:
def detect_language(text):
    """Return the language code detected for ``text``, or ``"unknown"``.

    Args:
        text: The text to classify. Anything that is not a non-blank string
            (``None``, NaN, ``""``, whitespace only) yields ``"unknown"``.

    Returns:
        Whatever ``langdetect.detect`` returns for the text, or the string
        ``"unknown"`` when detection is impossible or fails.
    """
    # There is nothing to detect in missing or blank input; skip the call.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort by design: a detection failure must not abort a whole
        # DataFrame.apply. Catching Exception (rather than a bare except)
        # still lets KeyboardInterrupt / SystemExit stop a long-running cell.
        return "unknown"

In [6]:


def _wrap_long_piece(piece, max_length):
    """Break ``piece`` into parts of at most ``max_length`` characters, cutting at whitespace when possible."""
    parts = []
    while len(piece) > max_length:
        # Look one character past the limit so a separator sitting exactly at
        # the boundary can be used as the cut point.
        window = piece[:max_length + 1]
        cut = max(window.rfind(ws) for ws in (" ", "\n", "\t"))
        if cut <= 0:
            # No usable whitespace: fall back to a hard cut at the limit.
            cut = max_length
        head = piece[:cut].rstrip()
        if head:
            parts.append(head)
        piece = piece[cut:].lstrip()
    if piece:
        parts.append(piece)
    return parts


def split_text(text, max_length=500):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    The text is cut at sentence boundaries (``.``, ``!`` or ``?`` followed by
    whitespace). Sentence punctuation is kept, and periods inside tokens such
    as "Node.js" or URLs are not treated as boundaries. Consecutive sentences
    are packed into one chunk, separated by a single space, while they fit.
    A single sentence longer than ``max_length`` is wrapped at whitespace, or
    hard-cut when it contains none, so the length limit always holds.

    Args:
        text: The text to split. ``None``, empty or blank text gives ``[]``.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty chunk strings, in original order.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not text or not text.strip():
        return []

    # The lookbehind keeps the punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks = []
    temp_chunk = ""

    for sentence in sentences:
        for piece in _wrap_long_piece(sentence, max_length):
            # +1 accounts for the joining space.
            if temp_chunk and len(temp_chunk) + len(piece) + 1 > max_length:
                chunks.append(temp_chunk)
                temp_chunk = piece
            else:
                temp_chunk = f"{temp_chunk} {piece}" if temp_chunk else piece
    if temp_chunk:
        chunks.append(temp_chunk)

    return chunks


def translate_text(text, source_lang="pl", target_lang="en"):
    """Translate ``text`` using the free (unofficial) Google Translate endpoint.

    Args:
        text: The text to translate. ``None`` or blank text is returned as is
            without making a request.
        source_lang: Source language code sent as ``sl`` (default ``"pl"``).
        target_lang: Target language code sent as ``tl`` (default ``"en"``).

    Returns:
        The translated text. On any failure (network error, non-2xx status,
        non-JSON body, unexpected payload shape) the error is printed and the
        ORIGINAL ``text`` is returned, so callers cannot distinguish a failed
        translation from text that needed none.
    """
    if text is None or not str(text).strip():
        return text

    try:
        # Passing the text through `params` URL-encodes it; interpolating it
        # into the URL breaks on characters such as '&', '#', '+' or '%'.
        response = get(
            "https://translate.googleapis.com/translate_a/single",
            params={
                "client": "gtx",
                "dt": "t",
                "sl": source_lang,
                "tl": target_lang,
                "q": text,
            },
            timeout=30,  # never hang a long batch on one stalled request
        )
        # Surface rate limiting / blocking as a readable HTTP error instead
        # of an opaque JSON decoding failure.
        response.raise_for_status()
        translated_json = response.json()
        if not translated_json or not translated_json[0]:
            return text
        # NOTE(review): the endpoint is undocumented. The payload's first
        # element appears to hold one entry per translated segment, with the
        # translated string at index 0. Join every segment: taking only the
        # first one drops the rest of a multi-sentence chunk.
        return "".join(
            segment[0] for segment in translated_json[0] if segment and segment[0]
        )
    except Exception as e:
        # Deliberate best-effort: report and fall back to the input text.
        print(f"Error translating: {e}")
        return text

def translate_full_text(text):
    """Translate ``text`` chunk by chunk and return the results joined by single spaces."""
    translated_parts = []
    # split_text is called with its default chunk size; every chunk is
    # translated independently, in order.
    for piece in split_text(text):
        translated_parts.append(translate_text(piece))
    return " ".join(translated_parts)


In [7]:
# Enables DataFrame/Series .progress_apply with a tqdm progress bar.
tqdm.pandas()

# Number of batches the pending rows are split into; each batch is written to
# the database as soon as it is translated, so an interrupted run keeps its
# finished batches.
NUM_CHUNKS = 20

# Make sure the target column exists before any UPDATE is issued.
with engine.connect() as connection:
    connection.execute(text("ALTER TABLE job_postings ADD COLUMN IF NOT EXISTS description_en TEXT;"))
    connection.commit()

df["language"] = df["description"].apply(detect_language)
if "description_en" not in df.columns:
    df["description_en"] = None

# Only Polish descriptions that have no stored translation yet.
df_to_translate = df[(df["language"] == "pl") & (df["description_en"].isnull())].copy()
total = len(df_to_translate)

if total == 0:
    # Without this guard chunk_size would be 0 and range(0, 0, 0) raises
    # ValueError, which made the cell fail on a re-run once everything had
    # already been translated.
    print("Nothing to translate: no Polish descriptions without description_en.")
else:
    chunk_size = math.ceil(total / NUM_CHUNKS)
    # The real batch count can be lower than NUM_CHUNKS for small totals.
    n_chunks = math.ceil(total / chunk_size)

    for chunk_no, i in enumerate(range(0, total, chunk_size), start=1):
        chunk = df_to_translate.iloc[i:i+chunk_size].copy()
        print(f"\n🔸 Translating chunk {chunk_no}/{n_chunks}")

        chunk["translated_description"] = chunk["description"].progress_apply(translate_full_text)

        # Save to DB: one transaction per row, so a failing row does not roll
        # back the others in the batch.
        failed_rows = 0
        for _, row in chunk.iterrows():
            try:
                with engine.begin() as conn:
                    conn.execute(
                        text("UPDATE job_postings SET description_en = :desc_en WHERE job_id = :job_id"),
                        {"desc_en": row["translated_description"], "job_id": row["job_id"]}
                    )
            except OperationalError as e:
                # The row is skipped; description_en stays NULL for it, so a
                # later run of this cell selects it again.
                failed_rows += 1
                print(f"OperationalError for job_id={row['job_id']}: {e}")
                time.sleep(3)

        print(f"Chunk {chunk_no} saved to database.")
        if failed_rows:
            print(f"{failed_rows} row(s) in chunk {chunk_no} could not be saved.")



🔸 Translating chunk 1/10


100%|██████████| 366/366 [05:38<00:00,  1.08it/s]


Chunk 1 saved to database.

🔸 Translating chunk 2/10


100%|██████████| 366/366 [05:43<00:00,  1.07it/s]


Chunk 2 saved to database.

🔸 Translating chunk 3/10


100%|██████████| 366/366 [27:36<00:00,  4.53s/it]


Chunk 3 saved to database.

🔸 Translating chunk 4/10


 58%|█████▊    | 214/366 [21:11<16:47,  6.63s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


100%|██████████| 366/366 [38:00<00:00,  6.23s/it]


Chunk 4 saved to database.

🔸 Translating chunk 5/10


 11%|█▏        | 42/366 [03:19<13:44,  2.54s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


100%|██████████| 366/366 [28:20<00:00,  4.65s/it]


Chunk 5 saved to database.

🔸 Translating chunk 6/10


  2%|▏         | 8/366 [01:13<1:48:04, 18.11s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


 44%|████▍     | 161/366 [11:43<12:36,  3.69s/it] 

Error translating: Expecting value: line 1 column 1 (char 0)


100%|██████████| 366/366 [25:18<00:00,  4.15s/it]


Chunk 6 saved to database.

🔸 Translating chunk 7/10


100%|██████████| 366/366 [09:10<00:00,  1.50s/it]


Chunk 7 saved to database.

🔸 Translating chunk 8/10


 51%|█████▏    | 188/366 [06:31<09:37,  3.24s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


 81%|████████▏ | 298/366 [13:19<03:39,  3.23s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


100%|██████████| 366/366 [17:24<00:00,  2.85s/it]


Chunk 8 saved to database.

🔸 Translating chunk 9/10


100%|██████████| 366/366 [19:31<00:00,  3.20s/it]


Chunk 9 saved to database.

🔸 Translating chunk 10/10


100%|██████████| 366/366 [20:51<00:00,  3.42s/it]


Chunk 10 saved to database.

🔸 Translating chunk 11/10


100%|██████████| 366/366 [21:03<00:00,  3.45s/it]


Chunk 11 saved to database.

🔸 Translating chunk 12/10


100%|██████████| 366/366 [18:15<00:00,  2.99s/it]


Chunk 12 saved to database.

🔸 Translating chunk 13/10


100%|██████████| 366/366 [19:56<00:00,  3.27s/it]


Chunk 13 saved to database.

🔸 Translating chunk 14/10


100%|██████████| 366/366 [20:57<00:00,  3.44s/it]


Chunk 14 saved to database.

🔸 Translating chunk 15/10


100%|██████████| 366/366 [19:43<00:00,  3.23s/it]


Chunk 15 saved to database.

🔸 Translating chunk 16/10


100%|██████████| 366/366 [21:34<00:00,  3.54s/it]


Chunk 16 saved to database.

🔸 Translating chunk 17/10


100%|██████████| 366/366 [22:42<00:00,  3.72s/it]


Chunk 17 saved to database.

🔸 Translating chunk 18/10


100%|██████████| 366/366 [30:24<00:00,  4.98s/it]


Chunk 18 saved to database.

🔸 Translating chunk 19/10


 50%|█████     | 183/366 [17:37<23:04,  7.57s/it]

Error translating: Expecting value: line 1 column 1 (char 0)


100%|██████████| 366/366 [30:23<00:00,  4.98s/it]


Chunk 19 saved to database.

🔸 Translating chunk 20/10


100%|██████████| 350/350 [21:27<00:00,  3.68s/it]


Chunk 20 saved to database.
