In [30]:
import math
import os
import re
from json import loads

import pandas as pd
from dotenv import load_dotenv
from langdetect import detect
from requests import get
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from tqdm import tqdm

In [31]:
# Connect to the database and load the postings that have a description.

dotenv_path = os.path.join("..", "scraping", ".env")
load_dotenv(dotenv_path)

# Fail early with a readable message instead of silently building a URL that
# contains the literal string "None" for an unset variable.
db_settings = {
    name: os.getenv(name)
    for name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
}
missing_db_settings = [name for name, value in db_settings.items() if value is None]
if missing_db_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(missing_db_settings)
    )

# URL.create() escapes special characters (e.g. "@", "/", ":") in the user name
# and password, which a hand-built f-string URL does not.
# DATABASE_URL is kept as a plain string so any other consumer still works.
DATABASE_URL = URL.create(
    drivername="postgresql",
    username=db_settings["DB_USER"],
    password=db_settings["DB_PASSWORD"],
    host=db_settings["DB_HOST"],
    port=int(db_settings["DB_PORT"]) if db_settings["DB_PORT"] else None,
    database=db_settings["DB_NAME"],
).render_as_string(hide_password=False)

engine = create_engine(DATABASE_URL)

# Job postings that have a description; `original_df` is the untouched copy.
query = "SELECT * FROM job_postings WHERE description IS NOT NULL;"
original_df = pd.read_sql(query, engine)
df = original_df.copy()

# Source rows (job_sources) for those same postings, matched on job_id.
query2 =  """
    SELECT job_sources.*
    FROM job_sources 
    INNER JOIN job_postings AS j 
    ON job_sources.job_id = j.job_id
    WHERE j.description IS NOT NULL;
"""
original_df2 = pd.read_sql(query2, engine)
df2 = original_df2.copy()

In [32]:
# Preview the first rows of the job_postings frame loaded above.
df.head()

Unnamed: 0,id,job_id,title,location,salary,experience_level,job_type,employment_type,description,responsibilities,requirements,skills,benefits,company,description_criteria
0,14048,4115475043,Senior AI Engineer proficient in Swift,"Warsaw, Mazowieckie, Poland (On-site)",,,,,We are seeking a highly motivated and experien...,,,,,"AIDA projektai, MB",Seniority level Mid-Senior level Employment ty...
1,8,4158353862,Junior Data Scientist (Business Operations),"Warsaw, Mazowieckie, Poland (On-site)",,,,,Job Description: The Business Operations Team...,,,,,Wolt,Seniority level Mid-Senior level Employment ty...
2,14872,4056210173,Senior Data Integration Engineer (Databricks),Poland (Remote),,,,,We are looking for a Senior Data Integration E...,,,,,EPAM Systems,Seniority level Mid-Senior level Employment ty...
3,9,4138407547,Stażysta / Stażystka w dziale analizy danych,"Poznań, Wielkopolskie, Poland (Hybrid)",,,,,Opis stanowiska: Zbieranie i organizacja dany...,,,,,Praca.pl,Seniority level Internship Employment type Ful...
4,15360,4161565375,Service Reliability Engineer,"Warsaw, Mazowieckie, Poland (Hybrid)",,,,,"Within CIB ITO Production organisation, Produc...",,,,,BNP Paribas CIB,Seniority level Entry level Employment type Fu...


In [33]:
# Preview the first rows of the job_sources frame loaded above.
df2.head()

Unnamed: 0,id,job_id,source,job_url,date_posted,is_active
0,1,4158353862,LinkedIn,https://www.linkedin.com/jobs/view/4158353862,2025-02-21,True
1,2,4138407547,LinkedIn,https://www.linkedin.com/jobs/view/4138407547,2025-02-21,True
2,3,4144364769,LinkedIn,https://www.linkedin.com/jobs/view/4144364769,2025-02-21,True
3,4,4083706042,LinkedIn,https://www.linkedin.com/jobs/view/4083706042,2025-02-21,True
4,5,4154765012,LinkedIn,https://www.linkedin.com/jobs/view/4154765012,2025-02-21,True


In [34]:
def detect_language(text):
    """Return the language code detected for ``text``, or ``"unknown"``.

    Args:
        text: The text to classify. Anything that is not a non-blank string
            is reported as ``"unknown"`` without calling the detector.

    Returns:
        Whatever ``detect`` returns for the text, or ``"unknown"`` when
        detection fails.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit must
        # propagate so a long-running ``.apply`` can actually be interrupted.
        return "unknown"

In [35]:


def split_text(text, max_length=500):
    """Split text into chunks of at most ``max_length`` characters.

    The text is split at sentence boundaries (whitespace that follows ``.``,
    ``!`` or ``?``), so sentence punctuation is kept and decimals or URLs such
    as ``3.5`` are not broken apart. Consecutive sentences are packed into one
    chunk, separated by a single space, while they fit. A single sentence
    longer than ``max_length`` is cut at the last space that fits, or
    mid-word when it contains no space.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_length: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunks, each at most ``max_length`` characters.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")
    if not text:
        return []

    chunks = []
    current = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Hard-split a sentence that can never fit into a single chunk.
        while len(sentence) > max_length:
            if current:
                chunks.append(current)
                current = ""
            cut = sentence.rfind(" ", 0, max_length + 1)
            if cut <= 0:
                cut = max_length  # no usable space: cut mid-word
            chunks.append(sentence[:cut].strip())
            sentence = sentence[cut:].strip()
        if not sentence:
            continue

        if current and len(current) + 1 + len(sentence) > max_length:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}" if current else sentence

    if current:
        chunks.append(current)

    return chunks


def translate_text(text, source_lang="pl", target_lang="en", timeout=10):
    """Translate text through the unofficial Google Translate endpoint.

    This is a best-effort call: on any failure the error is printed and the
    input text is returned unchanged.

    Args:
        text: The text to translate. Blank or non-string input is returned
            as is, without a network call.
        source_lang: Source language code sent as ``sl`` (default ``"pl"``).
        target_lang: Target language code sent as ``tl`` (default ``"en"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The translated text, or the original ``text`` when the request fails
        or the response contains no translation.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Passing the text via ``params`` URL-encodes it; interpolating it into
        # the URL lets characters such as "&", "#" or "+" corrupt the query.
        response = get(
            "https://translate.googleapis.com/translate_a/single",
            params={
                "client": "gtx",
                "dt": "t",
                "sl": source_lang,
                "tl": target_lang,
                "q": text,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        translated_json = loads(response.text)

        segments = translated_json[0] if translated_json else None
        if not segments:
            return text

        # The response holds one entry per translated segment; joining all of
        # them keeps the whole translation instead of only the first segment.
        translated = "".join(segment[0] for segment in segments if segment and segment[0])
        return translated or text
    except Exception as e:
        print(f"Error translating: {e}")
        return text

def translate_full_text(text):
    """Translate a long text piece by piece.

    The text is cut into chunks with ``split_text``, each chunk is passed to
    ``translate_text``, and the results are joined with single spaces.
    """
    return " ".join(map(translate_text, split_text(text)))


In [36]:
# Translate the Polish descriptions batch by batch and store the result in
# job_postings.description_en. Each batch is committed on its own, so an
# interrupted run keeps the batches that were already saved.
tqdm.pandas()

N_BATCHES = 10  # target number of batches; each one is one DB transaction

# Engine.begin() commits on exit. With engine.connect() and no explicit commit
# the DDL is rolled back under SQLAlchemy 2.x when the connection closes.
with engine.begin() as connection:
    connection.execute(text("ALTER TABLE job_postings ADD COLUMN IF NOT EXISTS description_en TEXT;"))

df["language"] = df["description"].apply(detect_language)
if "description_en" not in df.columns:
    # The frame was loaded before the column was added to the table.
    df["description_en"] = None

df_to_translate = df[(df["language"] == "pl") & (df["description_en"].isnull())].copy()
total = len(df_to_translate)

# max(1, ...) keeps range() valid when nothing is left to translate
# (a zero step raises ValueError); the loop then simply does not run.
chunk_size = max(1, math.ceil(total / N_BATCHES))
n_batches = math.ceil(total / chunk_size)

update_stmt = text("UPDATE job_postings SET description_en = :desc_en WHERE job_id = :job_id")

for batch_no, i in enumerate(range(0, total, chunk_size), start=1):
    chunk = df_to_translate.iloc[i:i+chunk_size].copy()
    print(f"\n🔸 Translating chunk {batch_no}/{n_batches}")

    chunk["translated_description"] = chunk["description"].progress_apply(translate_full_text)

    # Save to DB: one executemany call per batch, inside a single transaction.
    update_params = [
        {"desc_en": row["translated_description"], "job_id": row["job_id"]}
        for _, row in chunk.iterrows()
    ]
    with engine.begin() as conn:
        conn.execute(update_stmt, update_params)
    print(f"✅ Chunk {batch_no} saved to database.")


  0%|          | 22/8114 [00:19<2:02:09,  1.10it/s]


KeyboardInterrupt: 