In [1]:
# import pandas as pd
# from deep_translator import GoogleTranslator

# def translate_to_english(text):
#     return GoogleTranslator(source='auto', target='en').translate(text)

# from deep_translator import GoogleTranslator
# import pandas as pd
from deep_translator import GoogleTranslator
import pandas as pd
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def chunk_text(text, max_len=4900):
    """Split long text into chunks at sentence boundaries"""
    if not isinstance(text, str) or len(text) <= max_len:
        return [text]
    
    chunks = []
    while text:
        chunk = text[:max_len]
        # Find the last natural break point
        last_period = chunk.rfind('. ')
        last_newline = chunk.rfind('\n')
        split_at = max(last_period, last_newline, max_len//2)  # Fallback to middle if no break found
        chunks.append(text[:split_at+1])
        text = text[split_at+1:]
    return chunks

def translate_with_retry(text, max_retries=3, delay=1):
    """Translate with retry logic for API failures"""
    translator = GoogleTranslator(source='auto', target='en')
    
    for attempt in range(max_retries):
        try:
            if len(text) > 4900:
                chunks = chunk_text(text)
                return ' '.join([translator.translate(chunk) for chunk in chunks])
            return translator.translate(text)
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to translate: {text[:50]}... (Error: {str(e)})")
                return text  # Return original if all retries fail
            time.sleep(delay * (attempt + 1))
    return text

def translate_column_optimized(df, column_name, batch_size=50, workers=4):
    """Optimized translation with parallel processing"""
    # Clean and get unique texts
    unique_texts = df[column_name].dropna().astype(str).unique().tolist()
    
    # Parallel translation
    translated = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = []
        for i in tqdm(range(0, len(unique_texts), batch_size), desc="Submitting batches"):
            batch = unique_texts[i:i+batch_size]
            futures.append(executor.submit(lambda x: [translate_with_retry(t) for t in x], batch))
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
            translated.extend(future.result())
    
    # Create mapping and apply
    translation_map = dict(zip(unique_texts, translated))
    return df[column_name].astype(str).map(translation_map)


In [20]:

df3 = pd.read_csv("stepstone_jobs[51-75].csv")
df2 = pd.read_csv("stepstone_jobs[26-50].csv")
df1 = pd.read_csv("stepstone_jobs[1-25].csv")
df4 = pd.read_csv("stepstone_jobs[76-100].csv")
df5 = pd.read_csv("stepstone_jobs[101-150].csv")
df6 = pd.read_csv("stepstone_jobs[151-190].csv")

DF = pd.concat([df1,df2,df3,df4,df5,df6],ignore_index=True)


# df["title"] = translate_column(df,"title")


In [2]:
DF = pd.read_csv("job_postings.csv")
DF["description"] = translate_column_optimized(DF,"description")
DF.to_csv("job_postings.csv")

Submitting batches: 100%|█████████████████████| 74/74 [00:00<00:00, 2893.46it/s]
Translating: 100%|██████████████████████████████| 74/74 [12:58<00:00, 10.52s/it]


In [5]:
DF["requirements"] = translate_column_optimized(DF,"requirements")
DF.to_csv("job_postings.csv")

Submitting batches: 100%|█████████████████████| 84/84 [00:00<00:00, 3219.96it/s]
Translating: 100%|██████████████████████████████| 84/84 [10:44<00:00,  7.68s/it]


In [17]:
df = pd.read_csv("job_postings.csv")
#df.drop(columns=df.)

df = df.drop([df.columns[0], df.columns[1]], axis = 'columns')

In [21]:
df.to_csv("job_postings.csv")

In [20]:
df

Unnamed: 0,title,company,location,job_type,employment_type,description,requirements
0,Business Analyst (m/w/d) - im Team Data Scienc...,Markant Gruppe,Offenburg,Feste Anstellung,"Homeoffice möglich, Vollzeit",There is a lot of know-how in our medium-sized...,Ongoing studies in the field of computer scien...
1,Procurement Specialist (m/w/d) Data Science & AI,Leadec Management Central Europe BV & Co. KG,Stuttgart,Feste Anstellung,"Homeoffice möglich, Vollzeit","With 14,500 employees worldwide and more than ...",You bring many years of professional experienc...
2,DHBW studies - data science and artificial int...,Liebherr-International Deutschland GmbH,Biberach,,,Monari is a future-oriented and international ...,Successfully completed degree in business info...
3,Work student in the Data Science area with Pyt...,ARAG SE,Düsseldorf,"Studentenjobs, Werkstudent","Homeoffice möglich, Teilzeit",IONITY is a joint venture of the car manufactu...,You have a successfully completed degree in th...
4,Dual studies Data Science & Artificial Intelli...,Deloitte,"Düsseldorf, Mannheim, Stuttgart","Ausbildung, Studium",Vollzeit,ZS2Radätteilen GmbH is one of the leading Germ...,Completed scientific university studies in com...
...,...,...,...,...,...,...,...
4544,IT specialist at the Norderstedt (m/f/d),FERCHAU – Connecting People and Technologies,Norderstedt,Feste Anstellung,"Homeoffice möglich, Vollzeit","Combining people and technologies, designing t...",Completed university education at an applied u...
4545,Project coordinator formulation and process de...,FERCHAU – Connecting People and Technologies,Biberach an der Riß,Feste Anstellung,Vollzeit,You don't want to leave your career developmen...,Practice from the user support: You have well-...
4546,Quercomers production pharmaceutical (m/f/d),FERCHAU – Connecting People and Technologies,Biberach an der Riß,Feste Anstellung,Vollzeit,You don't want to leave your career developmen...,Technical basis with depth: You have several y...
4547,Laborant / cta / bta (M / W / D),FERCHAU – Connecting People and Technologies,Biberach an der Riß,Feste Anstellung,Vollzeit,Finding the best minds for a wide variety of t...,Completed training as an IT system administrat...
