In [9]:
import os
import random
import re
import time
from datetime import datetime

import pandas as pd
import requests

APP_ID = "019ec8e4"
APP_KEY = "2a9e14f02cc7cdff6bedf6aa26fc4314"
COUNTRIES = ["at", "au", "be", "br", "ca", "ch", "de", "es", "fr", "gb", "in", "it", "mx", "nl", "nz", "pl", "sg", "us", "za"]
RESULTS_PER_PAGE = 100
NUM_PAGES = 10
DELAY_RANGE = (5, 10)

JOB_TITLES = [
    "Data Scientist",
    "Data Analyst",
    "Data Engineer",
    "Machine Learning Engineer",
    "Analytics Consultant"
]

KEYWORDS = {
    "Python", "SQL", "R", "Machine Learning", "Deep Learning", "TensorFlow", "PyTorch",
    "Scikit-Learn", "Big Data", "Hadoop", "Spark", "Kafka", "AWS", "Azure", "GCP",
    "Tableau", "Power BI", "Google Analytics", "Snowflake", "Kubernetes", "Docker",
    "Apache Airflow", "ETL", "Data Pipelines", "KPI", "Java", "Pandas"
}

def extract_keywords(description):
    if not description:
        return None
    found_keywords = {kw for kw in KEYWORDS if re.search(rf"\b{kw}\b", description, re.IGNORECASE)}
    return ", ".join(sorted(found_keywords)) if found_keywords else None

all_jobs = []

for job_title in JOB_TITLES:
    for country in COUNTRIES:
        print(f"\n Searching for: {job_title} in {country}")

        for page in range(1, NUM_PAGES + 1):
            print(f"  Fetching page {page} for '{job_title}' in {country}...")

            
            url = (f"https://api.adzuna.com/v1/api/jobs/{country}/search/{page}"
                   f"?app_id={APP_ID}&app_key={APP_KEY}&what={job_title.replace(' ', '%20')}"
                   f"&results_per_page={RESULTS_PER_PAGE}")

            try:
                response = requests.get(url)
                response.raise_for_status()
                data = response.json()

                if "results" not in data:
                    print(f"   No results found for '{job_title}' on page {page}.")
                    continue

                
                for job in data["results"]:
                    latitude = job.get("latitude")
                    longitude = job.get("longitude")
                    salary_min = job.get("salary_min")
                    salary_max = job.get("salary_max")
                    created_time = job.get("created")
                    description_text = job.get("description", "")

                    # Nur Jobs mit Geodaten & Gehalt speichern
                    if (latitude is not None and longitude is not None) and (salary_min is not None or salary_max is not None):
                        extracted_skills = extract_keywords(description_text)

                        # **Nur speichern, wenn Skills vorhanden sind**
                        if extracted_skills:
                            created_date = datetime.strptime(created_time, "%Y-%m-%dT%H:%M:%SZ").date() if created_time else None

                            all_jobs.append({
                                "Job Title": job.get("title"),
                                "Company": job.get("company", {}).get("display_name"),
                                "Location": job.get("location", {}).get("display_name"),
                                "Salary Min": salary_min,
                                "Salary Max": salary_max,
                                "Description": description_text,
                                "Extracted Skills": extracted_skills,
                                "Longitude": longitude,
                                "Latitude": latitude,
                                "URL": job.get("redirect_url"),
                                "Created": created_date
                            })

            except requests.exceptions.RequestException as e:
                print(f"    Error on page {page} for '{job_title}' in {country}: {e}")
                continue

            
            delay = random.uniform(*DELAY_RANGE)
            print(f"  Waiting {round(delay, 2)} seconds before next request...")
            time.sleep(delay)


jobs_df = pd.DataFrame(all_jobs)
csv_filename = "filtered_jobs_with_skills.csv"
jobs_df.to_csv(csv_filename, index=False)

print(f"\n Filtered job data saved to '{csv_filename}'")



🔍 Searching for: Data Scientist in at
  📄 Fetching page 1 for 'Data Scientist' in at...
  ⏳ Waiting 6.05 seconds before next request...
  📄 Fetching page 2 for 'Data Scientist' in at...
  ⏳ Waiting 8.15 seconds before next request...
  📄 Fetching page 3 for 'Data Scientist' in at...
  ⏳ Waiting 5.03 seconds before next request...
  📄 Fetching page 4 for 'Data Scientist' in at...
  ⏳ Waiting 6.71 seconds before next request...
  📄 Fetching page 5 for 'Data Scientist' in at...
  ⏳ Waiting 7.31 seconds before next request...
  📄 Fetching page 6 for 'Data Scientist' in at...
  ⏳ Waiting 5.76 seconds before next request...
  📄 Fetching page 7 for 'Data Scientist' in at...
  ⏳ Waiting 8.05 seconds before next request...
  📄 Fetching page 8 for 'Data Scientist' in at...
  ⏳ Waiting 7.83 seconds before next request...
  📄 Fetching page 9 for 'Data Scientist' in at...
  ⏳ Waiting 8.33 seconds before next request...
  📄 Fetching page 10 for 'Data Scientist' in at...
  ⏳ Waiting 6.08 seconds bef

In [19]:
import re
import requests
import random
import time
import pandas as pd
import os
from datetime import datetime


APP_ID = "019ec8e4"
APP_KEY = "2a9e14f02cc7cdff6bedf6aa26fc4314"
COUNTRIES = ["at", "au", "be", "br", "ca", "ch", "de", "es", "fr", "gb", "in", "it", "mx", "nl", "nz", "pl", "sg", "us", "za"]
RESULTS_PER_PAGE = 100
NUM_PAGES = 10
DELAY_RANGE = (5, 10)


JOB_TITLES = [
    "Data Scientist",
    "Data Analyst",
    "Data Engineer",
    "Machine Learning Engineer",
    "Analytics Consultant"
]


KEYWORDS = {
    "Python", "SQL", "R", "Machine Learning", "Deep Learning", "TensorFlow", "PyTorch",
    "Scikit-Learn", "Big Data", "Hadoop", "Spark", "Kafka", "AWS", "Azure", "GCP",
    "Tableau", "Power BI", "Google Analytics", "Snowflake", "Kubernetes", "Docker",
    "Apache Airflow", "ETL", "Data Pipelines", "KPI", "Java", "Pandas"
}

def extract_keywords(description):
    if not description:
        return None
    found_keywords = {kw for kw in KEYWORDS if re.search(rf"\b{kw}\b", description, re.IGNORECASE)}
    return ", ".join(sorted(found_keywords)) if found_keywords else None

csv_filename = "filtered_jobs_with_skills_plus.csv"
existing_urls = set()

if os.path.exists(csv_filename):
    old_data = pd.read_csv(csv_filename)
    if "URL" in old_data.columns:
        existing_urls = set(old_data["URL"].dropna())

all_jobs = []


for job_title in JOB_TITLES:
    for country in COUNTRIES:
        print(f"\n🔍 Searching for: {job_title} in {country}")

        for page in range(1, NUM_PAGES + 1):
            print(f"  📄 Fetching page {page} for '{job_title}' in {country}...")

            
            url = (f"https://api.adzuna.com/v1/api/jobs/{country}/search/{page}"
                   f"?app_id={APP_ID}&app_key={APP_KEY}&what={job_title.replace(' ', '%20')}"
                   f"&results_per_page={RESULTS_PER_PAGE}")

            try:
                response = requests.get(url)
                response.raise_for_status()
                data = response.json()

                if "results" not in data:
                    print(f"   No results found for '{job_title}' on page {page}.")
                    continue

                
                for job in data["results"]:
                    job_url = job.get("redirect_url")

                     
                    if job_url in existing_urls:
                        continue  

                    latitude = job.get("latitude")
                    longitude = job.get("longitude")
                    salary_min = job.get("salary_min")
                    salary_max = job.get("salary_max")
                    created_time = job.get("created")
                    description_text = job.get("description", "")


                    if (latitude is not None and longitude is not None) and (salary_min is not None or salary_max is not None):
                        extracted_skills = extract_keywords(description_text)

                        if extracted_skills:
                            created_date = datetime.strptime(created_time, "%Y-%m-%dT%H:%M:%SZ").date() if created_time else None

                            all_jobs.append({
                                "Job Title": job.get("title"),
                                "Company": job.get("company", {}).get("display_name"),
                                "Location": job.get("location", {}).get("display_name"),
                                "Salary Min": salary_min,
                                "Salary Max": salary_max,
                                "Description": description_text,
                                "Extracted Skills": extracted_skills,
                                "Longitude": longitude,
                                "Latitude": latitude,
                                "URL": job_url,
                                "Created": created_date
                            })

            except requests.exceptions.RequestException as e:
                print(f"    Error on page {page} for '{job_title}' in {country}: {e}")
                continue

            delay = random.uniform(*DELAY_RANGE)
            print(f"  ⏳ Waiting {round(delay, 2)} seconds before next request...")
            time.sleep(delay)

if all_jobs:
    new_jobs_df = pd.DataFrame(all_jobs)
    new_jobs_df.to_csv(csv_filename, mode="a", header=not os.path.exists(csv_filename), index=False)

    print(f"\n {len(new_jobs_df)} new job(s) added to '{csv_filename}'")

else:
    print("\n No new jobs found. The dataset is already up to date.")



🔍 Searching for: Data Scientist in at
  📄 Fetching page 1 for 'Data Scientist' in at...
  ⏳ Waiting 7.8 seconds before next request...
  📄 Fetching page 2 for 'Data Scientist' in at...
  ⏳ Waiting 6.44 seconds before next request...
  📄 Fetching page 3 for 'Data Scientist' in at...
  ⏳ Waiting 7.3 seconds before next request...
  📄 Fetching page 4 for 'Data Scientist' in at...
  ⏳ Waiting 9.2 seconds before next request...
  📄 Fetching page 5 for 'Data Scientist' in at...
    ❌ Error on page 5 for 'Data Scientist' in at: 503 Server Error: Service Temporarily Unavailable for url: https://api.adzuna.com/v1/api/jobs/at/search/5?app_id=019ec8e4&app_key=2a9e14f02cc7cdff6bedf6aa26fc4314&what=Data%20Scientist&results_per_page=100
  📄 Fetching page 6 for 'Data Scientist' in at...
  ⏳ Waiting 7.62 seconds before next request...
  📄 Fetching page 7 for 'Data Scientist' in at...
  ⏳ Waiting 9.53 seconds before next request...
  📄 Fetching page 8 for 'Data Scientist' in at...
  ⏳ Waiting 5.31 sec

KeyboardInterrupt: 