#API scrapping of all the pages of Google Jobs

In [72]:
import requests

def fetch_all_google_jobs_data(api_key, query, location, hl="en", chips=None):
    """
    Récupère les données de toutes les pages de l'API Google Jobs pour une recherche donnée.

    Paramètres :
    api_key (str): Clé privée SerpApi.
    query (str): Requête de recherche.
    location (str, optionnel): Lieu géographique pour la recherche.
    hl (str): Langue pour la recherche.
    chips (list, optionnel): Conditions supplémentaires pour la requête.

    Retourne :
    list: Toutes les réponses JSON analysées de l'API.
    """
    all_results = []
    start = 0
    while True:
        # Paramètres pour la requête API
        params = {
            "engine": "google_jobs",
            "q": query,
            "start": start,
            "hl": hl,
            "api_key": api_key
        }

        if location:
            params["location"] = location

        if chips:
            params["chips"] = ','.join(chips)

        # URL de base pour l'API Google Jobs
        base_url = "https://serpapi.com/search.json"

        # Envoi d'une requête GET à l'API
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            all_results.append(data)

            # Vérifier s'il y a encore des résultats
            if "jobs_results" not in data or not data["jobs_results"]:
                break

            start += 20  # Incrémenter pour la prochaine page
        else:
            break

    return all_results

# Utilisation de la fonction
# Remplacez 'your_api_key' par votre clé SerpApi rée
complete_data = fetch_all_google_jobs_data("b384a7a8bd5ed660d935e64bc467147a6587986f9eeee21c92e5468fa37c0016", "Data", "Italy")


In [51]:
complete_data

[{'search_metadata': {'id': '65ac3c6dc9de455e8ce489f0',
   'status': 'Success',
   'json_endpoint': 'https://serpapi.com/searches/7adfbc37a851fbc3/65ac3c6dc9de455e8ce489f0.json',
   'created_at': '2024-01-20 21:34:37 UTC',
   'processed_at': '2024-01-20 21:34:37 UTC',
   'google_jobs_url': 'https://www.google.com/search?q=Data&ibp=htl;jobs&uule=w+CAIQICILU3dpdHplcmxhbmQ&hl=en&start=0',
   'raw_html_file': 'https://serpapi.com/searches/7adfbc37a851fbc3/65ac3c6dc9de455e8ce489f0.html',
   'total_time_taken': 0.74},
  'search_parameters': {'q': 'Data',
   'engine': 'google_jobs',
   'location_requested': 'Switzerland',
   'location_used': 'Switzerland',
   'google_domain': 'google.com',
   'hl': 'en',
   'start': 0},
  'jobs_results': [{'title': 'Flow Cytometry Data Analyst',
    'company_name': 'Roche',
    'location': '  Basel, Switzerland   ',
    'via': 'via Roche Careers',
    'description': "The Position\n\nIn Roche’s Pharmaceutical Research and Early Development organization (pRED),

In [42]:
import requests

payload = { 'api_key': '02c61b750bca71883a0e8dbb30c630b0', 'url': 'https://serpapi.com/search.json' }
r = requests.get('https://api.scraperapi.com/', params=payload)
print(r.text)


Request failed. You will not be charged for this request. Please make sure your url is correct and try the request again. If this continues to happen, please contact support@scraperapi.com. The domain may be protected, please try adding the premium=true OR ultra_premium=true parameter to your request.


In [61]:


import re

def extract_job_info(jobs_results):
    """
    Extrait les informations d'une liste d'offres d'emploi à partir de données JSON.

    :param jobs_results: Liste de dictionnaires contenant les données des emplois.
    :return: Liste de dictionnaires avec les informations extraites pour chaque emploi.
    """
    extracted_jobs = []
    for job_data in jobs_results:
        job_info = {
            "title": job_data.get("title", "N/A"),
            "company_name": job_data.get("company_name", "N/A"),
            "location": job_data.get("location", "N/A").strip(),
            "experience": extract_experience(job_data.get("description", "")),
            "technical_skills": extract_technical_skills(job_data.get("description", ""))
        }
        extracted_jobs.append(job_info)
    return extracted_jobs

def extract_experience(description):
    """
    Extrait l'expérience requise à partir de la description de l'emploi.

    :param description: La description de l'emploi.
    :return: Expérience requise (sous forme de texte).
    """
    experience_patterns = [
        r'(\d+ ans d’expérience)',
        r'expérience de (\d+ ans)',
        r'expérience d’au moins (\d+ ans)'
    ]
    for pattern in experience_patterns:
        match = re.search(pattern, description)
        if match:
            return match.group(0)
    return "Non spécifié"

def extract_technical_skills(description):
    """
    Extrait les compétences techniques de la description de l'emploi.

    :param description: La description de l'emploi.
    :return: Liste des compétences techniques.
    """
    # Liste des compétences techniques courantes à rechercher
    skills = [
    "Python", "R", "SQL", "Excel", "Tableau", "PowerBI", "Dataiku", "Snowflake",
    "GCP", "AWS", "AZURE", "Databricks", "Talend", "Spark", "Scala", "Dataiku",
    "Airflow", "S3", "Kafka", "Hadoop", "SQL", "NoSQL", "Cassandra", "MongoDB",
    "ELK", "Git", "Kubernetes", "Docker", "OVH", "Kanban", "Jira", "Jenkins",
    "ElasticSearch", "Linus", "Kibana", "LogStash", "keras", "Tensorflow", "Gitlab",
    "Snowflake", "NLP", "Random Forest", "XGBoost", "SQL Server", "Oracle", "LLM",
    "IA generative", "generative AI", "GPT", "Terraform", "Yolo", "Pytorch",
    "Transformers", "scikit-learn", "pySpark", "Pyspark", "PySpark", "SAS", "scipy",
    "Azure Data Factory", "Java", "VBA", "Matplotlib", "Seaborn", "Pandas",
    "NumPy", "SciKit-Learn", "Streamlit", "Dash", "Shiny", "Flask", "Django",
    "BigQuery", "Redshift", "PostgreSQL", "MySQL", "DB2", "SQLite", "MLflow",
    "Plotly", "Bokeh", "D3.js", "GraphQL", "REST API", "Microservices"
]


    found_skills = []
    for skill in skills:
        if skill.lower() in description.lower():
            found_skills.append(skill)
    return found_skills if found_skills else ["Non spécifié"]




In [69]:
import pandas as pd
from google.colab import files

def extract_job_info(complete_data):
    """
    Extrait les informations de toutes les offres d'emploi dans la structure de données complète.

    :param complete_data: Liste de dictionnaires contenant les métadonnées de recherche et les résultats des emplois.
    :return: Liste de dictionnaires avec les informations extraites pour chaque emploi.
    """
    all_job_infos = []
    for data in complete_data:
        jobs_results = data.get('jobs_results', [])
        for job_data in jobs_results:
            job_info = {
                "title": job_data.get("title", "N/A"),
                "company_name": job_data.get("company_name", "N/A"),
                "location": job_data.get("location", "N/A").strip(),
                "experience": extract_experience(job_data.get("description", "")),
                "technical_skills": extract_technical_skills(job_data.get("description", ""))
            }
            all_job_infos.append(job_info)
    return all_job_infos

# Exemple d'utilisation
extracted_job_infos = extract_job_info(complete_data)
# Création d'un DataFrame à partir de la liste des offres d'emploi
df = pd.DataFrame(extracted_job_infos)
# Sauvegarde du DataFrame dans un fichier CSV
df.to_csv("google_jobs_data.csv", index=False)


files.download('google_jobs_data.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [43]:
!pip install pandas



In [28]:
complete_data

[]

In [8]:
all_extracted_job_infos = []
for data in complete_data:
    job_infos = extract_job_info(data['jobs_results'])
    all_extracted_job_infos.extend(job_infos)

# Affichage des informations extraites pour chaque emploi
for job_info in all_extracted_job_infos:
    print(job_info)

KeyError: 'jobs_results'