In [1]:
# This notebook scrapes the formation (training course) pages of the Onisep website.
# It builds a dictionary of the form:
#
# Name of the formation:
# --> "Débouchés professionnels"
# ----> "Débouchés"
# ----> "Exemples de métiers"
#
# --> "Poursuivre mes études..."
# ----> "Poursuivre"
# ----> "Exemples de formations poursuivies"
#
# --> "Accès à la formation "
# ----> "Accès"
# ----> "Exemples de formations requises"

import os
import json
import numpy as np
import math
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup

In [2]:
# Useful paths.
# The project root can be overridden with the IMPALA_FORMATIONS_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used.
dir_path = os.environ.get(
    'IMPALA_FORMATIONS_DIR',
    '/home/timothee/PycharmProjects/ImpalaPoleEmploi2/Impala/Formations/',
)
# Input data folder
data_dir_path = os.path.join(dir_path, '0 - data')
# Output folder
# NOTE(review): not referenced by the other cells visible in this notebook,
# which write under os.path.join(dir_path, 'output/...') instead — confirm.
output_dir_path = os.path.join(dir_path, '3 - Formation Scraper/output')

# Final merged output files (scraped formations and failed rows)
formations_output_path = os.path.join(dir_path, 'formations.json')
errors_output_path = os.path.join(dir_path, 'errors.json')

In [4]:
# Table of links to the formation pages (8095 formations according to the
# original author).
formations_file_path = os.path.join(data_dir_path, 'Onisep_formation.csv')
df = pd.read_csv(formations_file_path, delimiter=";")
# Keep three columns by position. get_formations reads "libellé principal",
# "libellé complémentaire" and "lien site onisep.fr" from this frame, so these
# positions presumably map to those columns — TODO confirm against the CSV.
kept_column_positions = [2, 3, 7]
formations_links = df.iloc[:, kept_column_positions]

In [5]:
def _download_clean_page(link):
    """Download the page at `link` and return its normalised HTML text."""
    # The context manager closes the HTTP response even if read/decode fails.
    with request.urlopen(link) as response:
        page = response.read().decode('utf-8')

    # Character substitutions kept from the original author
    # ("because encode/decode error").
    page = page.replace('€', 'euros')
    page = page.replace(u"\u2019", "'")
    # NOTE(review): U+2026 is the horizontal ellipsis; mapping it to "é" looks
    # suspicious but is preserved so existing outputs do not change — confirm.
    page = page.replace(u"\u2026", "é")
    page = page.replace(u"\u0153", "oe")

    page = page.replace('\n', '')  # remove new lines
    page = page.replace('  ', '')  # remove every run of two spaces
    return page


def _find_titled_tab(soup, tab_id, expected_title):
    """Return the <div id=tab_id> if its <h2> text equals `expected_title`, else None."""
    tab = soup.find(name='div', attrs={'id': tab_id})
    if tab is None:
        return None
    # extract() detaches the <h2> from the tab before its text is compared.
    # A tab without <h2> raises AttributeError, which the caller turns into
    # the 'error' result (same outcome as before).
    title = tab.h2.extract().get_text()
    if title == expected_title:
        return tab
    return None


def parse_formation_page(link):
    """Scrape the three information tabs of one Onisep formation page.

    Parameters
    ----------
    link : str
        URL of the formation page.

    Returns
    -------
    dict or str
        A dictionary with, for each tab that is present with the expected
        title, one entry keyed by that title:

        - "Débouchés professionnels": {"Débouchés": [...], "Exemples de métiers": [...]}
        - "Accès à la formation " (trailing space is part of the key):
          {"Accès": [...], "Exemples de formations requises": [...]}
        - "Poursuivre mes études...":
          {"Poursuivre": [...], "Exemples de formations poursuivies": [...]}

        The string 'error' is returned when anything fails (invalid URL,
        network error, decoding error, unexpected page structure). No file
        is written.
    """
    try:
        soup = BeautifulSoup(_download_clean_page(link), 'html.parser')
        formation = {}

        # Tab "Débouchés professionnels": paragraphs are outlets, list items are jobs.
        tab = _find_titled_tab(soup, 'oni_onglet-1', "Débouchés professionnels")
        if tab is not None:
            outlets = []
            job_examples = []
            for element in tab.find_all():
                text = element.get_text()
                if element.name == 'p' and text not in ["Exemple(s) de métier(s):"]:
                    outlets.append(text)
                if element.name == 'li':
                    job_examples.append(text)
            formation["Débouchés professionnels"] = {
                "Débouchés": outlets,
                "Exemples de métiers": job_examples,
            }

        # Tab "Accès à la formation": list items containing a link are treated
        # as required formations, everything else as access conditions.
        tab = _find_titled_tab(soup, 'oni_onglet-2', "Accès à la formation ")
        if tab is not None:
            access = []
            required_formations = []
            for element in tab.find_all():
                text = element.get_text()
                if element.name == 'p' and text not in ["Admission", "Exemples de formations requises:"]:
                    access.append(text)
                if element.name == 'li':
                    if element.find(name='a') is None:
                        access.append(text)
                    else:
                        required_formations.append(text)
            formation["Accès à la formation "] = {
                "Accès": access,
                "Exemples de formations requises": required_formations,
            }

        # Tab "Poursuivre mes études...": paragraphs (stripped) are the general
        # text, list items are examples of follow-up formations.
        tab = _find_titled_tab(soup, 'oni_onglet-3', "Poursuivre mes études...")
        if tab is not None:
            further_studies = []
            next_formations = []
            for element in tab.find_all():
                text = element.get_text()
                if element.name == 'li':
                    next_formations.append(text)
                if element.name == 'p' and text not in ["Poursuite d'études conditionnelle"]:
                    further_studies.append(text.strip())
            formation["Poursuivre mes études..."] = {
                "Poursuivre": further_studies,
                "Exemples de formations poursuivies": next_formations,
            }

        return formation

    except Exception:
        # Best-effort scraping: any failure is reported through the 'error'
        # sentinel. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate so a long batch can still be interrupted.
        return 'error'

#formation = parse_formation_page("http://www.onisep.fr/Ressources/Univers-Formation/Formations/Post-bac/BTS-Maintenance-des-systemes-electro-navals")
#formation

#formation = parse_formation_page("http://www.onisep.fr/Ressources/Univers-Formation/Formations/Post-bac/BTS-Maintenance-des-systemes-electro-navals")
#formation

In [6]:
def get_name_formation(link):
    """Return the title of an Onisep formation page.

    Parameters
    ----------
    link : str
        URL of the formation page.

    Returns
    -------
    str
        Text of the first <h1> element of the page, or the string 'error'
        when anything fails (invalid URL, network error, decoding error,
        page without <h1>).
    """
    try:
        # The context manager closes the HTTP response even if read/decode fails.
        with request.urlopen(link) as response:
            page = response.read().decode('utf-8')

        # Character substitutions kept from the original author
        # ("because encode/decode error").
        # NOTE(review): U+2026 is the horizontal ellipsis; mapping it to "é"
        # looks suspicious but is preserved — confirm.
        substitutions = (
            ('€', 'euros'),
            (u"\u2019", "'"),
            (u"\u2026", "é"),
            (u"\u0153", "oe"),
            ('\n', ''),   # remove new lines
            ('  ', ''),   # remove every run of two spaces
        )
        for old, new in substitutions:
            page = page.replace(old, new)

        soup = BeautifulSoup(page, 'html.parser')

        bloc = soup.find(name='h1')
        # bloc is None when the page has no <h1>; the resulting AttributeError
        # is reported as 'error' below.
        return bloc.get_text()

    except Exception:
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate so a long scraping run can still be interrupted.
        return 'error'
        
#name = get_name_formation('http://www.onisep.fr/Ressources/Univers-Formation/Formations/Post-bac/BTS-Management-des-unites-commerciales')
#name

In [7]:
def _csv_formation_label(row):
    """Build a formation label from the CSV columns of one row of formations_links."""
    label = row["libellé principal"]
    complement = str(row["libellé complémentaire"])
    # Missing values read from the CSV stringify to 'nan'.
    if complement != 'nan':
        label = label + ' : ' + complement
    return label


def get_formations(i, j):
    """Scrape rows i (inclusive) to j (exclusive) of formations_links.

    Two JSON files are written under dir_path:

    - output/formations_<i>_<j>.json : {formation name: parsed content}
    - output/errors_<i>_<j>.json     : list of labels of the rows whose page
      could not be downloaded or parsed.

    Parameters
    ----------
    i, j : int
        Positional row bounds passed to formations_links.iloc[i:j].

    Returns
    -------
    None
    """
    formations = {}
    errors = []
    for _, row in formations_links.iloc[i:j, :].iterrows():
        link = row["lien site onisep.fr"]
        formation_content = parse_formation_page(link)
        # Compare by value: `is not 'error'` only worked through string
        # interning and raises a SyntaxWarning on recent Python versions.
        if formation_content != 'error':
            # The title is only fetched for pages that parsed, which avoids a
            # useless second download for failing links.
            name = get_name_formation(link)
            if name == 'error':
                # Fall back to the CSV label rather than storing the entry
                # under the meaningless key 'error'.
                name = _csv_formation_label(row)
            formations[name] = formation_content
        else:
            errors.append(_csv_formation_label(row))

    # Dump the scraped formations as JSON
    formations_output_path = os.path.join(dir_path, 'output/formations_' + str(i) + '_' + str(j) + '.json')
    # Make sure the output folder exists before writing into it.
    os.makedirs(os.path.dirname(formations_output_path), exist_ok=True)
    with open(formations_output_path, 'w') as f:
        json.dump(formations, f, indent=4)

    # Dump the list of failed formations as JSON
    errors_output_path = os.path.join(dir_path, 'output/errors_' + str(i) + '_' + str(j) + '.json')
    with open(errors_output_path, 'w') as f:
        json.dump(errors, f, indent=4)

In [8]:
# Run on a small slice (rows 25 to 27) — presumably a quick trial before the full batches.
get_formations(25,28)

In [9]:
# Batch 1: rows 0-999 -> output/formations_0_1000.json and output/errors_0_1000.json
get_formations(0,1000)

In [10]:
# Batch 2: rows 1000-1999 -> output/formations_1000_2000.json and output/errors_1000_2000.json
get_formations(1000,2000)

In [11]:
# Batch 3: rows 2000-2999 -> output/formations_2000_3000.json and output/errors_2000_3000.json
get_formations(2000,3000)

In [12]:
# Batch 4: rows 3000-3999 -> output/formations_3000_4000.json and output/errors_3000_4000.json
get_formations(3000,4000)

In [13]:
# Batch 5: rows 4000-4999 -> output/formations_4000_5000.json and output/errors_4000_5000.json
get_formations(4000,5000)

In [14]:
# Batch 6: rows 5000-5999 -> output/formations_5000_6000.json and output/errors_5000_6000.json
get_formations(5000,6000)

In [15]:
# Batch 7: rows 6000-6999 -> output/formations_6000_7000.json and output/errors_6000_7000.json
get_formations(6000,7000)

In [16]:
# Batch 8: rows 7000-7999 -> output/formations_7000_8000.json and output/errors_7000_8000.json
get_formations(7000,8000)

In [17]:
# Last batch: rows from 8000 up to (excluding) 8096 -> output/formations_8000_8096.json
# and output/errors_8000_8096.json
get_formations(8000,8096)

In [5]:
# Merge the per-batch formation files written by get_formations into a single
# dictionary. Consecutive pairs of bounds are the (i, j) arguments used for
# each batch, and therefore the suffixes of the batch file names.
batch_bounds = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8096]

formations = {}
for batch_index, (start, stop) in enumerate(zip(batch_bounds, batch_bounds[1:])):
    batch_path = os.path.join(dir_path, 'output/formations_' + str(start) + '_' + str(stop) + '.json')
    with open(batch_path, 'r') as f:
        batch = json.load(f)

    # When the same formation name appears in several batches, the entry from
    # the later batch replaces the earlier one.
    formations.update(batch)

    # Progress report: size of the first batch, then the cumulative number of
    # distinct names next to the size of the batch just merged.
    if batch_index == 0:
        print(len(batch))
    else:
        print(len(formations), len(batch))

708
1118 410
1357 239
1537 180
1734 197
1899 165
2070 171
2392 322
2404 12


In [7]:
# Concatenate the per-batch error lists written by get_formations into a
# single list. Consecutive pairs of bounds are the (i, j) arguments used for
# each batch, and therefore the suffixes of the batch file names.
batch_bounds = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8096]

errors = []
for start, stop in zip(batch_bounds, batch_bounds[1:]):
    batch_path = os.path.join(dir_path, 'output/errors_' + str(start) + '_' + str(stop) + '.json')
    with open(batch_path, 'r') as f:
        # Each file holds a JSON list of formation labels; batch order is kept.
        errors.extend(json.load(f))

In [10]:
# Destination files for the merged results
formations_output_path = os.path.join(dir_path, 'formations.json')
errors_output_path = os.path.join(dir_path, 'errors.json')

# Write the merged formations dictionary, then the merged error list, as
# indented JSON.
for payload, destination in (
    (formations, formations_output_path),
    (errors, errors_output_path),
):
    with open(destination, 'w') as f:
        json.dump(payload, f, indent=4)

In [5]:
# Boucle pour récupérer les formations
# La liste "errors" contient les formations avec des liens inexistants

#formations = {}
#errors = []
#for row in formations_links.iterrows():
#    name = row[1]["libellé principal"]
#    link = row[1]["lien site onisep.fr"]
#    formation_content = parse_formation_page(link)
#    if formation_content is not 'error':
#        formations[name] = formation_content
#    else:
#        errors.append(name)

NameError: name 'formations_links' is not defined