In [1]:
from bs4 import BeautifulSoup
from datetime import date

import numpy as np
import os
import pandas as pd
import requests

In [2]:
# Capture the run date once; it is stamped on every row scraped further down.
today = date.today()
d = today.isoformat()  # calendar date rendered as YYYY-MM-DD
print("Today's date:", d)

Today's date: 2024-04-25


This URL accepts a ``#x`` suffix at the very end, where `x` is the department number.

You can use ```.format()``` to add it dynamically to the url

I wasn't able to make use of this, though, because adding the suffix does not change the page that is returned.
It's just worth mentioning.

In [3]:
# Address of the Bison Futé recap page that the following cells download and parse.
bison_url = "http://tipi.bison-fute.gouv.fr/bison-fute-ouvert/publicationsDIR/Evenementiel-DIR/cnir/RecapBouchonsFranceEntiere.html"
bison_url

'http://tipi.bison-fute.gouv.fr/bison-fute-ouvert/publicationsDIR/Evenementiel-DIR/cnir/RecapBouchonsFranceEntiere.html'

In [4]:
# Download the recap page. The timeout keeps a stalled connection from hanging the
# notebook indefinitely, and raise_for_status() stops here on an HTTP error instead
# of letting an error page be parsed as if it were traffic data.
html = requests.get(bison_url, timeout=30)
html.raise_for_status()

In [5]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' parser.
bsobj = BeautifulSoup(html.content, 'html.parser')
bsobj  # NOTE(review): echoes the whole parsed document into the cell output


<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
<style type="text/css">body {
    font-family:arial,tahoma,helvetica,sans-serif;
    font-size:10pt;
    font-size-adjust:none;
    font-style:normal;
    font-variant:normal;
    font-weight:normal;
}

/*.version-regroupement {
	font-family: arial, tahoma, helvetica, sans-serif;
	font-size: 10pt;
	font-size-adjust: none;
	font-style: normal;
	font-variant: normal;
	font-weight: normal;
}*/

.nature {
	font-weight: bold;
}

.confidentialite-interne * {
	color: green !important;
	font-style: italic;
}

.confidentialite-autorites * {
	color: #CC0000 !important;
}

.confidentialite-gestionnaires-routiers * {
	color: blue !important;
}

.attribut-gestionnaire {
	color: blue;
}

.attribut-interne {
	color: green;
}

.attribut-important {
	font-weight: bold;
}

.separator-date+.separator-date:before {
	content: " - ";
}

.text-multiline {
	white-space: normal;
}
div.titreRecap {
    text-align:center;
}
div.bl

In [6]:
def process_data(next_element, keys, qnames, importance, department_name, df, scrape_date=None):
    """Extract one traffic event from an HTML element and append it to ``df``.

    For every ``(key, qname)`` pair, the function looks inside ``next_element``
    for a ``<span>`` whose ``qname`` attribute equals ``qname``. When there is
    none, it falls back to a ``<span>`` whose CSS class equals ``qname``. The
    whitespace-stripped text of the match is stored under ``key``; ``None`` is
    stored when nothing matches or when the lookup raises ``AttributeError``
    (for example because ``next_element`` is ``None``).

    Args:
        next_element: Object offering a BeautifulSoup-style ``find`` method,
            searched for the requested spans.
        keys: Column names under which the extracted texts are stored.
        qnames: ``qname`` attribute values (or class names) to look up,
            paired with ``keys`` by position.
        importance: Importance marker string; its length is stored in the
            ``importance`` column.
        department_name: Value stored in the ``department`` column.
        df: DataFrame to extend. It is not modified.
        scrape_date: Value stored in the ``date`` column. Defaults to today's
            date formatted as ``YYYY-MM-DD``, so the function does not depend
            on a notebook-level variable having been defined beforehand.

    Returns:
        A new DataFrame made of ``df`` followed by one row for this event.
    """
    if scrape_date is None:
        scrape_date = date.today().strftime('%Y-%m-%d')

    row = {}
    for key, qname in zip(keys, qnames):
        try:
            found = next_element.find('span', {'qname': qname})
            if not found:
                # Some fields are identified by their class rather than their qname.
                found = next_element.find('span', class_=qname)
            row[key] = found.get_text(strip=True) if found else None
        except AttributeError:
            row[key] = None

    row['date'] = scrape_date
    row['department'] = department_name
    row['importance'] = len(importance)

    new_row_df = pd.DataFrame(row, index=[0])
    return pd.concat([df, new_row_df], ignore_index=True)

In [7]:
data_folder = 'data'
# `keys` are the DataFrame column names; `qnames` are the matching `qname`
# attributes (or class names) searched for in the page. The two lists are
# paired by position.
keys = ['nature', 'horodate', 'axe', 'sens_cardinal', 'point_repere', 'longueur_pr', 'commune']
qnames = ['nature', 'ligne_horodate_fin_exception_ve', 'axe', 'sens_cardinal', 'pr', 'longueur', 'commune']
columns = ['date','department', 'importance'] + keys

# The raw export is the file that gets read back, so it is also the file whose
# existence must be tested (testing a different file either crashes the read
# or silently discards the saved history).
raw_csv_path = data_folder + '/bison_fute_bouchons_raw.csv'

os.makedirs(data_folder, exist_ok=True)

if os.path.exists(raw_csv_path):
    df = pd.read_csv(raw_csv_path)
else:
    # First run: start from an empty frame that already has every column.
    df = pd.DataFrame(columns=columns)

columns

['date',
 'department',
 'importance',
 'nature',
 'horodate',
 'axe',
 'sens_cardinal',
 'point_repere',
 'longueur_pr',
 'commune']

In [8]:
# Sample "interligne" row kept as an offline fixture for debugging the parser:
# BeautifulSoup(HTML_DOC, 'html.parser') can stand in for a live page element.
HTML_DOC = '<div class="interligne"><span qname="marqueurs" class="confidentialite-public"><span qname="ligne_importance_vr"> <span qname="importance_vr_reduit">**</span></span></span><span qname="element" class="confidentialite-public version-element"><span qname="ligne_nature_ve"> <span class="nature" qname="nature_bouchon">Bouchon</span><span qname="lg_bouchon"> de 0,7 km</span><span qname="nbr_voies"> toutes les voies</span></span><span qname="ligne_axe_sens_pr">, <span qname="axe"> M901</span><span qname="sens_cardinal"> (sens nord-sud</span><span qname="parenthese">)</span></span><span qname="ligne_axe_pr">, <span qname="longueur"> (sur 0,7 km environ)</span></span><span qname="ligne_commune">, <span qname="commune">à Toulouse</span></span><span qname="ligne_complementaire">, <span qname="toutes_voies">Sur toutes les voies</span></span></span><span qname="element" class="confidentialite-public version-element"> ; <span qname="ligne_horodates_element_ve"> <span class="separator-date" qname="horodates_constatation_reduit_ve">À 16h29</span></span><span qname="ligne_nature_ve">, <span class="nature" qname="nature_bouchon">Bouchon</span><span qname="lg_bouchon"> de 2,6 km</span><span qname="nbr_voies"> sur 0 voie</span></span><span qname="ligne_axe_sens_pr">, <span qname="axe"> A621</span><span qname="sens_par_pole">, de Toulouse vers Aéroport Blagnac</span><span qname="sens_cardinal"> (sens est-ouest</span><span qname="parenthese">)</span></span><span qname="ligne_axe_pr">, <span qname="pr">entre les PR 1+188 et 3+784</span><span qname="longueur"> (sur 2,6 km)</span></span><span qname="ligne_commune">, <span qname="commune">de Toulouse à Blagnac</span></span></span><span qname="origine" class="origine"> ; <span qname="ligne_origine_vr"> <span qname="origine_ve_crochet_debut"> [</span><span qname="origine_vr">Origine : DIRSO Toulouse Sagacité</span><span qname="origine_ve_crochet_fin">]</span></span></span></div>'

# Each department is introduced by a <span class="rupture"> header; its events
# are searched for among the siblings that follow that header.
departments = bsobj.find_all('span', class_='rupture')
initial_rows = df.shape[0]

# TODO: event timestamps + handling of several traffic jams reported on one line

for department in departments:
    # The name normally sits in a link inside the header; fall back to the
    # header's own text rather than crashing when the link is missing.
    link = department.find('a')
    department_name = (link or department).get_text(strip=True)

    next_element = department.find_next_sibling()
    # NOTE(review): the walk stops at the first sibling that is a <span> OR carries
    # the "rupture" class, not only at the next <span class="rupture"> header.
    # Confirm against the page structure that no other <span> can appear between
    # two department headers.
    while next_element and next_element.name != 'span' and 'rupture' not in next_element.get('class', []):
        # .get() avoids a KeyError on a <div> that has no class attribute.
        classes = next_element.get('class') or []
        if next_element.name == 'div' and classes and 'interligne' in classes[0]:
            # The importance marker belongs to the whole line, so it is read once
            # and shared by every event on that line. A line without a marker is
            # recorded with importance 0 (empty marker string).
            marker = next_element.find('span', {'qname': 'importance_vr_reduit'})
            importance = marker.get_text(strip=True) if marker else ''
            for element in next_element.find_all('span', {'qname': 'element'}):
                df = process_data(element, keys, qnames, importance, department_name, df)

        next_element = next_element.find_next_sibling()

df = df.drop_duplicates() # I don't want to add elements I already have in my df
print(f"Added {df.shape[0] - initial_rows} new rows")

Added 13 new rows


In [9]:
# Show the accumulated events: any rows read back from the CSV plus those scraped above.
df

Unnamed: 0,date,department,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr,commune
0,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens ouest-est,entre les PR 164+199 et 165+199,"(sur 1,0 km)",à Mougins
1,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 186+536 et 185+536,"(sur 1,0 km)",à Nice
2,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 190+629 et 189+129,"(sur 1,5 km)",à Nice
3,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 199+514 et 193+514,"(sur 6,0 km)",à Nice
4,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 209+903 et 207+403,"(sur 2,5 km)",de La Turbie à La Trinité
...,...,...,...,...,...,...,...,...,...,...
154,2024-04-25,Département 78 (Yvelines),2,Ralentissement,",prévu jusqu'à 20h",A13,(sens est-ouest,entre les PR 45+498 et 48+338,"(sur 2,9 km)",de Guerville à Mantes-la-Ville
155,2024-04-25,Département 78 (Yvelines),1,Bouchon,",prévu jusqu'à 20h",A13,(sens est-ouest,au PR 48+104,,à Mantes-la-Ville
156,2024-04-25,Département 78 (Yvelines),1,Bouchon,",prévu jusqu'à 20h",A14,(sens ouest-est,au PR 7+763,,à Carrières-sur-Seine
157,2024-04-25,Département 78 (Yvelines),1,Bouchon,",prévu jusqu'à 19h",A14,(sens est-ouest,entre les PR 7+469 et 7+769,"(sur 0,3 km)",à Carrières-sur-Seine


I opted for a general approach in the end
- `['horodate', 'axe', 'sens_cardinal', 'point_repere', 'longueur_pr', 'commune']` were the most common features
- If you really want to be precise and exploit every bit of data each class has, you can still write an if statement on `nature` like this:

```py
if next_element.name == 'div' and 'interligne' in next_element['class'][0]:
    importance = next_element.find('span', {'qname': 'importance_vr_reduit'}).get_text(strip=True)
    nature = next_element.find('span', class_='nature').get_text(strip=True)

    if nature.lower() == 'your_class_here': # 'accident' for example
        keys = [...] # the name you want to give the features in the dataframe
        qnames = [...] # the actual qnames in the html object for your specific case
        df = process_data(next_element, keys, qnames, importance, nature, department_name, df)
    else:
        df = process_data(next_element, keys, qnames, importance, nature, department_name, df)
```
- Bear in mind that you also need to update the cell above so that the dataframe has those keys as columns, here:
```py
keys = ['horodate', 'axe', 'sens_cardinal', 'point_repere', 'longueur_pr', 'commune']
qnames = ['ligne_horodate_fin_exception_ve', 'axe', 'sens_cardinal', 'pr', 'longueur', 'commune']
columns = ['date','department', 'importance', 'nature'] + keys # this should have every feature you added
df = pd.DataFrame(columns=columns)
```

In [10]:
# Print the (rows, columns) count, then display a three-row preview.
print(df.shape)
df.head(3)

(95, 10)


Unnamed: 0,date,department,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr,commune
0,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens ouest-est,entre les PR 164+199 et 165+199,"(sur 1,0 km)",à Mougins
1,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 186+536 et 185+536,"(sur 1,0 km)",à Nice
2,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,(sens est-ouest,entre les PR 190+629 et 189+129,"(sur 1,5 km)",à Nice


In [11]:
# Save the rows exactly as scraped, before any cleaning, inside the data folder.
raw_csv_target = data_folder + '/bison_fute_bouchons_raw.csv'
df.to_csv(raw_csv_target, index=False)

As you can see, we might want to do some cleaning, I'll leave some of the feature cleaning to do:
- for example, the feature `commune` can refer to a single commune, but also to a stretch between two communes. How do we handle that? The answer might depend on your use case, so I'll leave it to you.

### sens cardinal

In [12]:
# Fill missing entries with NaN, in place (presumably to normalise None to NaN — TODO confirm).
df.fillna(value=np.nan, inplace=True)
# List the distinct raw direction strings before cleaning them.
df['sens_cardinal'].unique()

array(['(sens ouest-est', '(sens est-ouest', '(sens sud-nord',
       '(sens nord-sud', '(sens intérieur', '(sens extérieur', nan],
      dtype=object)

In [13]:
# Remove a leading "(" and the word "sens" (only when followed by whitespace), then
# trim the result: "(sens ouest-est" becomes "ouest-est".
df['sens_cardinal'] = df['sens_cardinal'].str.replace(r'^\(|sens(?=\s)', '', regex=True).str.strip()
df['sens_cardinal'].unique()

array(['ouest-est', 'est-ouest', 'sud-nord', 'nord-sud', 'intérieur',
       'extérieur', nan], dtype=object)

### longueur point repère

In [14]:
# List the distinct raw length strings to see which decorations need stripping.
df['longueur_pr'].unique()

array(['(sur 1,0 km)', '(sur 1,5 km)', '(sur 6,0 km)', '(sur 2,5 km)',
       nan, '(sur 0,5 km)', '(sur 3,9 km)', '(sur 2,6 km)',
       '(sur 1,4 km)', '(sur 2,0 km)', '(sur 5,0 km)', '(sur 3,5 km)',
       '(sur 2,8 km)', '(sur 0,3 km)', '(sur 1,9 km)', '(sur 5,6 km)',
       '(sur 0,9 km)', '(sur 1,2 km)', '(sur 0,8 km)',
       '(sur 0,5 km environ)', '(sur 2,4 km environ)', '(sur 4,0 km)',
       '(sur 3,0 km)', '(sur 2,4 km)', '(sur 1,1 km)', '(sur 1,3 km)',
       '(sur 0,7 km)', '(sur 4,8 km)', '(sur 2,3 km)',
       '(sur 1,5 km environ)', '(sur 4,4 km)', '(sur 2,9 km)',
       '(sur 5,5 km)', '(sur 11,4 km)'], dtype=object)

In [15]:
# Reduce strings such as "(sur 2,4 km environ)" to "2.4".
# Every pattern is a literal, so regex=False is spelled out: "(sur " and " km)"
# are not valid regular expressions and raise an error wherever str.replace
# treats patterns as regexes by default.
df['longueur_pr'] = (
    df['longueur_pr']
    .str.replace(' environ', '', regex=False)
    .str.replace('(sur ', '', regex=False)
    .str.replace(' km)', '', regex=False)
    # An undetermined length becomes the text 'nan', handled by the numeric conversion.
    .str.replace('de longueur indéterminée', 'nan', regex=False)
    # French decimal comma -> decimal point.
    .str.replace(',', '.', regex=False)
)
df['longueur_pr'].unique()


array(['1.0', '1.5', '6.0', '2.5', nan, '0.5', '3.9', '2.6', '1.4', '2.0',
       '5.0', '3.5', '2.8', '0.3', '1.9', '5.6', '0.9', '1.2', '0.8',
       '2.4', '4.0', '3.0', '1.1', '1.3', '0.7', '4.8', '2.3', '4.4',
       '2.9', '5.5', '11.4'], dtype=object)

In [16]:
# Turn the cleaned length strings into floats; the placeholder text 'nan' becomes NaN.
df['longueur_pr'] = df['longueur_pr'].map(lambda raw: np.nan if raw == 'nan' else float(raw))
df['longueur_pr'].unique()


array([ 1. ,  1.5,  6. ,  2.5,  nan,  0.5,  3.9,  2.6,  1.4,  2. ,  5. ,
        3.5,  2.8,  0.3,  1.9,  5.6,  0.9,  1.2,  0.8,  2.4,  4. ,  3. ,
        1.1,  1.3,  0.7,  4.8,  2.3,  4.4,  2.9,  5.5, 11.4])

In [17]:
# Carry the unit in the column name, since the values themselves are now bare numbers.
df = df.rename(columns={'longueur_pr': 'longueur_pr (km)'})
df.head(3)

Unnamed: 0,date,department,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr (km),commune
0,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,ouest-est,entre les PR 164+199 et 165+199,1.0,à Mougins
1,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,est-ouest,entre les PR 186+536 et 185+536,1.0,à Nice
2,2024-04-25,Département 06 (Alpes-Maritimes),2,Bouchon,,A8,est-ouest,entre les PR 190+629 et 189+129,1.5,à Nice


In [18]:
# List the distinct department labels to see the format that has to be split.
df['department'].unique()

array(['Département 06 (Alpes-Maritimes)',
       'Département 13 (Bouches-du-Rhône)', 'Département 25 (Doubs)',
       'Département 26 (Drôme)', 'Département 28 (Eure-et-Loir)',
       'Département 29 (Finistère)', 'Département 31 (Haute-Garonne)',
       'Département 33 (Gironde)', 'Département 34 (Hérault)',
       'Département 35 (Ille-et-Vilaine)', 'Département 38 (Isère)',
       'Département 42 (Loire)', 'Département 44 (Loire-Atlantique)',
       'Département 64 (Pyrénées-Atlantiques)',
       'Département 66 (Pyrénées-Orientales)', 'Département 69 (Rhône)',
       'Département 73 (Savoie)', 'Département 74 (Haute-Savoie)',
       'Département 76 (Seine-Maritime)',
       'Département 77 (Seine-et-Marne)', 'Département 78 (Yvelines)',
       'Département 82 (Tarn-et-Garonne)', 'Département 83 (Var)',
       'Département 93 (Seine-Saint-Denis)'], dtype=object)

In [19]:
# Extract department number and name
df['department_number'] = df['department'].str.extract(r'Département (\d+)')
df['department_name'] = df['department'].str.extract(r'\(([^()]+)\)')

# Display the first few rows to verify the result
print(df[['department', 'department_number', 'department_name']].head())


                         department department_number  department_name
0  Département 06 (Alpes-Maritimes)                06  Alpes-Maritimes
1  Département 06 (Alpes-Maritimes)                06  Alpes-Maritimes
2  Département 06 (Alpes-Maritimes)                06  Alpes-Maritimes
3  Département 06 (Alpes-Maritimes)                06  Alpes-Maritimes
4  Département 06 (Alpes-Maritimes)                06  Alpes-Maritimes


In [20]:
# Check the last rows as well, now that the two department columns exist.
df.tail(3)

Unnamed: 0,date,department,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr (km),commune,department_number,department_name
156,2024-04-25,Département 78 (Yvelines),1,Bouchon,",prévu jusqu'à 20h",A14,ouest-est,au PR 7+763,,à Carrières-sur-Seine,78,Yvelines
157,2024-04-25,Département 78 (Yvelines),1,Bouchon,",prévu jusqu'à 19h",A14,est-ouest,entre les PR 7+469 et 7+769,0.3,à Carrières-sur-Seine,78,Yvelines
163,2024-04-25,Département 93 (Seine-Saint-Denis),3,Ralentissement,",prévu jusqu'à 19h",A4,est-ouest,entre les PR 24+36 et 12+728,11.4,de Bussy-Saint-Georges à Noisy-le-Grand,93,Seine-Saint-Denis


In [21]:
# Columns that should come first, in display order.
leading_columns = ['date', 'department_number', 'department_name']

# The combined label is redundant once it has been split into number and name.
df = df.drop(columns=['department'])
new_order = leading_columns + [name for name in df.columns if name not in leading_columns]
df = df[new_order]

df.head(3)

Unnamed: 0,date,department_number,department_name,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr (km),commune
0,2024-04-25,6,Alpes-Maritimes,2,Bouchon,,A8,ouest-est,entre les PR 164+199 et 165+199,1.0,à Mougins
1,2024-04-25,6,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 186+536 et 185+536,1.0,à Nice
2,2024-04-25,6,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 190+629 et 189+129,1.5,à Nice


In [22]:
# Write the cleaned table into the configured data folder, so that changing
# `data_folder` moves this file together with the raw export.
df.to_csv(data_folder + '/bison_fute_bouchons.csv', index=False)

In [23]:
# Order the rows by department code, in place.
# NOTE(review): this runs after the CSV export above, so the saved file keeps the
# unsorted order.
df.sort_values(by=['department_number'], inplace=True)
df

Unnamed: 0,date,department_number,department_name,importance,nature,horodate,axe,sens_cardinal,point_repere,longueur_pr (km),commune
0,2024-04-25,06,Alpes-Maritimes,2,Bouchon,,A8,ouest-est,entre les PR 164+199 et 165+199,1.0,à Mougins
1,2024-04-25,06,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 186+536 et 185+536,1.0,à Nice
2,2024-04-25,06,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 190+629 et 189+129,1.5,à Nice
3,2024-04-25,06,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 199+514 et 193+514,6.0,à Nice
4,2024-04-25,06,Alpes-Maritimes,2,Bouchon,,A8,est-ouest,entre les PR 209+903 et 207+403,2.5,de La Turbie à La Trinité
...,...,...,...,...,...,...,...,...,...,...,...
79,2024-04-25,83,Var,2,Bouchon,,A57,sud-nord,entre les PR 0 et 2+513,2.5,à Toulon
78,2024-04-25,83,Var,2,Bouchon,,A57,nord-sud,entre les PR 7+130 et 1+608,5.5,de La Garde à Toulon
77,2024-04-25,83,Var,2,Bouchon,,A50,ouest-est,entre les PR 67+116 et 68+555,1.5,à Toulon
81,2024-04-25,93,Seine-Saint-Denis,3,Ralentissement,,A4,est-ouest,entre les PR 24+36 et 12+728,11.4,de Bussy-Saint-Georges à Noisy-le-Grand


In [18]:
import pandas as pd

# Count accidents per day and per department in each yearly "caracteristiques"
# file, then write the combined counts to a single CSV.
yearly_counts = []
for year in range(2019, 2023):
    print(f"Processing data for year {year}")
    # Read the CSV into a pandas DataFrame. Department codes are identifiers rather
    # than numbers, so they are read as text: this keeps them exactly as written
    # in the file and prevents the same code from being grouped once as a number
    # and once as a string.
    df = pd.read_csv(f"files/caracteristiques-{year}.csv", sep=";", dtype={'dep': str})

    # Combine the "an", "mois" and "jour" columns into one datetime column.
    # Column-wise concatenation builds the same "year-month-day" strings as a
    # row-by-row join, without calling Python once per row.
    date_text = df['an'].astype(str) + '-' + df['mois'].astype(str) + '-' + df['jour'].astype(str)
    df['date_entree'] = pd.to_datetime(date_text)

    # Group by date and department, then count the accidents in each group.
    result = df.groupby(['date_entree', 'dep']).size().reset_index(name='nb_accidents')

    yearly_counts.append(result)

# One concatenation at the end instead of re-copying the growing frame every year.
all_years = pd.concat(yearly_counts)

# Save the results to a new CSV.
all_years.to_csv("nombre_accidents_par_date_par_departement.csv", index=False)


Processing data for year 2019
Processing data for year 2020
Processing data for year 2021
Processing data for year 2022
