In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By



# CONSTS

In [2]:
CHROME_DRIVER_PATH = "./chromedriver"
BRAVE_PATH = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
DOWNLOAD_DIRECTORY = "/Users/mamoudou/Developer/Python/ProjetTrutoré/"
ARCHIVE_DIRECTORY = "/Users/mamoudou/Developer/Python/ProjetTrutoré/archive"
URL = "https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32"

## Browser Configs

In [3]:
options = Options()
options.binary_location = BRAVE_PATH  

prefs = {
    "download.default_directory": DOWNLOAD_DIRECTORY,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True 
}
options.add_experimental_option("prefs", prefs)

In [4]:
service = Service(executable_path=CHROME_DRIVER_PATH)

driver = webdriver.Chrome(service=service, options=options)



driver.get(URL)
print("Page Title:", driver.title)
telecharger_button = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Télécharger']")
telecharger_button.click()

time.sleep(5)
driver.quit()

Page Title: Données Publiques de Météo-France - Données SYNOP essentielles OMM


## Fixing the file extension problem

In [5]:
from file_management import change_file_extension

cvs_file = change_file_extension(DOWNLOAD_DIRECTORY)

Le fichier a été renommé : /Users/mamoudou/Developer/Python/ProjetTrutoré/synop.2024122618.txt -> /Users/mamoudou/Developer/Python/ProjetTrutoré/synop.2024122618.csv


# Starting the ELT

In [6]:
import pandas as pd
import numpy as np
df = pd.read_csv(cvs_file, sep=";", dtype={"date": str})

print(df.to_string())

    numer_sta            date    pmer  tend cod_tend   dd    ff           t          td    u     vv  ww  w1  w2    n nbas  hbas  cl  cm  ch    pres niv_bar geop tend24        tn12        tn24        tx12        tx24     tminsol  sw  tw      raf10     rafper  per etat_sol  ht_neige    ssfrai perssfrai       rr1        rr3        rr6       rr12       rr24 phenspe1 phenspe2 phenspe3 phenspe4 nnuage1 ctype1 hnuage1 nnuage2 ctype2 hnuage2 nnuage3 ctype3 hnuage3 nnuage4 ctype4 hnuage4  Unnamed: 59
0        7005  20241226180000  103480     0        5  110   1.8  278.250000  277.850000   97  17940   0  mq  mq   mq   mq    mq  mq  mq  mq  102570      mq   mq    -40  278.250000          mq  280.550000          mq  279.050000  mq  mq   3.200000   4.400000  -10        0  0.000000        mq        mq  0.000000   0.000000   0.000000   0.000000   0.000000       mq       mq       mq       mq      mq     mq      mq      mq     mq      mq      mq     mq      mq      mq     mq      mq          NaN
1     

# Récupération des informations importantes à savoir :
1. Indicatif OMM station : numéro de station `numer_sta` -> `int`
2. Date (UTC) : `date` -> `datetime`
3. Pression au niveau mer : `pmer` -> `int`
4. Variation de pression en 3 heures : `tend` -> `int`
5. Type de tendance barométrique : `cod_tend` -> `int`
6. Direction du vent moyen 10 mn : `dd` -> `int`
7. Vitesse du vent moyen 10 mn : `ff` -> `float`
8. Température : `t` -> `float`
9. Point de rosée : `td` -> `float`
10. Humidité : `u` -> `int`
11. Visibilité horizontale : `vv` -> `float`
12. Temps présent : `ww` -> `int`
13. Nébulosité totale : `n` -> `float`
14. Nébulosité des nuages de l'étage inférieur : `nbas` -> `int`
15. Hauteur de la base des nuages de l'étage inférieur : `hbas` -> `int`
16. Pression station : `pres` -> `int`
17. Variation de pression en 24 heures : `tend24` -> `int`
18. Température minimale sur N heures : `tn12` -> `float`
19. Température maximale sur N heures : `tx12` -> `float`
20. Température minimale du sol sur 12 heures : `tminsol` -> `float`
21. Rafales sur les 10 dernières minutes : `raf10` -> `float`
22. Rafales sur une période : `rafper` -> `float`
23. Période de mesure des rafales : `per` -> `float`
24. Précipitations dans les N dernières heures : `rr12` -> `float`

In [7]:
df = df[['numer_sta', 'date', "pmer", "tend", "cod_tend", "dd", "ff", "t", "td", "u", "vv", "ww", "n", "nbas", "hbas","pres", "tend24", "tn12", "tx12", "tminsol", "raf10", "rafper", "per", "rr12" ]]


print(df.to_string())


    numer_sta            date    pmer  tend cod_tend   dd    ff           t          td    u     vv  ww    n nbas  hbas    pres tend24        tn12        tx12     tminsol      raf10     rafper  per       rr12
0        7005  20241226180000  103480     0        5  110   1.8  278.250000  277.850000   97  17940   0   mq   mq    mq  102570    -40  278.250000  280.550000  279.050000   3.200000   4.400000  -10   0.000000
1        7015  20241226180000  103530   -10        6   20   2.1  277.350000  277.250000   99   3460  10   mq    0    mq  102930    -10  277.250000  280.350000  274.850000   3.100000   3.100000  -10   0.000000
2        7020  20241226180000  103370    60        3  100   2.5  281.050000  281.050000  100     mq  mq   mq   mq    mq  103260   -100  279.650000  281.750000          mq   4.600000   6.000000  -10   0.000000
3        7027  20241226180000  103380    20        3   90   2.5  280.450000  280.350000   99   1990  10  101    9    30  102540   -150  279.350000  281.150000  281.

## Remplacer les valeurs manquantes `mq` par `NaN`

In [8]:
df.replace("mq", np.nan, inplace=True)
print(df.to_string())

    numer_sta            date    pmer  tend cod_tend   dd    ff           t          td    u     vv   ww    n nbas  hbas    pres tend24        tn12        tx12     tminsol      raf10     rafper  per       rr12
0        7005  20241226180000  103480     0        5  110   1.8  278.250000  277.850000   97  17940    0  NaN  NaN   NaN  102570    -40  278.250000  280.550000  279.050000   3.200000   4.400000  -10   0.000000
1        7015  20241226180000  103530   -10        6   20   2.1  277.350000  277.250000   99   3460   10  NaN    0   NaN  102930    -10  277.250000  280.350000  274.850000   3.100000   3.100000  -10   0.000000
2        7020  20241226180000  103370    60        3  100   2.5  281.050000  281.050000  100    NaN  NaN  NaN  NaN   NaN  103260   -100  279.650000  281.750000         NaN   4.600000   6.000000  -10   0.000000
3        7027  20241226180000  103380    20        3   90   2.5  280.450000  280.350000   99   1990   10  101    9    30  102540   -150  279.350000  281.150000 

## Nettoyages de colonnes non renseignées 

1. Informations générales
	-	Numéro de station : `numer_sta` -> int
	-	Date (UTC) : `date` -> datetime
2. Pression
	-	Pression au niveau mer : `pmer` -> int
	-	Variation de pression en 3 heures : `tend` -> int
	-	Type de tendance barométrique : `cod_tend` -> int
	-	Pression station : `pres` -> int
	-	Variation de pression en 24 heures : `tend24` -> int
3. Vent
	-	Direction du vent moyen 10 mn : `dd` -> int
	-	Vitesse du vent moyen 10 mn : `ff` -> float
	-	Rafales sur les 10 dernières minutes : `raf10` -> float
	-	Rafales sur une période : `rafper` -> float
	-	Période de mesure des rafales : `per` -> float
4. Température
	-	Température : `t` -> float
	-	Point de rosée : `td` -> float
	-	Température minimale sur N heures : `tn12` -> float
	-	Température maximale sur N heures : `tx12` -> float
	-	Température minimale du sol sur 12 heures : `tminsol` -> float
5. Humidité
	-	Humidité : `u` -> int
6. Visibilité
	-	Visibilité horizontale : `vv` -> float
7. Nébulosité et nuages
	-	Nébulosité totale : `n` -> float
	-	Nébulosité des nuages de l’étage inférieur : `nbas` -> int
	-	Hauteur de la base des nuages de l’étage inférieur : `hbas` -> int
8. Précipitations
	-	Précipitations dans les N dernières heures : `rr12` -> float
9. Temps présent
	-	Temps présent : `ww` -> int

In [9]:
df[['dd', 'ff', 'raf10', 'rafper', 'rr12', "per"]] = df[['dd', 'ff', 'raf10', 'rafper', 'rr12', "per"]].fillna(0)

moyenne = ["pmer", "tend", "cod_tend", "pres", "tend24", "t", "td", "tn12", "tx12", "tminsol", "u", "vv", "n", "nbas", "hbas", "ww"]
df[moyenne] = df[moyenne].apply(pd.to_numeric, errors='coerce')

for col in moyenne:
    df[col] = df[col].fillna(df[col].mean())

print(df.to_string())

    numer_sta            date           pmer        tend  cod_tend   dd    ff           t          td           u            vv         ww           n      nbas    hbas           pres      tend24        tn12       tx12     tminsol      raf10     rafper  per       rr12
0        7005  20241226180000  103480.000000    0.000000  5.000000  110   1.8  278.250000  277.850000   97.000000  17940.000000   0.000000   82.631579  3.461538   693.0  102570.000000  -40.000000  278.250000  280.55000  279.050000   3.200000   4.400000  -10   0.000000
1        7015  20241226180000  103530.000000  -10.000000  6.000000   20   2.1  277.350000  277.250000   99.000000   3460.000000  10.000000   82.631579  0.000000   693.0  102930.000000  -10.000000  277.250000  280.35000  274.850000   3.100000   3.100000  -10   0.000000
2        7020  20241226180000  103370.000000   60.000000  3.000000  100   2.5  281.050000  281.050000  100.000000  19945.681818   5.302326   82.631579  3.461538   693.0  103260.000000 -100.0000

## Typages des colonnes

In [10]:
df["date"] = pd.to_datetime(df["date"], format='%Y%m%d%H%M%S')

entier = ["numer_sta", "pmer", "tend", "cod_tend", "dd", "u", "ww", "nbas", "hbas", "pres", "tend24"]
for i in entier:
    print(f"{i} -> {df[col].dtype}")
    df[entier] = df[entier].astype(int)
    print(f"{i} -> {df[col].dtype}")
    print("")

double = ['dd', 'ff', 'raf10', 'rafper', 'rr12', "per", "t", "td", "tn12", "tx12", "tminsol", "n", "vv"]
for i in double:
    print(f"{i} -> {df[col].dtype}")
    df[double] = df[double].astype(float)
    print(f"{i} -> {df[col].dtype}")
    print("")

numer_sta -> float64
numer_sta -> int64

pmer -> int64
pmer -> int64

tend -> int64
tend -> int64

cod_tend -> int64
cod_tend -> int64

dd -> int64
dd -> int64

u -> int64
u -> int64

ww -> int64
ww -> int64

nbas -> int64
nbas -> int64

hbas -> int64
hbas -> int64

pres -> int64
pres -> int64

tend24 -> int64
tend24 -> int64

dd -> int64
dd -> int64

ff -> int64
ff -> int64

raf10 -> int64
raf10 -> int64

rafper -> int64
rafper -> int64

rr12 -> int64
rr12 -> int64

per -> int64
per -> int64

t -> int64
t -> int64

td -> int64
td -> int64

tn12 -> int64
tn12 -> int64

tx12 -> int64
tx12 -> int64

tminsol -> int64
tminsol -> int64

n -> int64
n -> int64

vv -> int64
vv -> int64



In [11]:
print(df.to_string())

    numer_sta                date    pmer  tend  cod_tend     dd    ff           t          td    u            vv  ww           n  nbas  hbas    pres  tend24        tn12       tx12     tminsol  raf10  rafper   per  rr12
0        7005 2024-12-26 18:00:00  103480     0         5  110.0   1.8  278.250000  277.850000   97  17940.000000   0   82.631579     3   693  102570     -40  278.250000  280.55000  279.050000    3.2     4.4 -10.0   0.0
1        7015 2024-12-26 18:00:00  103530   -10         6   20.0   2.1  277.350000  277.250000   99   3460.000000  10   82.631579     0   693  102930     -10  277.250000  280.35000  274.850000    3.1     3.1 -10.0   0.0
2        7020 2024-12-26 18:00:00  103370    60         3  100.0   2.5  281.050000  281.050000  100  19945.681818   5   82.631579     3   693  103260    -100  279.650000  281.75000  277.444444    4.6     6.0 -10.0   0.0
3        7027 2024-12-26 18:00:00  103380    20         3   90.0   2.5  280.450000  280.350000   99   1990.000000  10  1

## Conversion des températures de Kelvin en Celcuis

In [12]:
celcuis = ["t", "td", "tn12", "tx12", "tminsol"]
for i in celcuis:
    df[i] = df[i] - 273.15

print(df.to_string())

    numer_sta                date    pmer  tend  cod_tend     dd    ff          t         td    u            vv  ww           n  nbas  hbas    pres  tend24       tn12      tx12    tminsol  raf10  rafper   per  rr12
0        7005 2024-12-26 18:00:00  103480     0         5  110.0   1.8   5.100000   4.700000   97  17940.000000   0   82.631579     3   693  102570     -40   5.100000   7.40000   5.900000    3.2     4.4 -10.0   0.0
1        7015 2024-12-26 18:00:00  103530   -10         6   20.0   2.1   4.200000   4.100000   99   3460.000000  10   82.631579     0   693  102930     -10   4.100000   7.20000   1.700000    3.1     3.1 -10.0   0.0
2        7020 2024-12-26 18:00:00  103370    60         3  100.0   2.5   7.900000   7.900000  100  19945.681818   5   82.631579     3   693  103260    -100   6.500000   8.60000   4.294444    4.6     6.0 -10.0   0.0
3        7027 2024-12-26 18:00:00  103380    20         3   90.0   2.5   7.300000   7.200000   99   1990.000000  10  101.000000     9    30 

## Arrondir tous les decimaux à 2chiffres apes la virgules

In [13]:
df[double] = df[double].round(2)
print(df.to_string())

    numer_sta                date    pmer  tend  cod_tend     dd    ff      t     td    u        vv  ww       n  nbas  hbas    pres  tend24   tn12   tx12  tminsol  raf10  rafper   per  rr12
0        7005 2024-12-26 18:00:00  103480     0         5  110.0   1.8   5.10   4.70   97  17940.00   0   82.63     3   693  102570     -40   5.10   7.40     5.90    3.2     4.4 -10.0   0.0
1        7015 2024-12-26 18:00:00  103530   -10         6   20.0   2.1   4.20   4.10   99   3460.00  10   82.63     0   693  102930     -10   4.10   7.20     1.70    3.1     3.1 -10.0   0.0
2        7020 2024-12-26 18:00:00  103370    60         3  100.0   2.5   7.90   7.90  100  19945.68   5   82.63     3   693  103260    -100   6.50   8.60     4.29    4.6     6.0 -10.0   0.0
3        7027 2024-12-26 18:00:00  103380    20         3   90.0   2.5   7.30   7.20   99   1990.00  10  101.00     9    30  102540    -150   6.20   8.00     8.30    3.7     4.1 -10.0   0.0
4        7037 2024-12-26 18:00:00  103450    30   

# Archive file

In [14]:
from file_management import move_file

move_file(cvs_file, ARCHIVE_DIRECTORY)

Le fichier a été déplacé de /Users/mamoudou/Developer/Python/ProjetTrutoré/synop.2024122618.csv à /Users/mamoudou/Developer/Python/ProjetTrutoré/archive/synop.2024122618.csv
