In [19]:
import time
import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# CONSTS

In [20]:
CHROME_DRIVER_PATH = "./chromedriver-linux/chromedriver"
BRAVE_PATH = "/usr/bin/brave-browser"
DOWNLOAD_DIRECTORY = "/home/mamoudou/dossier_partage/donnees-meteo/download"
ARCHIVE_DIRECTORY = "/home/mamoudou/dossier_partage/donnees-meteo/archive/daily"

In [None]:
now = datetime.datetime.now()

formatted_date = now.strftime("%Y%m%d %H").split(" ")

DATE, HEURE = formatted_date[0], "{:02d}".format(int(formatted_date[1])-2)
print(f"Voici la date du jour recuperer: {formatted_date}.\nAprès répartiton, on a une date: {DATE} et une heure: {HEURE}")

URL = f"https://donneespubliques.meteofrance.fr/?fond=donnee_libre&prefixe=Txt%2FSynop%2Fsynop&extension=csv&date={DATE}&reseau={HEURE}"

print(f"L'URL formée après, devient: {URL}")

Voici la date du jour recuperer: ['20250201', '14'].
Après répartiton, on a une date: 20250201 et une heure: 12
L'URL formée après, devient: https://donneespubliques.meteofrance.fr/?fond=donnee_libre&prefixe=Txt%2FSynop%2Fsynop&extension=csv&date=20250201&reseau=12


## Browser Configs

In [22]:
options = Options()
options.binary_location = BRAVE_PATH  
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--nosandbox")
options.add_argument("--disable-dev-shm-usage")

prefs = {
    "download.default_directory": DOWNLOAD_DIRECTORY,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True 
}
options.add_experimental_option("prefs", prefs)

In [23]:
service = Service(executable_path=CHROME_DRIVER_PATH)

driver = webdriver.Chrome(service=service, options=options)

try:
    print(f"Téléchargement pour la date : {DATE}")
    driver.get(URL)
    time.sleep(5)
    print(f"Téléchargement terminé !!!")

except Exception as e:
    print(f"Erreur lors du téléchargement de {DATE}: {e}")

driver.close()
driver.quit()

Téléchargement pour la date : 20250201
Téléchargement terminé !!!


## Fixing the file extension problem

In [24]:
from file_management import change_file_extension

cvs_file = change_file_extension(DOWNLOAD_DIRECTORY)
print(f"The file I'm working with: {cvs_file}")

Le fichier a été renommé : /home/mamoudou/dossier_partage/donnees-meteo/download/synop.2025020112.txt -> /home/mamoudou/dossier_partage/donnees-meteo/download/synop.2025020112.csv
The file I'm working with: /home/mamoudou/dossier_partage/donnees-meteo/download/synop.2025020112.csv


# Starting the ELT

In [25]:
import pandas as pd
import numpy as np
df = pd.read_csv(cvs_file, sep=";", dtype={"date": str})

print(df.to_string())

    numer_sta            date    pmer  tend cod_tend   dd         ff           t          td    u     vv  ww  w1  w2    n nbas  hbas  cl  cm  ch    pres niv_bar geop tend24        tn12        tn24 tx12        tx24     tminsol  sw  tw      raf10     rafper  per etat_sol  ht_neige    ssfrai perssfrai        rr1        rr3        rr6       rr12       rr24 phenspe1 phenspe2 phenspe3 phenspe4 nnuage1 ctype1 hnuage1 nnuage2 ctype2 hnuage2 nnuage3 ctype3 hnuage3 nnuage4 ctype4 hnuage4  Unnamed: 59
0        7005  20250201120000  103070  -100        8  190   0.700000  275.950000  275.950000  100   1710  10  mq  mq   mq   mq    mq  mq  mq  mq  102160      mq   mq    410          mq          mq   mq          mq  273.450000  mq  mq   2.300000   2.900000  -10       mq  0.000000        mq        mq   0.000000   0.000000   0.000000   0.000000   3.800000       mq       mq       mq       mq      mq     mq      mq      mq     mq      mq      mq     mq      mq      mq     mq      mq          NaN
1       

# Récupération des informations importantes à savoir :
1. Indicatif OMM station : numéro de station `numer_sta` -> `int`
2. Date (UTC) : `date` -> `datetime`
3. Pression au niveau mer : `pmer` -> `int`
4. Variation de pression en 3 heures : `tend` -> `int`
5. Type de tendance barométrique : `cod_tend` -> `int`
6. Direction du vent moyen 10 mn : `dd` -> `int`
7. Vitesse du vent moyen 10 mn : `ff` -> `float`
8. Température : `t` -> `float`
9. Point de rosée : `td` -> `float`
10. Humidité : `u` -> `int`
11. Visibilité horizontale : `vv` -> `float`
12. Temps présent : `ww` -> `int`
13. Nébulosité totale : `n` -> `float`
14. Nébulosité des nuages de l'étage inférieur : `nbas` -> `int`
15. Hauteur de la base des nuages de l'étage inférieur : `hbas` -> `int`
16. Pression station : `pres` -> `int`
17. Variation de pression en 24 heures : `tend24` -> `int`
18. Température minimale sur N heures : `tn12` -> `float`
19. Température maximale sur N heures : `tx12` -> `float`
20. Température minimale du sol sur 12 heures : `tminsol` -> `float`
21. Rafales sur les 10 dernières minutes : `raf10` -> `float`
22. Rafales sur une période : `rafper` -> `float`
23. Période de mesure des rafales : `per` -> `float`
24. Précipitations dans les N dernières heures : `rr12` -> `float`

In [26]:
df = df[['numer_sta', 'date', "pmer", "tend", "cod_tend", "dd", "ff", "t", "td", "u", "vv", "ww", "n", "nbas", "hbas","pres", "tend24", "tn12", "tx12", "tminsol", "raf10", "rafper", "per", "rr12" ]]


print(df.to_string())


    numer_sta            date    pmer  tend cod_tend   dd         ff           t          td    u     vv  ww    n nbas  hbas    pres tend24        tn12 tx12     tminsol      raf10     rafper  per       rr12
0        7005  20250201120000  103070  -100        8  190   0.700000  275.950000  275.950000  100   1710  10   mq   mq    mq  102160    410          mq   mq  273.450000   2.300000   2.900000  -10   0.000000
1        7015  20250201120000  103110  -110        8  140   2.100000  279.850000  274.350000   68   8670   0   mq    0    mq  102520    380          mq   mq  271.550000   3.500000   3.600000  -10   0.200000
2        7020  20250201120000  102830   -30        8  180   5.100000  280.750000  279.250000   90  11000   2  100    8   450  102720    430          mq   mq          mq   7.200000   7.600000  -10   0.400000
3        7027  20250201120000  102970   -10        8  150   4.600000  277.250000  275.450000   88  29760   0  100    8   450  102130    490          mq   mq  278.250000   6

## Remplacer les valeurs manquantes `mq` par `NaN`

In [27]:
df.replace("mq", np.nan, inplace=True)
print(df.to_string())

    numer_sta            date    pmer  tend cod_tend   dd         ff           t          td    u     vv   ww    n nbas  hbas    pres tend24        tn12  tx12     tminsol      raf10     rafper  per       rr12
0        7005  20250201120000  103070  -100        8  190   0.700000  275.950000  275.950000  100   1710   10  NaN  NaN   NaN  102160    410         NaN   NaN  273.450000   2.300000   2.900000  -10   0.000000
1        7015  20250201120000  103110  -110        8  140   2.100000  279.850000  274.350000   68   8670    0  NaN    0   NaN  102520    380         NaN   NaN  271.550000   3.500000   3.600000  -10   0.200000
2        7020  20250201120000  102830   -30        8  180   5.100000  280.750000  279.250000   90  11000    2  100    8   450  102720    430         NaN   NaN         NaN   7.200000   7.600000  -10   0.400000
3        7027  20250201120000  102970   -10        8  150   4.600000  277.250000  275.450000   88  29760    0  100    8   450  102130    490         NaN   NaN  278.

  df.replace("mq", np.nan, inplace=True)


## Nettoyages de colonnes non renseignées 

1. Informations générales
	-	Numéro de station : `numer_sta` -> int
	-	Date (UTC) : `date` -> datetime
2. Pression
	-	Pression au niveau mer : `pmer` -> int
	-	Variation de pression en 3 heures : `tend` -> int
	-	Type de tendance barométrique : `cod_tend` -> int
	-	Pression station : `pres` -> int
	-	Variation de pression en 24 heures : `tend24` -> int
3. Vent
	-	Direction du vent moyen 10 mn : `dd` -> int
	-	Vitesse du vent moyen 10 mn : `ff` -> float
	-	Rafales sur les 10 dernières minutes : `raf10` -> float
	-	Rafales sur une période : `rafper` -> float
	-	Période de mesure des rafales : `per` -> float
4. Température
	-	Température : `t` -> float
	-	Point de rosée : `td` -> float
	-	Température minimale sur N heures : `tn12` -> float
	-	Température maximale sur N heures : `tx12` -> float
	-	Température minimale du sol sur 12 heures : `tminsol` -> float
5. Humidité
	-	Humidité : `u` -> int
6. Visibilité
	-	Visibilité horizontale : `vv` -> float
7. Nébulosité et nuages
	-	Nébulosité totale : `n` -> float
	-	Nébulosité des nuages de l’étage inférieur : `nbas` -> int
	-	Hauteur de la base des nuages de l’étage inférieur : `hbas` -> int
8. Précipitations
	-	Précipitations dans les N dernières heures : `rr12` -> float
9. Temps présent
	-	Temps présent : `ww` -> int

In [28]:
df[['dd', 'ff', 'raf10', 'rafper', 'rr12', "per"]] = df[['dd', 'ff', 'raf10', 'rafper', 'rr12', "per"]].fillna(0)

moyenne = ["pmer", "tend", "cod_tend", "pres", "tend24", "t", "td", "tn12", "tx12", "tminsol", "u", "vv", "n", "nbas", "hbas", "ww"]
df[moyenne] = df[moyenne].apply(pd.to_numeric, errors='coerce')

for col in moyenne:
    df[col] = df[col].fillna(df[col].mean())

print(df.to_string())



    numer_sta            date           pmer        tend  cod_tend   dd         ff           t          td           u            vv         ww          n      nbas    hbas           pres       tend24    tn12  tx12     tminsol      raf10     rafper  per       rr12
0        7005  20250201120000  103070.000000 -100.000000  8.000000  190   0.700000  275.950000  275.950000  100.000000   1710.000000  10.000000   86.73913  4.131579   820.0  102160.000000   410.000000  296.65   NaN  273.450000   2.300000   2.900000  -10   0.000000
1        7015  20250201120000  103110.000000 -110.000000  8.000000  140   2.100000  279.850000  274.350000   68.000000   8670.000000   0.000000   86.73913  0.000000   820.0  102520.000000   380.000000  296.65   NaN  271.550000   3.500000   3.600000  -10   0.200000
2        7020  20250201120000  102830.000000  -30.000000  8.000000  180   5.100000  280.750000  279.250000   90.000000  11000.000000   2.000000  100.00000  8.000000   450.0  102720.000000   430.000000  296

## Typages des colonnes

In [29]:
df["date"] = pd.to_datetime(df["date"], format='%Y%m%d%H%M%S')
df['date'] = df['date'].dt.strftime("%Y-%m-%d %H:%M:%S")

entier = ["numer_sta", "pmer", "tend", "cod_tend", "dd", "u", "ww", "nbas", "hbas", "pres", "tend24"]
for i in entier:
    print(f"{i} -> {df[i].dtype}")
    df[i] = df[i].astype(int)
    print(f"{i} -> {df[i].dtype}")
    print("")

double = ['dd', 'ff', 'raf10', 'rafper', 'rr12', "per", "t", "td", "tn12", "tx12", "tminsol", "n", "vv"]
for i in double:
    print(f"{i} -> {df[i].dtype}")
    df[i] = df[i].astype(float)
    print(f"{i} -> {df[i].dtype}")
    print("")

numer_sta -> int64
numer_sta -> int64

pmer -> float64
pmer -> int64

tend -> float64
tend -> int64

cod_tend -> float64
cod_tend -> int64

dd -> object
dd -> int64

u -> float64
u -> int64

ww -> float64
ww -> int64

nbas -> float64
nbas -> int64

hbas -> float64
hbas -> int64

pres -> float64
pres -> int64

tend24 -> float64
tend24 -> int64

dd -> int64
dd -> float64

ff -> object
ff -> float64

raf10 -> object
raf10 -> float64

rafper -> object
rafper -> float64

rr12 -> object
rr12 -> float64

per -> object
per -> float64

t -> float64
t -> float64

td -> float64
td -> float64

tn12 -> float64
tn12 -> float64

tx12 -> float64
tx12 -> float64

tminsol -> float64
tminsol -> float64

n -> float64
n -> float64

vv -> float64
vv -> float64



In [30]:
print(df.to_string())

    numer_sta                 date    pmer  tend  cod_tend     dd    ff           t          td    u            vv  ww          n  nbas  hbas    pres  tend24    tn12  tx12     tminsol  raf10  rafper   per  rr12
0        7005  2025-02-01 12:00:00  103070  -100         8  190.0   0.7  275.950000  275.950000  100   1710.000000  10   86.73913     4   820  102160     410  296.65   NaN  273.450000    2.3     2.9 -10.0   0.0
1        7015  2025-02-01 12:00:00  103110  -110         8  140.0   2.1  279.850000  274.350000   68   8670.000000   0   86.73913     0   820  102520     380  296.65   NaN  271.550000    3.5     3.6 -10.0   0.2
2        7020  2025-02-01 12:00:00  102830   -30         8  180.0   5.1  280.750000  279.250000   90  11000.000000   2  100.00000     8   450  102720     430  296.65   NaN  276.202778    7.2     7.6 -10.0   0.4
3        7027  2025-02-01 12:00:00  102970   -10         8  150.0   4.6  277.250000  275.450000   88  29760.000000   0  100.00000     8   450  102130     49

## Conversion des températures de Kelvin en Celcuis

In [31]:
celcuis = ["t", "td", "tn12", "tx12", "tminsol"]
for i in celcuis:
    df[i] = df[i] - 273.15

print(df.to_string())

    numer_sta                 date    pmer  tend  cod_tend     dd    ff          t         td    u            vv  ww          n  nbas  hbas    pres  tend24  tn12  tx12    tminsol  raf10  rafper   per  rr12
0        7005  2025-02-01 12:00:00  103070  -100         8  190.0   0.7   2.800000   2.800000  100   1710.000000  10   86.73913     4   820  102160     410  23.5   NaN   0.300000    2.3     2.9 -10.0   0.0
1        7015  2025-02-01 12:00:00  103110  -110         8  140.0   2.1   6.700000   1.200000   68   8670.000000   0   86.73913     0   820  102520     380  23.5   NaN  -1.600000    3.5     3.6 -10.0   0.2
2        7020  2025-02-01 12:00:00  102830   -30         8  180.0   5.1   7.600000   6.100000   90  11000.000000   2  100.00000     8   450  102720     430  23.5   NaN   3.052778    7.2     7.6 -10.0   0.4
3        7027  2025-02-01 12:00:00  102970   -10         8  150.0   4.6   4.100000   2.300000   88  29760.000000   0  100.00000     8   450  102130     490  23.5   NaN   5.1000

## Arrondir tous les decimaux à 2chiffres après la virgules

In [32]:
df[double] = df[double].round(2)
print(df.to_string())

    numer_sta                 date    pmer  tend  cod_tend     dd    ff      t     td    u       vv  ww       n  nbas  hbas    pres  tend24  tn12  tx12  tminsol  raf10  rafper   per  rr12
0        7005  2025-02-01 12:00:00  103070  -100         8  190.0   0.7   2.80   2.80  100   1710.0  10   86.74     4   820  102160     410  23.5   NaN     0.30    2.3     2.9 -10.0   0.0
1        7015  2025-02-01 12:00:00  103110  -110         8  140.0   2.1   6.70   1.20   68   8670.0   0   86.74     0   820  102520     380  23.5   NaN    -1.60    3.5     3.6 -10.0   0.2
2        7020  2025-02-01 12:00:00  102830   -30         8  180.0   5.1   7.60   6.10   90  11000.0   2  100.00     8   450  102720     430  23.5   NaN     3.05    7.2     7.6 -10.0   0.4
3        7027  2025-02-01 12:00:00  102970   -10         8  150.0   4.6   4.10   2.30   88  29760.0   0  100.00     8   450  102130     490  23.5   NaN     5.10    6.5     6.5 -10.0   0.0
4        7037  2025-02-01 12:00:00  103050   -80         8  

# Add verif to check is all values

In [33]:
verif = ["pmer", "tend", "cod_tend", "pres", "tend24", "t", "td", "tn12", "tx12", "tminsol", "u", "vv", "n", "nbas", "hbas", "ww"]

for col in verif:
    if df[col].isna().all() :
        df[col].fillna(0.0,inplace=True)

print(df.to_string())

    numer_sta                 date    pmer  tend  cod_tend     dd    ff      t     td    u       vv  ww       n  nbas  hbas    pres  tend24  tn12  tx12  tminsol  raf10  rafper   per  rr12
0        7005  2025-02-01 12:00:00  103070  -100         8  190.0   0.7   2.80   2.80  100   1710.0  10   86.74     4   820  102160     410  23.5   0.0     0.30    2.3     2.9 -10.0   0.0
1        7015  2025-02-01 12:00:00  103110  -110         8  140.0   2.1   6.70   1.20   68   8670.0   0   86.74     0   820  102520     380  23.5   0.0    -1.60    3.5     3.6 -10.0   0.2
2        7020  2025-02-01 12:00:00  102830   -30         8  180.0   5.1   7.60   6.10   90  11000.0   2  100.00     8   450  102720     430  23.5   0.0     3.05    7.2     7.6 -10.0   0.4
3        7027  2025-02-01 12:00:00  102970   -10         8  150.0   4.6   4.10   2.30   88  29760.0   0  100.00     8   450  102130     490  23.5   0.0     5.10    6.5     6.5 -10.0   0.0
4        7037  2025-02-01 12:00:00  103050   -80         8  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0.0,inplace=True)


# Archive file

In [34]:
from file_management import move_file

move_file(cvs_file, ARCHIVE_DIRECTORY)

Le fichier a été déplacé de /home/mamoudou/dossier_partage/donnees-meteo/download/synop.2025020112.csv à /home/mamoudou/dossier_partage/donnees-meteo/archive/daily/synop.2025020112.csv


# Sending data to API

In [35]:
import requests
from api import send_data

In [36]:
API_URL = "http://127.0.0.1:8080/api/donnees-climatiques" 

dict_data = df.to_dict(orient="records")

for row in dict_data:
    print(row)
    send_data(API_URL, row, requests)

{'numer_sta': 7005, 'date': '2025-02-01 12:00:00', 'pmer': 103070, 'tend': -100, 'cod_tend': 8, 'dd': 190.0, 'ff': 0.7, 't': 2.8, 'td': 2.8, 'u': 100, 'vv': 1710.0, 'ww': 10, 'n': 86.74, 'nbas': 4, 'hbas': 820, 'pres': 102160, 'tend24': 410, 'tn12': 23.5, 'tx12': 0.0, 'tminsol': 0.3, 'raf10': 2.3, 'rafper': 2.9, 'per': -10.0, 'rr12': 0.0}
Données ajoutées avec succès!
{'numer_sta': 7015, 'date': '2025-02-01 12:00:00', 'pmer': 103110, 'tend': -110, 'cod_tend': 8, 'dd': 140.0, 'ff': 2.1, 't': 6.7, 'td': 1.2, 'u': 68, 'vv': 8670.0, 'ww': 0, 'n': 86.74, 'nbas': 0, 'hbas': 820, 'pres': 102520, 'tend24': 380, 'tn12': 23.5, 'tx12': 0.0, 'tminsol': -1.6, 'raf10': 3.5, 'rafper': 3.6, 'per': -10.0, 'rr12': 0.2}
Données ajoutées avec succès!
{'numer_sta': 7020, 'date': '2025-02-01 12:00:00', 'pmer': 102830, 'tend': -30, 'cod_tend': 8, 'dd': 180.0, 'ff': 5.1, 't': 7.6, 'td': 6.1, 'u': 90, 'vv': 11000.0, 'ww': 2, 'n': 100.0, 'nbas': 8, 'hbas': 450, 'pres': 102720, 'tend24': 430, 'tn12': 23.5, 'tx12