## Meteorology analysis
### Author : Nicolas Lacroix (https://github.com/NicolasLacroix)

#### Analyzed data will be used to predict the Sud region's consumption

Licence : [Apache License 2.0]

Data provided by [OpenDataSoft] and [Météo France]

Source file link (csv) : https://public.opendatasoft.com/explore/dataset/donnees-synop-essentielles-omm/download/?format=csv&refine.nom_reg=Provence-Alpes-C%C3%B4te+d%27Azur&timezone=Europe/Berlin&use_labels_for_header=true&csv_separator=%3B

[OpenDataSoft]: https://public.opendatasoft.com/
[Météo France]: https://donneespubliques.meteofrance.fr/

[Apache License 2.0]: https://github.com/NicolasLacroix/data-representation/blob/master/LICENSE

In [25]:
# TODO: add the number of inhabitants to the datasets
#%run MyOtherNotebook.ipynb
import pandas as pd
from datetime import datetime, date

In [26]:
# TODO: read https://public.opendatasoft.com/explore/dataset/donnees-synop-essentielles-omm/api/
# Load the observations for the SUD region from the local CSV copy.
# The same file can be fetched remotely by using this link instead:
# data_link = "https://public.opendatasoft.com/explore/dataset/donnees-synop-essentielles-omm/download/?format=csv&refine.nom_reg=Provence-Alpes-C%C3%B4te+d%27Azur&timezone=Europe/Berlin&use_labels_for_header=true&csv_separator=%3B"
data_link = "../datasets/meteorologie/donnees-synop-essentielles-omm-paca.csv"
# NOTE: parse_dates=True only asks pandas to try parsing the index, so the
# 'Date' column is still loaded as plain strings (object dtype).
data = pd.read_csv(data_link, sep=';', parse_dates=True)
# quick sanity check: (rows, columns), then a preview of the first rows
print(data.shape)
data.head()

(115137, 81)


Unnamed: 0,ID OMM station,Date,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,...,Longitude,Latitude,communes (name),communes (code),EPCI (name),EPCI (code),department (name),department (code),region (name),region (code)
0,7591,2019-05-26T08:00:00+02:00,,40.0,3.0,30.0,0.8,282.85,282.55,98.0,...,6.502333,44.565667,Embrun,5046,CC Serre-Ponçon,200067742,Hautes-Alpes,5,Provence-Alpes-Côte d'Azur,93
1,7661,2019-05-26T17:00:00+02:00,101230.0,-90.0,7.0,300.0,3.2,297.15,285.45,48.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
2,7661,2019-05-30T17:00:00+02:00,102230.0,30.0,1.0,240.0,3.6,294.35,281.25,43.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
3,7650,2019-06-02T17:00:00+02:00,101890.0,-80.0,7.0,200.0,5.0,298.75,284.55,41.0,...,5.216,43.437667,Marignane,13054,Métropole d'Aix-Marseille-Provence,200054807,Bouches-du-Rhône,13,Provence-Alpes-Côte d'Azur,93
4,7661,2019-06-03T11:00:00+02:00,101870.0,60.0,1.0,230.0,3.0,294.75,286.35,59.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93


In [27]:
# removing useless columns
# TODO: add EPCI (or department) once the number of inhabitants has been added
# removing columns with too many NaN values
# NaN percentage (0-100) used as the cut-off when filtering columns below
acceptablePercentage = 40

def removeNaN(threshold, data):
    """Drop every column whose percentage of NaN values reaches ``threshold``.

    Parameters
    ----------
    threshold : int or float
        Percentage (0-100). A column is removed when its share of NaN values
        is greater than or equal to this value.
    data : pandas.DataFrame
        Frame to filter. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the columns that contain too many NaN values.
    """
    numRows = data.shape[0]
    if numRows == 0:
        # no rows: a NaN percentage cannot be computed, so nothing is dropped
        return data
    nan_percentages = data.isna().sum() * 100 / numRows
    # compare against the `threshold` argument (not a notebook-level global)
    # so the function honours the value the caller actually passes
    too_sparse = nan_percentages[nan_percentages >= threshold].index
    return data.drop(columns=too_sparse)

# filter the frame with the threshold defined above, then display the result
data = removeNaN(acceptablePercentage, data)
data

Unnamed: 0,ID OMM station,Date,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,...,Longitude,Latitude,communes (name),communes (code),EPCI (name),EPCI (code),department (name),department (code),region (name),region (code)
0,7591,2019-05-26T08:00:00+02:00,,40.0,3.0,30.0,0.8,282.85,282.55,98.0,...,6.502333,44.565667,Embrun,5046,CC Serre-Ponçon,200067742,Hautes-Alpes,5,Provence-Alpes-Côte d'Azur,93
1,7661,2019-05-26T17:00:00+02:00,101230.0,-90.0,7.0,300.0,3.2,297.15,285.45,48.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
2,7661,2019-05-30T17:00:00+02:00,102230.0,30.0,1.0,240.0,3.6,294.35,281.25,43.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
3,7650,2019-06-02T17:00:00+02:00,101890.0,-80.0,7.0,200.0,5.0,298.75,284.55,41.0,...,5.216000,43.437667,Marignane,13054,Métropole d'Aix-Marseille-Provence,200054807,Bouches-du-Rhône,13,Provence-Alpes-Côte d'Azur,93
4,7661,2019-06-03T11:00:00+02:00,101870.0,60.0,1.0,230.0,3.0,294.75,286.35,59.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115132,7661,2018-12-01T01:00:00+01:00,101560.0,80.0,1.0,320.0,1.6,282.65,281.05,90.0,...,5.940833,43.079333,Saint-Mandrier-sur-Mer,83153,Métropole Toulon-Provence-Méditerranée,248300543,Var,83,Provence-Alpes-Côte d'Azur,93
115133,7650,2018-12-01T07:00:00+01:00,101780.0,90.0,1.0,320.0,4.3,281.25,279.25,87.0,...,5.216000,43.437667,Marignane,13054,Métropole d'Aix-Marseille-Provence,200054807,Bouches-du-Rhône,13,Provence-Alpes-Côte d'Azur,93
115134,7650,2019-01-10T07:00:00+01:00,101290.0,60.0,3.0,340.0,7.7,276.85,271.45,68.0,...,5.216000,43.437667,Marignane,13054,Métropole d'Aix-Marseille-Provence,200054807,Bouches-du-Rhône,13,Provence-Alpes-Côte d'Azur,93
115135,7591,2019-01-10T13:00:00+01:00,,-70.0,8.0,190.0,1.6,275.75,259.85,30.0,...,6.502333,44.565667,Embrun,5046,CC Serre-Ponçon,200067742,Hautes-Alpes,5,Provence-Alpes-Côte d'Azur,93


In [28]:
# Remove the cities' characteristics (and the other columns listed below)
# that are not used in the rest of the analysis.
# TODO: define the criteria used to select the columns to remove
toRemove = [
    'Coordonnees',
    'Nom',
    'Longitude',
    'Latitude',
    'communes (name)',
    'communes (code)',
    'EPCI (name)',
    'EPCI (code)',
    'department (name)',
    'department (code)',
    'region (name)',
    'region (code)',
    'Temps passé 1.1',
    'Temps présent.1',
    'Altitude',
]
# errors='ignore': some of these columns may already be gone because of the
# previous deletions
data = data.drop(labels=toRemove, axis='columns', errors='ignore')
data

Unnamed: 0,ID OMM station,Date,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,...,Temps présent,Pression station,Rafales sur une période,Periode de mesure de la rafale,Précipitations dans la dernière heure,Précipitations dans les 3 dernières heures,Précipitations dans les 6 dernières heures,Précipitations dans les 12 dernières heures,Type de tendance barométrique.1,Température (°C)
0,7591,2019-05-26T08:00:00+02:00,,40.0,3.0,30.0,0.8,282.85,282.55,98.0,...,0.0,91440.0,2.1,-10.0,0.0,0.0,0.0,0.2,3.0,9.7
1,7661,2019-05-26T17:00:00+02:00,101230.0,-90.0,7.0,300.0,3.2,297.15,285.45,48.0,...,,99650.0,8.1,-10.0,0.0,0.0,0.6,0.6,7.0,24.0
2,7661,2019-05-30T17:00:00+02:00,102230.0,30.0,1.0,240.0,3.6,294.35,281.25,43.0,...,,100620.0,6.0,-10.0,0.0,0.0,0.0,0.0,1.0,21.2
3,7650,2019-06-02T17:00:00+02:00,101890.0,-80.0,7.0,200.0,5.0,298.75,284.55,41.0,...,0.0,101520.0,8.1,-10.0,0.0,0.0,0.0,0.0,7.0,25.6
4,7661,2019-06-03T11:00:00+02:00,101870.0,60.0,1.0,230.0,3.0,294.75,286.35,59.0,...,,100270.0,4.8,-10.0,0.0,0.0,0.0,0.0,1.0,21.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115132,7661,2018-12-01T01:00:00+01:00,101560.0,80.0,1.0,320.0,1.6,282.65,281.05,90.0,...,,100070.0,4.9,-10.0,-0.1,0.6,2.4,23.2,1.0,9.5
115133,7650,2018-12-01T07:00:00+01:00,101780.0,90.0,1.0,320.0,4.3,281.25,279.25,87.0,...,0.0,101390.0,5.8,-10.0,0.0,0.0,0.0,-0.1,1.0,8.1
115134,7650,2019-01-10T07:00:00+01:00,101290.0,60.0,3.0,340.0,7.7,276.85,271.45,68.0,...,0.0,100890.0,13.1,-10.0,0.0,0.0,0.0,0.0,3.0,3.7
115135,7591,2019-01-10T13:00:00+01:00,,-70.0,8.0,190.0,1.6,275.75,259.85,30.0,...,0.0,91250.0,3.7,-10.0,0.0,0.0,0.0,0.0,8.0,2.6


In [29]:
# Order the rows by increasing 'Date'.
# 'Date' is still an object (string) column at this point, so the ordering is
# the lexicographic order of the timestamp strings.
data = data.sort_values(by=['Date'], ascending=True)
data.dtypes

ID OMM station                                   int64
Date                                            object
Pression au niveau mer                         float64
Variation de pression en 3 heures              float64
Type de tendance barométrique                  float64
Direction du vent moyen 10 mn                  float64
Vitesse du vent moyen 10 mn                    float64
Température                                    float64
Point de rosée                                 float64
Humidité                                       float64
Visibilité horizontale                         float64
Temps présent                                  float64
Pression station                               float64
Rafales sur une période                        float64
Periode de mesure de la rafale                 float64
Précipitations dans la dernière heure          float64
Précipitations dans les 3 dernières heures     float64
Précipitations dans les 6 dernières heures     float64
Précipitat

In [30]:
# Average all the rows that share the same 'Date' value, then keep only the
# rows whose 'Date' label is '2019-12-01' or later ('Date' becomes the index
# after the groupby, so the slice is a label slice on that index).
data = (
    data
    .groupby(['Date'])
    .mean()
    .loc['2019-12-01':]
)
data

Unnamed: 0_level_0,ID OMM station,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,Visibilité horizontale,Temps présent,Pression station,Rafales sur une période,Periode de mesure de la rafale,Précipitations dans la dernière heure,Précipitations dans les 3 dernières heures,Précipitations dans les 6 dernières heures,Précipitations dans les 12 dernières heures,Type de tendance barométrique.1,Température (°C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-12-01T01:00:00+01:00,7648.0,101720.000000,-70.0,7.25,135.000000,5.175000,284.000,279.175,74.00,27910.000000,20.000000,98605.0,9.800000,-10.0,0.050,0.150,0.150000,0.150000,7.25,10.850
2019-12-01T04:00:00+01:00,7648.0,101533.333333,-172.5,8.00,252.500000,5.975000,284.825,280.575,77.00,14430.000000,36.750000,98432.5,9.200000,-10.0,0.475,0.775,0.925000,0.925000,8.00,11.675
2019-12-01T07:00:00+01:00,7648.0,101413.333333,-145.0,6.25,133.333333,8.033333,284.250,282.075,86.50,15785.000000,37.250000,98287.5,15.566667,-10.0,0.475,1.700,2.500000,2.650000,6.25,11.100
2019-12-01T10:00:00+01:00,7648.0,101443.333333,45.0,3.75,306.666667,5.566667,282.975,281.925,93.00,7750.000000,46.750000,98332.5,9.933333,-10.0,3.525,7.475,9.200000,10.150000,3.75,9.825
2019-12-01T13:00:00+01:00,7648.0,101333.333333,-70.0,5.50,220.000000,4.525000,282.650,281.800,94.50,5106.666667,64.666667,98262.5,7.733333,-10.0,4.150,12.600,20.075000,22.600000,5.50,9.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-29T10:00:00+01:00,7648.0,101613.333333,140.0,1.25,275.000000,6.800000,282.450,276.350,66.75,28615.000000,1.250000,98502.5,12.475000,-10.0,0.000,0.000,-0.033333,0.266667,1.25,9.300
2020-01-29T13:00:00+01:00,7648.0,101620.000000,-7.5,2.00,240.000000,5.700000,285.775,276.500,55.75,38796.666667,0.333333,98495.0,9.725000,-10.0,0.000,0.000,0.000000,0.266667,2.00,12.625
2020-01-29T16:00:00+01:00,7648.0,101596.666667,-10.0,4.00,227.500000,7.000000,285.050,278.175,63.50,38566.666667,0.666667,98485.0,10.400000,-10.0,0.000,0.000,0.000000,-0.033333,4.00,11.900
2020-01-29T19:00:00+01:00,7648.0,101756.666667,172.5,3.00,160.000000,4.625000,282.275,277.900,74.50,35000.000000,0.500000,98657.5,7.300000,-10.0,0.000,0.000,0.000000,0.000000,3.00,9.125


In [31]:
# Convert the temperature from Kelvin to Celsius.
# NOTE: the column is overwritten with its shifted values, so running this
# cell a second time would subtract the offset again.
kelvin = 273.15
data["Température"] = data["Température"] - kelvin
data["Température"]