In [25]:
# Dataset naam: Daggegevens van het weer in Nederland - De Bilt
# URL:          https://www.knmi.nl/nederland-nu/klimatologie/daggegevens
# Doel:         Is het lelijk weer op basis van inputparameters?

# YYYYMMDD  = Datum (YYYY=jaar MM=maand DD=dag) / Date (YYYY=year MM=month DD=day)
# DDVEC     = Vectorgemiddelde windrichting in graden (360=noord, 90=oost, 180=zuid, 270=west, 0=windstil/variabel). Zie http://www.knmi.nl/kennis-en-datacentrum/achtergrond/klimatologische-brochures-en-boeken / Vector mean wind direction in degrees (360=north, 90=east, 180=south, 270=west, 0=calm/variable)
# FHVEC     = Vectorgemiddelde windsnelheid (in 0.1 m/s). Zie http://www.knmi.nl/kennis-en-datacentrum/achtergrond/klimatologische-brochures-en-boeken / Vector mean windspeed (in 0.1 m/s)
# FG        = Etmaalgemiddelde windsnelheid (in 0.1 m/s) / Daily mean windspeed (in 0.1 m/s) 
# FHX       = Hoogste uurgemiddelde windsnelheid (in 0.1 m/s) / Maximum hourly mean windspeed (in 0.1 m/s)
# FHXH      = Uurvak waarin FHX is gemeten / Hourly division in which FHX was measured
# FHN       = Laagste uurgemiddelde windsnelheid (in 0.1 m/s) / Minimum hourly mean windspeed (in 0.1 m/s)
# FHNH      = Uurvak waarin FHN is gemeten / Hourly division in which FHN was measured
# FXX       = Hoogste windstoot (in 0.1 m/s) / Maximum wind gust (in 0.1 m/s)
# FXXH      = Uurvak waarin FXX is gemeten / Hourly division in which FXX was measured
# TG        = Etmaalgemiddelde temperatuur (in 0.1 graden Celsius) / Daily mean temperature in (0.1 degrees Celsius)
# TN        = Minimum temperatuur (in 0.1 graden Celsius) / Minimum temperature (in 0.1 degrees Celsius)
# TNH       = Uurvak waarin TN is gemeten / Hourly division in which TN was measured
# TX        = Maximum temperatuur (in 0.1 graden Celsius) / Maximum temperature (in 0.1 degrees Celsius)
# TXH       = Uurvak waarin TX is gemeten / Hourly division in which TX was measured
# T10N      = Minimum temperatuur op 10 cm hoogte (in 0.1 graden Celsius) / Minimum temperature at 10 cm above surface (in 0.1 degrees Celsius)
# T10NH     = 6-uurs tijdvak waarin T10N is gemeten / 6-hourly division in which T10N was measured; 6=0-6 UT, 12=6-12 UT, 18=12-18 UT, 24=18-24 UT 
# SQ        = Zonneschijnduur (in 0.1 uur) berekend uit de globale straling (-1 voor <0.05 uur) / Sunshine duration (in 0.1 hour) calculated from global radiation (-1 for <0.05 hour)
# SP        = Percentage van de langst mogelijke zonneschijnduur / Percentage of maximum potential sunshine duration
# Q         = Globale straling (in J/cm2) / Global radiation (in J/cm2)
# DR        = Duur van de neerslag (in 0.1 uur) / Precipitation duration (in 0.1 hour)
# RH        = Etmaalsom van de neerslag (in 0.1 mm) (-1 voor <0.05 mm) / Daily precipitation amount (in 0.1 mm) (-1 for <0.05 mm)
# RHX       = Hoogste uursom van de neerslag (in 0.1 mm) (-1 voor <0.05 mm) / Maximum hourly precipitation amount (in 0.1 mm) (-1 for <0.05 mm)
# RHXH      = Uurvak waarin RHX is gemeten / Hourly division in which RHX was measured
# PG        = Etmaalgemiddelde luchtdruk herleid tot zeeniveau (in 0.1 hPa) berekend uit 24 uurwaarden / Daily mean sea level pressure (in 0.1 hPa) calculated from 24 hourly values
# PX        = Hoogste uurwaarde van de luchtdruk herleid tot zeeniveau (in 0.1 hPa) / Maximum hourly sea level pressure (in 0.1 hPa)
# PXH       = Uurvak waarin PX is gemeten / Hourly division in which PX was measured
# PN        = Laagste uurwaarde van de luchtdruk herleid tot zeeniveau (in 0.1 hPa) / Minimum hourly sea level pressure (in 0.1 hPa)
# PNH       = Uurvak waarin PN is gemeten / Hourly division in which PN was measured
# VVN       = Minimum opgetreden zicht / Minimum visibility; 0: <100 m, 1:100-200 m, 2:200-300 m,..., 49:4900-5000 m, 50:5-6 km, 56:6-7 km, 57:7-8 km,..., 79:29-30 km, 80:30-35 km, 81:35-40 km,..., 89: >70 km)
# VVNH      = Uurvak waarin VVN is gemeten / Hourly division in which VVN was measured
# VVX       = Maximum opgetreden zicht / Maximum visibility; 0: <100 m, 1:100-200 m, 2:200-300 m,..., 49:4900-5000 m, 50:5-6 km, 56:6-7 km, 57:7-8 km,..., 79:29-30 km, 80:30-35 km, 81:35-40 km,..., 89: >70 km)
# VVXH      = Uurvak waarin VVX is gemeten / Hourly division in which VVX was measured
# NG        = Etmaalgemiddelde bewolking (bedekkingsgraad van de bovenlucht in achtsten, 9=bovenlucht onzichtbaar) / Mean daily cloud cover (in octants, 9=sky invisible)
# UG        = Etmaalgemiddelde relatieve vochtigheid (in procenten) / Daily mean relative atmospheric humidity (in percents)
# UX        = Maximale relatieve vochtigheid (in procenten) / Maximum relative atmospheric humidity (in percents)
# UXH       = Uurvak waarin UX is gemeten / Hourly division in which UX was measured
# UN        = Minimale relatieve vochtigheid (in procenten) / Minimum relative atmospheric humidity (in percents)
# UNH       = Uurvak waarin UN is gemeten / Hourly division in which UN was measured
# EV24      = Referentiegewasverdamping (Makkink) (in 0.1 mm) / Potential evapotranspiration (Makkink) (in 0.1 mm)

In [26]:
import pandas                   as pd
import numpy                    as np
from sklearn                    import preprocessing
from sklearn.model_selection    import train_test_split
from keras.models               import Sequential
from keras.layers               import Dense

df = pd.read_csv(".\data\lelijk_weer_MvL_no_spaces.csv")
pd.set_option('display.max_rows', df.shape[0]+1)
# replace NaN values with column average
print("Replacing NaN values with column average...")
df = df.fillna(df.mean())
print("Replacing NaN values with column average... Done")

Replacing NaN values with column average...
Replacing NaN values with column average... Done


In [27]:
# Voeg slecht-weer-indicator toe aan de bestaande data.
# Bepalend zijn:

# +--------+-------------------------------------------------------+-----------+
# |Code    |Beschrijving                                           |Eenheid    |
# +--------+-------------------------------------------------------+-----------+
# |FG:     |gemiddelde windselheid								   |0.1 m/s    |
# |FXX:    |hoogste windstoot									   |0.1 m/s    |
# |TG:     |gemiddelde temperatuur								   |0.1 C      |
# |SP:     |percentage van langst mogelijke zonneschijnsduur	   |%          |
# |DR:     |duur neerslag										   |0.1 h      |
# |RH:     |etmaalsom neerslag									   |0.1 mm     |
# +--------+-------------------------------------------------------+-----------+

# Wanneer deze variabelen boven een bepaalde waarde komen, scoren ze een punt.
# Bij een score van 50% van de maximale score, krijgt het weer de classificatie "slect weer".
# 0 = geen slecht weer
# 1 = slecht weer

def setClassification (row):
    weatherScore = 0
    if row['FG'] >= 80: # Windkracht 5 of meer
        weatherScore += 1
    if row['FXX'] >= 172: # Windkracht 8 of meer
        weatherScore += 1
    if row['TG'] <= 100: # 10 graden C of kouder
        weatherScore += 1
    if row['SP'] <= 20: # zon scheen niet meer dan 20% van mogeijke maximale zonneschijnduur
        weatherScore += 1
    if row['DR'] >= 40: # regenduur 4 uur of langer
        weatherScore += 1
    if row['RH'] >= 200: # regensom 20 mm of meer
        weatherScore += 1

    scoreMax = 6
    weatherClassifier = 0
    if weatherScore >= (0.5 * scoreMax):
        weatherClassifier = 1
    return weatherClassifier

df['Classification'] = df.apply (lambda row: setClassification(row), axis=1)

In [28]:
df['Classification'].value_counts()

0    39788
1     4198
Name: Classification, dtype: int64

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43986 entries, 0 to 43985
Data columns (total 42 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   STN             43986 non-null  int64  
 1   YYYYMMDD        43986 non-null  int64  
 2   DDVEC           43986 non-null  float64
 3   FHVEC           43986 non-null  float64
 4   FG              43986 non-null  float64
 5   FHX             43986 non-null  float64
 6   FHXH            43986 non-null  float64
 7   FHN             43986 non-null  float64
 8   FHNH            43986 non-null  float64
 9   FXX             43986 non-null  float64
 10  FXXH            43986 non-null  float64
 11  TG              43986 non-null  int64  
 12  TN              43986 non-null  int64  
 13  TNH             43986 non-null  float64
 14  TX              43986 non-null  int64  
 15  TXH             43986 non-null  float64
 16  T10N            43986 non-null  float64
 17  T10NH           43986 non-null 

In [30]:
df.isnull().sum()

STN               0
YYYYMMDD          0
DDVEC             0
FHVEC             0
FG                0
FHX               0
FHXH              0
FHN               0
FHNH              0
FXX               0
FXXH              0
TG                0
TN                0
TNH               0
TX                0
TXH               0
T10N              0
T10NH             0
SQ                0
SP                0
Q                 0
DR                0
RH                0
RHX               0
RHXH              0
PG                0
PX                0
PXH               0
PN                0
PNH               0
VVN               0
VVNH              0
VVX               0
VVXH              0
NG                0
UG                0
UX                0
UXH               0
UN                0
UNH               0
EV24              0
Classification    0
dtype: int64

In [31]:
# scale de data en zet de classificatie terug
df_s = preprocessing.scale(df)
df_s = pd.DataFrame(df_s, columns=df.columns)
df_s['Classification'] = df['Classification']
df = df_s
df.head()

Unnamed: 0,STN,YYYYMMDD,DDVEC,FHVEC,FG,FHX,FHXH,FHN,FHNH,FXX,...,VVX,VVXH,NG,UG,UX,UXH,UN,UNH,EV24,Classification
0,0.0,-1.719336,0.0,3.590522e-16,0.0,0.0,0.0,0.0,-2.099041e-16,4.483333e-16,...,1.683233e-15,0.0,5.388276e-16,-1.490634,-3.530953e-15,-2.67613e-16,0.0,0.0,0.0,0
1,0.0,-1.719333,0.0,3.590522e-16,0.0,0.0,0.0,0.0,-2.099041e-16,4.483333e-16,...,1.683233e-15,0.0,5.388276e-16,0.437651,-3.530953e-15,-2.67613e-16,0.0,0.0,0.0,0
2,0.0,-1.71933,0.0,3.590522e-16,0.0,0.0,0.0,0.0,-2.099041e-16,4.483333e-16,...,1.683233e-15,0.0,5.388276e-16,0.726893,-3.530953e-15,-2.67613e-16,0.0,0.0,0.0,0
3,0.0,-1.719327,0.0,3.590522e-16,0.0,0.0,0.0,0.0,-2.099041e-16,4.483333e-16,...,1.683233e-15,0.0,5.388276e-16,-0.237249,-3.530953e-15,-2.67613e-16,0.0,0.0,0.0,0
4,0.0,-1.719325,0.0,3.590522e-16,0.0,0.0,0.0,0.0,-2.099041e-16,4.483333e-16,...,1.683233e-15,0.0,5.388276e-16,-1.587048,-3.530953e-15,-2.67613e-16,0.0,0.0,0.0,0


In [32]:
X = df[['FG', 'FXX', 'TG', 'SP', 'DR', 'RH']]
y = df['Classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [33]:
# Maak het neurale netwerk
m = Sequential()
m.add(Dense(16, activation='relu', input_dim=6))
m.add(Dense(8, activation='relu'))
m.add(Dense(1, activation='sigmoid'))

In [34]:
m.compile(optimizer='adam', loss='binary_crossentropy', metrics= ['accuracy'])

In [35]:
m.fit(X_train, y_train, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x2035d705280>

In [36]:
score_test = m.evaluate(X_test, y_test)



In [49]:
# +--------+-------------------------------------------------------+-----------+
# |Code    |Beschrijving                                           |Eenheid    |
# +--------+-------------------------------------------------------+-----------+
# |FG:     |gemiddelde windselheid								   |0.1 m/s    |
# |FXX:    |hoogste windstoot									   |0.1 m/s    |
# |TG:     |gemiddelde temperatuur								   |0.1 C      |
# |SP:     |percentage van langst mogelijke zonneschijnsduur	   |%          |
# |DR:     |duur neerslag										   |0.1 h      |
# |RH:     |etmaalsom neerslag									   |0.1 mm     |
# +--------+-------------------------------------------------------+-----------+
x = [[90.0, 200.0, 50.0, 0.0, 50.0, 250.0]]
x_norm = preprocessing.normalize(x)
print(np.argmax(m.predict(x_norm), axis=-1))

[[0.26470588 0.58823529 0.14705882 0.         0.14705882 0.73529412]]
[0]
