In [1]:
# Dataset nanm: weather and airpressure, atmosperical pressure and weathercodes at a 10 minute interval
# URL:          https://dataplatform.knmi.nl/dataset/weer-en-luchtdruk-1-0
# Doel:         bepaal de weercode op basis van luchtdruk, zicht, en voorgaande weercode

In [2]:
import pandas as pd
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.model_selection        import train_test_split
from sklearn.linear_model           import LinearRegression
from sklearn                        import preprocessing
from sklearn                        import utils

In [3]:
# Read data file
df = pd.read_csv(".\\data\\merged_data_small_set.csv", ",")

# Remove white spaces from data
print("Removing white space...")
df.columns = df.columns.str.replace(' ','')
print("Removing white space... Done")

# drop the rows with NaN
print("Dropping rows with NaN values...")
df = df.dropna()
print("Dropping rows with NaN values... Done")

# Set the data types
print("Setting data types...")
df.DTG = pd.to_datetime(df.DTG)
df.LOCATION = df.LOCATION.astype('str')
df.NAME = df.NAME.astype('str')
df.LATITUDE = df.LATITUDE.astype('float64')
df.LONGITUDE = df.LONGITUDE.astype('float64')
df.ALTITUDE = df.ALTITUDE.astype('float64')
df.P_NAP_MSL_10 = df.P_NAP_MSL_10.astype('float64')
df.P_STN_LEVEL_10 = df.P_STN_LEVEL_10.astype('float64')
df.P_SENSOR_10 = df.P_SENSOR_10.astype('int64')
df.VV_10 = df.VV_10.astype('int64')
df.WW_IND_CURR_10 = df.WW_IND_CURR_10.astype('int64')
df.WW_IND_PAST_10_10 = df.WW_IND_PAST_10_10.astype('int64')
df.WW_CURR_10 = df.WW_CURR_10.astype('int64')
df.WW_PAST_10 = df.WW_PAST_10.astype('int64')
df.AH_10 = df.AH_10.astype('int64')
df.MOR_10 = df.MOR_10.astype('int64')
print("Setting data types... Done")

Removing white space...
Removing white space... Done
Dropping rows with NaN values...
Dropping rows with NaN values... Done
Setting data types...
Setting data types... Done


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4301 entries, 0 to 4319
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   DTG                4301 non-null   datetime64[ns]
 1   LOCATION           4301 non-null   object        
 2   NAME               4301 non-null   object        
 3   LATITUDE           4301 non-null   float64       
 4   LONGITUDE          4301 non-null   float64       
 5   ALTITUDE           4301 non-null   float64       
 6   P_NAP_MSL_10       4301 non-null   float64       
 7   P_STN_LEVEL_10     4301 non-null   float64       
 8   P_SENSOR_10        4301 non-null   int64         
 9   VV_10              4301 non-null   int64         
 10  WW_IND_CURR_10     4301 non-null   int64         
 11  WW_IND_PAST_10_10  4301 non-null   int64         
 12  WW_CURR_10         4301 non-null   int64         
 13  WW_PAST_10         4301 non-null   int64         
 14  AH_10   

In [5]:
df.isnull().sum()

DTG                  0
LOCATION             0
NAME                 0
LATITUDE             0
LONGITUDE            0
ALTITUDE             0
P_NAP_MSL_10         0
P_STN_LEVEL_10       0
P_SENSOR_10          0
VV_10                0
WW_IND_CURR_10       0
WW_IND_PAST_10_10    0
WW_CURR_10           0
WW_PAST_10           0
AH_10                0
MOR_10               0
dtype: int64

In [6]:
df.head()

Unnamed: 0,DTG,LOCATION,NAME,LATITUDE,LONGITUDE,ALTITUDE,P_NAP_MSL_10,P_STN_LEVEL_10,P_SENSOR_10,VV_10,WW_IND_CURR_10,WW_IND_PAST_10_10,WW_CURR_10,WW_PAST_10,AH_10,MOR_10
0,2003-04-01 00:10:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1022.638,1022.388,1022,7380,0,0,10,10,5,7380
1,2003-04-01 00:20:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1022.637,1022.387,1022,6630,0,0,10,10,5,6630
2,2003-04-01 00:30:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1022.538,1022.288,1022,3700,1,0,10,10,5,3700
3,2003-04-01 00:40:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1022.438,1022.188,1022,4040,1,1,10,10,5,4040
4,2003-04-01 00:50:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1022.438,1022.188,1022,4280,1,1,10,10,5,4280


In [7]:
df.tail()

Unnamed: 0,DTG,LOCATION,NAME,LATITUDE,LONGITUDE,ALTITUDE,P_NAP_MSL_10,P_STN_LEVEL_10,P_SENSOR_10,VV_10,WW_IND_CURR_10,WW_IND_PAST_10_10,WW_CURR_10,WW_PAST_10,AH_10,MOR_10
4315,2003-04-30 23:20:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1010.723,1010.481,1010,11200,0,0,1,1,5,11200
4316,2003-04-30 23:30:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1010.823,1010.581,1010,12500,0,0,1,1,5,12500
4317,2003-04-30 23:40:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1010.824,1010.582,1010,12500,0,0,3,1,5,12500
4318,2003-04-30 23:50:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1010.824,1010.582,1010,12300,0,0,3,3,5,12300
4319,2003-05-01 00:00:00,260_A_a,De Bilt locatie A,52.098889,5.179722,1.9,1010.924,1010.682,1010,13600,0,0,3,3,5,13600


In [8]:
df.corr()

Unnamed: 0,LATITUDE,LONGITUDE,ALTITUDE,P_NAP_MSL_10,P_STN_LEVEL_10,P_SENSOR_10,VV_10,WW_IND_CURR_10,WW_IND_PAST_10_10,WW_CURR_10,WW_PAST_10,AH_10,MOR_10
LATITUDE,,,,,,,,,,,,,
LONGITUDE,,,,,,,,,,,,,
ALTITUDE,,,,,,,,,,,,,
P_NAP_MSL_10,,,,1.0,1.0,0.99942,0.184063,-0.313756,-0.315007,-0.384926,-0.385363,-0.070148,0.183972
P_STN_LEVEL_10,,,,1.0,1.0,0.99942,0.183956,-0.313764,-0.315016,-0.384942,-0.385379,-0.07012,0.183864
P_SENSOR_10,,,,0.99942,0.99942,1.0,0.183661,-0.313499,-0.314418,-0.385019,-0.385208,-0.070203,0.183567
VV_10,,,,0.184063,0.183956,0.183661,1.0,-0.260651,-0.247773,-0.299873,-0.29015,0.12739,0.999986
WW_IND_CURR_10,,,,-0.313756,-0.313764,-0.313499,-0.260651,1.0,0.733545,0.94804,0.724784,-0.145194,-0.26105
WW_IND_PAST_10_10,,,,-0.315007,-0.315016,-0.314418,-0.247773,0.733545,1.0,0.757095,0.948297,-0.136017,-0.248176
WW_CURR_10,,,,-0.384926,-0.384942,-0.385019,-0.299873,0.94804,0.757095,1.0,0.824822,-0.150121,-0.300236


In [9]:
# LOCATION = identifier, NAME = identifier, LATITUDE in degrees (WGS84), LONGITUDE in degrees (WGS84), ALTITUDE in 0.1 m relative to Mean Sea Level (MSL)
# P_NAP_MSL_10 is luchtdruk herleid naar msl of nap 10' eenheid hPa
# P_STN_LEVEL_10 is luchtdruk herleid naar stationshoogte 10' eenheid hPa
# P_SENSOR_10 is luchtdruk sensorhoogte 10' eenheid hPa
# VV_10 is zicht gemiddeld 10' eenheid m
# WW_IND_CURR_10 is weer indicator verleden weer 10' eenheid code
# WW_IND_PAST_10_10 is weer indicator verleden weer voorgaande 10' 10' eenheid code
# WW_CURR_10 is weer code present weather sensor 10' eenheid code
# WW_PAST_10 is weer code voorgaande 10' 10' eenheid code
# AH_10 is zicht helderheid achtergrond 10' eenheid Candela
# MOR_10 is zicht meteorologisch dagzicht 10' eenheid m

X = df[['P_NAP_MSL_10', 'VV_10', 'WW_PAST_10', 'AH_10', 'MOR_10']]
y = df[['WW_CURR_10']]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2)

In [10]:
kncl = KNeighborsClassifier(n_neighbors=3)
kncl.fit(X_train,y_train.values.ravel())

KNeighborsClassifier(n_neighbors=3)

In [11]:
kncl.score(X_test,y_test)

0.3600464576074332

In [12]:
kncl.predict([[1050.0, 300, 0, 5, 4000]])

# Voor de betekenis van de weercode, zie automatische waarnemingen op https://cdn.knmi.nl/knmi/pdf/bibliotheek/scholierenpdf/weercodes_Nederland.pdf

array([10], dtype=int64)