In [1]:
# Dataset name: temperature - transformed max. daily temperature, KNMI14, Netherlands
# URL:          https://dataplatform.knmi.nl/dataset/knmi14-maximum-temperature-3-2
# Goal:         Which warmth category does a given day fall into?

In [1]:
import pandas                   as pd
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.linear_model       import LogisticRegression
from sklearn.model_selection    import train_test_split
from sklearn                    import metrics

# Load the KNMI14 transformed daily maximum temperature table from the local CSV file.
df = pd.read_csv("data/KNMI14_GL_2085_tx___19810101-20101231_v3.2.csv")

# Column '0' (the first column) is cast to string so that it is not treated as a numeric
# measurement in the average calculation and classification assignment later on.
df = df.astype({'0': 'str'})

# Discard the four leading rows (positions 0-3); they are not needed for the analysis.
df = df.drop(index=df.index[[0, 1, 2, 3]])

In [2]:
# Summarize the columns, their dtypes and non-null counts after the leading rows were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10957 entries, 4 to 10960
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       10957 non-null  object 
 1   210     10957 non-null  float64
 2   235     10957 non-null  float64
 3   240     10957 non-null  float64
 4   260     10957 non-null  float64
 5   270     10957 non-null  float64
 6   275     10957 non-null  float64
 7   280     10957 non-null  float64
 8   290     10957 non-null  float64
 9   310     10957 non-null  float64
 10  344     10957 non-null  float64
 11  350     10957 non-null  float64
 12  370     10957 non-null  float64
 13  375     10957 non-null  float64
 14  380     10957 non-null  float64
dtypes: float64(14), object(1)
memory usage: 1.3+ MB


In [3]:
# Count the missing values in each column.
df.isnull().sum()

0      0
210    0
235    0
240    0
260    0
270    0
275    0
280    0
290    0
310    0
344    0
350    0
370    0
375    0
380    0
dtype: int64

In [4]:
# add temperature classifications based on KNMI definitions
# Row-wise mean over the numeric columns only. Column '0' was cast to string earlier, and
# since pandas 2.0 DataFrame.mean no longer silently skips non-numeric columns, so
# numeric_only=True is required to keep that column out of the average.
df['Average'] = df.mean(axis=1, numeric_only=True)

def setClassification(row):
    """Return the warmth category label for a row based on its 'Average' value.

    The value is compared against lower bounds in descending order and the label
    of the first bound it reaches is returned. A value that reaches none of the
    bounds yields "Koud".
    """
    lower_bounds = (
        (30, "Tropisch"),
        (25, "Zomers"),
        (20, "Warm"),
        (10, "Gemiddeld"),
        (5, "Fris"),
    )
    average = row['Average']
    for bound, label in lower_bounds:
        if average >= bound:
            return label
    return "Koud"
# Label every row by passing it to setClassification (axis=1 hands over one row at a time).
df['Classification'] = df.apply(setClassification, axis=1)

In [5]:
df.head(10)
# The column headers are the measuring station ids; the first column, "0", holds the measurement date.
# The measurements start at index label 4, because the four leading rows were dropped.

Unnamed: 0,0,210,235,240,260,270,275,280,290,310,344,350,370,375,380,Average,Classification
4,19810101,8.4,7.0,7.4,9.1,7.0,8.6,6.4,7.7,8.9,8.8,8.2,9.2,8.6,8.7,8.142857,Fris
5,19810102,9.0,9.3,9.0,9.5,9.6,8.8,9.1,9.0,8.5,9.6,8.9,9.4,9.3,8.9,9.135714,Fris
6,19810103,10.1,9.9,10.1,11.4,10.2,10.5,10.5,10.5,10.6,11.0,11.4,11.7,11.8,11.4,10.792857,Gemiddeld
7,19810104,7.8,7.3,7.4,7.4,7.4,6.5,6.8,6.6,8.0,8.0,8.5,7.7,7.6,7.6,7.471429,Fris
8,19810105,6.7,5.9,5.7,5.8,5.4,4.8,4.8,4.0,6.5,6.0,5.7,5.4,5.6,4.2,5.464286,Fris
9,19810106,6.2,4.2,5.0,3.8,3.3,2.1,2.0,2.3,6.9,5.9,4.3,3.9,3.7,3.8,4.1,Koud
10,19810107,5.8,4.8,4.9,4.3,3.5,1.6,1.7,1.6,5.8,5.4,4.9,4.6,4.2,3.5,4.042857,Koud
11,19810108,6.1,6.9,5.2,4.2,5.0,2.7,3.2,2.8,5.7,5.6,4.1,3.8,3.6,3.4,4.45,Koud
12,19810109,7.9,7.8,7.8,8.1,7.6,7.5,7.1,6.9,8.3,8.6,8.4,8.1,7.8,6.0,7.707143,Fris
13,19810110,6.2,6.4,6.0,6.2,6.2,5.6,5.7,5.4,6.5,6.3,6.0,6.3,7.2,5.7,6.121429,Fris


In [6]:
# Show the last 10 rows of the frame.
df.tail(10)

Unnamed: 0,0,210,235,240,260,270,275,280,290,310,344,350,370,375,380,Average,Classification
10951,20101222,0.5,1.4,0.5,0.9,0.8,0.1,-0.2,-0.4,2.2,0.8,1.9,2.5,0.8,2.5,1.021429,Koud
10952,20101223,2.3,2.5,2.3,1.9,2.4,1.3,1.6,1.3,2.9,2.4,1.7,1.7,1.5,1.1,1.921429,Koud
10953,20101224,1.7,2.2,1.8,1.6,1.8,0.5,1.3,0.3,3.0,1.9,1.0,1.0,0.8,0.3,1.371429,Koud
10954,20101225,4.5,3.9,2.8,2.4,1.9,0.2,0.5,-1.9,4.8,3.7,1.7,1.7,-0.4,-1.6,1.728571,Koud
10955,20101226,5.6,5.2,4.9,6.3,4.0,3.4,2.3,2.7,6.2,5.6,5.6,4.7,4.6,2.6,4.55,Koud
10956,20101227,3.9,5.1,4.2,2.7,3.3,2.9,2.8,3.0,2.4,3.3,2.1,2.3,3.1,2.9,3.142857,Koud
10957,20101228,2.9,2.8,3.1,3.0,2.1,2.4,1.0,2.3,3.7,3.9,3.5,3.2,2.6,4.4,2.921429,Koud
10958,20101229,3.6,1.8,3.7,3.5,-0.5,3.0,-3.0,-0.6,4.5,4.3,4.0,4.2,2.7,4.5,2.55,Koud
10959,20101230,4.8,6.1,3.6,3.6,5.0,1.2,3.2,1.0,4.8,4.6,4.0,4.0,0.4,3.8,3.578571,Koud
10960,20101231,6.6,6.9,6.4,5.2,6.3,3.7,5.1,3.5,5.2,6.7,4.1,3.5,3.4,2.7,4.95,Koud


In [7]:
# Feature matrix: the daily average reshaped to a single-column 2-D array.
X = df['Average'].values.reshape(-1,1)
# Target: the category label that was derived from that same average.
y = df['Classification']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the split, and every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# k-nearest-neighbours classifier that looks at the 3 closest training samples.
kncl = KNeighborsClassifier(n_neighbors=3)
# Fit on the training split; as the last expression of the cell, the fitted estimator is displayed.
kncl.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_neighbors=3)

In [9]:
# Mean accuracy on the held-out test split, rounded to 3 decimals.
# NOTE(review): 'Classification' is computed from 'Average' with fixed thresholds and 'Average'
# is the only feature, so a score at or near 1.0 is expected and says little about the model.
round(kncl.score(X_test, y_test), 3)

1.0

In [10]:
# Predict the category for every test sample and compare against the true labels.
y_pred = kncl.predict(X=X_test)
# NOTE: 'accurracy' is a misspelling of 'accuracy'; the name is kept as-is so that any other
# cell referring to it keeps working.
accurracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
round(accurracy, 3)

1.0

In [11]:
# Spot-check the classifier with one hand-picked average temperature per expected category.
for temperature in (-16, 7, 16, 22, 32):
    print(kncl.predict([[temperature]]))

['Koud']
['Fris']
['Gemiddeld']
['Warm']
['Tropisch']
