## Parametry

In [1]:
# Notebook-wide settings, defined together so they are easy to tune:
#   SERIES_LENGHT   - length handed to convert_to_series(); also embedded in
#                     the name of the saved .npy file.
#                     (presumably the number of consecutive rows per series -
#                     TODO confirm against convert_to_series)
#   VALUES_DATATYPE - NumPy dtype name used when the converted series are cast
#                     to an array; also embedded in the output file name.
SERIES_LENGHT, VALUES_DATATYPE = 48, 'float16'

## Wczytanie danych z pliku

In [2]:
import pandas as pd

# Read the raw hourly weather observations into a DataFrame.
data = pd.read_csv("../resources/hourly_irish_weather.csv")

# Show the (truncated) frame followed by its (rows, columns) shape.
print(data, data.shape, sep="\n")

         Unnamed: 0                 date       station   county  longitude  \
0                 0  1989-01-01 00:00:00  Cork_Airport     Cork     -8.485   
1                 1  1989-01-01 01:00:00  Cork_Airport     Cork     -8.485   
2                 2  1989-01-01 02:00:00  Cork_Airport     Cork     -8.485   
3                 3  1989-01-01 03:00:00  Cork_Airport     Cork     -8.485   
4                 4  1989-01-01 04:00:00  Cork_Airport     Cork     -8.485   
...             ...                  ...           ...      ...        ...   
4218049     4497433  2017-12-31 19:00:00    Malin_head  Donegal     -7.339   
4218050     4497434  2017-12-31 20:00:00    Malin_head  Donegal     -7.339   
4218051     4497435  2017-12-31 21:00:00    Malin_head  Donegal     -7.339   
4218052     4497436  2017-12-31 22:00:00    Malin_head  Donegal     -7.339   
4218053     4497437  2017-12-31 23:00:00    Malin_head  Donegal     -7.339   

         latitude  rain  temp  wetb  dewpt  ...  rhum     msl  

## Usunięcie niepełnych danych

In [3]:
# Keep only the rows in which every column has a value, then report how many
# rows and columns are left.
data = data.dropna(axis="index", how="any")
print(data.shape)

(1769250, 21)


## Usunięcie zbędnych kolumn

Zostają:
* Data i godzina - date
* nazwa stacji - station
* opad deszczu w (mm) - rain
* temperatura (°C) - temp
* Wilgotność względna (%) - rhum
* ciśnienie na poziomie morza (hPa) - msl
* średnia prędkość wiatru (kt) - wdsp
* kod synoptyczny (0-100) - ww
* czas nasłonecznienia (h) - sun
* widoczność (m) - vis

In [4]:
# Columns that are not used as features further down the notebook.
unused_columns = [
    "Unnamed: 0",
    "county", "longitude", "latitude",
    "wetb", "dewpt", "vappr",
    "wddir", "w", "clht", "clamt",
]
data = data.drop(columns=unused_columns)

# Also discard any other column whose name starts with "Unnamed"
# (e.g. index columns written out by an earlier CSV export).
unnamed_mask = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_mask]

## Przesunięcie ciśnienia tak, aby minimum wynosiło zero

In [5]:
# Shift the pressure column so that its smallest value becomes 0.
# The offset is kept in `min_press` so the shift can be undone later.
min_press = data["msl"].min()
data["msl"] = data["msl"].sub(min_press)

## Wydzielenie daty i okresu godzinowego

In [6]:
# Parse the timestamp strings once; the calendar features below derive from it.
data["dateTime"] = pd.to_datetime(data["date"])

# Use the vectorised `.dt` accessor instead of a per-row Python lambda: the
# resulting values are the same, but the work is done in bulk rather than by
# calling a Python function for every one of the ~1.7 million rows.
data["day"] = data["dateTime"].dt.date
data["hour"] = data["dateTime"].dt.hour

# One-hot encode the hour of day into twelve two-hour buckets, creating the
# columns is_0_1, is_2_3, ..., is_22_23 in that order. Each flag is 1 when the
# observation hour falls inside the bucket and 0 otherwise.
for bucket_start in range(0, 24, 2):
    bucket_end = bucket_start + 1
    column = "is_%d_%d" % (bucket_start, bucket_end)
    data[column] = data["hour"].isin([bucket_start, bucket_end]).astype("int64")

## Wydzielenie miesiąca

In [7]:
# One-hot encode the month into six two-month buckets. The month number is
# extracted once with the vectorised `.dt` accessor instead of evaluating a
# Python lambda on every Timestamp for each of the six columns.
month = data["dateTime"].dt.month

# (column name, first month of the bucket); the bucket also covers the
# following month. The order matches the original column order.
month_buckets = [
    ("is_jan_feb", 1),
    ("is_mar_apr", 3),
    ("is_may_jun", 5),
    ("is_jul_aug", 7),
    ("is_sep_oct", 9),
    ("is_nov_dec", 11),
]
for column, first_month in month_buckets:
    data[column] = month.isin([first_month, first_month + 1]).astype("int64")

## Wydzielenie typu pogody - na podstawie kodu synoptycznego ww

In [8]:
# One-hot encode the weather code `ww` by its tens digit, creating the columns
# is_type_0 ... is_type_9. Vectorised comparisons replace ten per-row lambda
# passes over the column; the thresholds are unchanged:
#   is_type_0 : ww < 10          (open-ended below)
#   is_type_k : 10*k <= ww < 10*(k+1)   for k = 1..8
#   is_type_9 : ww >= 90         (open-ended above)
ww = data["ww"]

data["is_type_0"] = (ww < 10).astype("int64")
for weather_type in range(1, 9):
    lower = 10 * weather_type
    upper = lower + 10
    data["is_type_%d" % weather_type] = ((ww >= lower) & (ww < upper)).astype("int64")
data["is_type_9"] = (ww >= 90).astype("int64")

## Usunięcie niepotrzebnych kolumn czasu i miejsca

In [9]:
# The raw timestamp, station name and weather code have been turned into
# derived feature columns above, so the source columns can go.
source_columns = ["date", "station", "ww", "dateTime"]
data = data.drop(columns=source_columns)

# List the columns that remain.
print(data.columns)

Index(['rain', 'temp', 'rhum', 'msl', 'wdsp', 'sun', 'vis', 'day', 'hour',
       'is_0_1', 'is_2_3', 'is_4_5', 'is_6_7', 'is_8_9', 'is_10_11',
       'is_12_13', 'is_14_15', 'is_16_17', 'is_18_19', 'is_20_21', 'is_22_23',
       'is_jan_feb', 'is_mar_apr', 'is_may_jun', 'is_jul_aug', 'is_sep_oct',
       'is_nov_dec', 'is_type_0', 'is_type_1', 'is_type_2', 'is_type_3',
       'is_type_4', 'is_type_5', 'is_type_6', 'is_type_7', 'is_type_8',
       'is_type_9'],
      dtype='object')


## Standaryzacja kolumn liczbowych (średnia 0, odchylenie standardowe 1)

In [10]:
from sklearn import preprocessing

# Standardise each continuous measurement column independently.
# A fresh StandardScaler is fitted per column, exactly as before; the loop
# replaces seven copy-pasted fit/transform pairs, and the variable is called
# `scaler` because the object is a StandardScaler, not a min-max scaler.
columns_to_standardize = ['rain', 'temp', 'rhum', 'msl', 'wdsp', 'sun', 'vis']

for column in columns_to_standardize:
    scaler = preprocessing.StandardScaler()
    # fit_transform expects a 2-D array, hence the double brackets.
    data[[column]] = scaler.fit_transform(data[[column]].values)

## Konwersja na tensor 3-wymiarowy

In [11]:
import numpy as np
from script.main.series_converter import convert_to_series

# Shape of the flat feature table before it is regrouped.
print(data.shape)

# Let the project helper regroup the table using SERIES_LENGHT, then cast the
# result to a NumPy array of the configured dtype.
# NOTE(review): from here on `data` is an ndarray, no longer a DataFrame.
data = np.array(convert_to_series(data, SERIES_LENGHT), dtype=VALUES_DATATYPE)

# Shape of the resulting array.
print(data.shape)

(1769250, 37)


KeyboardInterrupt: 

## Naprawa wartości NaN

In [None]:
# Replace NaN with 0 (and +/-inf with the dtype's largest finite values),
# working on a copy of the array rather than modifying it in place.
data = np.nan_to_num(data, copy=True)

## Zapis danych do pliku

In [None]:
# Encode the dtype and the series length in the file name so that arrays
# produced with different settings do not overwrite each other.
file_name = "data_%s_series_%d" % (VALUES_DATATYPE, SERIES_LENGHT)

# np.save appends the ".npy" extension because the name does not have one.
output_path = "../resources/" + file_name
np.save(output_path, data)