In [2]:
import numpy as np
import pandas as pd

### Air pollution data
Source: https://powietrze.gios.gov.pl/pjp/archives
Archive: 'Wyniki pomiarów z 2021 roku DANE WSTĘPNIE ZWERYFIKOWANE' (2021 measurement results, preliminarily verified data)

In [3]:
# Load the semicolon-separated pollution export, then make sure the
# "date-time" column holds timestamps rather than strings.
pollution = pd.read_csv(
    "../data/raw/2021_DANE_1H.csv",
    sep=";",
    parse_dates=True,
)
pollution["date-time"] = pollution["date-time"].pipe(pd.to_datetime)

In [4]:
# Display the loaded pollution frame (the cell's last expression is rendered)
pollution

Unnamed: 0,date-time,MzWarAlNiepo-PM25-1g
0,2021-01-01 01:00:00,26.68
1,2021-01-01 02:00:00,35.09
2,2021-01-01 03:00:00,43.55
3,2021-01-01 04:00:00,46.44
4,2021-01-01 05:00:00,41.24
...,...,...
8755,2021-12-31 20:00:00,9.03
8756,2021-12-31 21:00:00,8.88
8757,2021-12-31 22:00:00,8.10
8758,2021-12-31 23:00:00,8.65


In [5]:
# Converting times from CET to GMT: move every timestamp one hour back.
# Caveat: the column is overwritten, so re-running this cell subtracts another hour.
cet_to_gmt = pd.DateOffset(hours=1)
pollution["date-time"] = pollution["date-time"] - cet_to_gmt
pollution

Unnamed: 0,date-time,MzWarAlNiepo-PM25-1g
0,2021-01-01 00:00:00,26.68
1,2021-01-01 01:00:00,35.09
2,2021-01-01 02:00:00,43.55
3,2021-01-01 03:00:00,46.44
4,2021-01-01 04:00:00,41.24
...,...,...
8755,2021-12-31 19:00:00,9.03
8756,2021-12-31 20:00:00,8.88
8757,2021-12-31 21:00:00,8.10
8758,2021-12-31 22:00:00,8.65


### Weather data
https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/terminowe/synop/2021/
File: 2021_375_s.zip	2022-01-28 09:15	360K

In [6]:
# Column names for the weather file live in a separate text file, one per line;
# trailing whitespace (including the newline) is stripped from each.
with open("../data/raw/column_names.txt", encoding="utf8") as names_file:
    names = [line.rstrip() for line in names_file]

In [7]:
# Read the raw weather file. It has no header row, so column names come from `names`.
weather_raw = pd.read_csv(
    "../data/raw/s_t_375_2021.csv",
    encoding="iso8859_2",
    header=None,
    names=names,
)

# Assemble the timestamp explicitly from its year/month/day/hour columns.
# Combining columns through read_csv(parse_dates={...}) is deprecated in
# pandas 2.2, so the documented replacement (pd.to_datetime afterwards) is used.
date_parts = ["Rok", "Miesiąc", "Dzień", "Godzina"]
observation_time = pd.to_datetime(
    weather_raw[date_parts].set_axis(["year", "month", "day", "hour"], axis=1)
)

# Same layout as before: the component columns are dropped and the combined
# "date-time" column is placed first.
weather_full = weather_raw.drop(columns=date_parts)
weather_full.insert(0, "date-time", observation_time)

In [8]:
# Display the full weather frame with all columns read from the raw file
weather_full

Unnamed: 0,date-time,Kod stacji,Nazwa stacji,Wysokość podstawy chmur CL CM szyfrowana [kod],Status pomiaru HPOD,Wysokość podstawy niższej [m],Status pomiaru HPON,Wysokość podstawy wyższej [m],Status pomiaru HPOW,Wysokość podstawy tekstowy [opis],...,Wysokość świeżo spadłego śniegu [cm],Status pomiaru HSS,Wysokość śniegu na poletku [cm],Status pomiaru GRSN,Gatunek śniegu [kod],Ukształtowanie pokrywy [kod],Wysokość próbki [cm],Status pomiaru HPRO,Ciężar próbki [g],Status pomiaru CIPR
0,2021-01-01 00:00:00,352200375,WARSZAWA-OKĘCIE,4,,300,,0,8.0,300P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
1,2021-01-01 01:00:00,352200375,WARSZAWA-OKĘCIE,4,,300,,0,8.0,300P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
2,2021-01-01 02:00:00,352200375,WARSZAWA-OKĘCIE,4,,330,,0,8.0,330P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
3,2021-01-01 03:00:00,352200375,WARSZAWA-OKĘCIE,4,,330,,0,8.0,330P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
4,2021-01-01 04:00:00,352200375,WARSZAWA-OKĘCIE,4,,300,,0,8.0,300P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2021-12-31 19:00:00,352200375,WARSZAWA-OKĘCIE,4,,570,,0,8.0,570,...,0,8.0,0,8.0,,,0,8.0,0,8.0
8756,2021-12-31 20:00:00,352200375,WARSZAWA-OKĘCIE,4,,450,,0,8.0,450P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
8757,2021-12-31 21:00:00,352200375,WARSZAWA-OKĘCIE,3,,270,,450,,270P/450P,...,0,8.0,0,8.0,,,0,8.0,0,8.0
8758,2021-12-31 22:00:00,352200375,WARSZAWA-OKĘCIE,3,,240,,0,8.0,240P,...,0,8.0,0,8.0,,,0,8.0,0,8.0


In [9]:
# Keep only the timestamp plus the wind, temperature and pressure readings
columns_of_interest = [
    "date-time",
    "Kierunek wiatru  [°]",
    "Prędkość wiatru  [m/s]",
    "Temperatura powietrza [°C]",
    "Ciśnienie na poziomie stacji [hPa]",
]
weather = weather_full[columns_of_interest]
weather

Unnamed: 0,date-time,Kierunek wiatru [°],Prędkość wiatru [m/s],Temperatura powietrza [°C],Ciśnienie na poziomie stacji [hPa]
0,2021-01-01 00:00:00,260,1,0.9,996.4
1,2021-01-01 01:00:00,277,2,1.0,996.5
2,2021-01-01 02:00:00,92,1,1.0,996.6
3,2021-01-01 03:00:00,259,2,0.9,996.5
4,2021-01-01 04:00:00,270,2,0.8,996.6
...,...,...,...,...,...
8755,2021-12-31 19:00:00,259,6,11.2,996.6
8756,2021-12-31 20:00:00,259,7,10.6,996.6
8757,2021-12-31 21:00:00,257,6,10.5,996.7
8758,2021-12-31 22:00:00,260,6,10.4,996.9


In [10]:
# Wind: decompose speed and direction (degrees) into cosine ("north") and
# sine ("east") components, rounded to 6 decimal places.
wind_speed = weather["Prędkość wiatru  [m/s]"]
wind_direction_rad = np.radians(weather["Kierunek wiatru  [°]"])
wind_north = (wind_speed * np.cos(wind_direction_rad)).round(6)
wind_east = (wind_speed * np.sin(wind_direction_rad)).round(6)

# Date: calendar features taken from the timestamp
observed_at = weather["date-time"].dt
month, weekday, hour = observed_at.month, observed_at.weekday, observed_at.hour

In [11]:
# Adding processed wind and date columns, dropping the raw wind speed/direction.
# Built with assign() and an explicit column order instead of in-place insert():
# insert() raises "already exists" when the cell is re-run, and it mutates a
# frame that was selected out of weather_full. The resulting columns and their
# order are the same as before.
weather = weather.assign(
    **{
        "month": month,
        "weekday": weekday,
        "hour": hour,
        "wind-east": wind_east,
        "wind-north": wind_north,
    }
)[
    [
        "month",
        "weekday",
        "hour",
        "date-time",
        "wind-east",
        "wind-north",
        "Temperatura powietrza [°C]",
        "Ciśnienie na poziomie stacji [hPa]",
    ]
]
weather

Unnamed: 0,month,weekday,hour,date-time,wind-east,wind-north,Temperatura powietrza [°C],Ciśnienie na poziomie stacji [hPa]
0,1,4,0,2021-01-01 00:00:00,-0.984808,-0.173648,0.9,996.4
1,1,4,1,2021-01-01 01:00:00,-1.985092,0.243739,1.0,996.5
2,1,4,2,2021-01-01 02:00:00,0.999391,-0.034899,1.0,996.6
3,1,4,3,2021-01-01 03:00:00,-1.963254,-0.381618,0.9,996.5
4,1,4,4,2021-01-01 04:00:00,-2.000000,-0.000000,0.8,996.6
...,...,...,...,...,...,...,...,...
8755,12,4,19,2021-12-31 19:00:00,-5.889763,-1.144854,11.2,996.6
8756,12,4,20,2021-12-31 20:00:00,-6.871390,-1.335663,10.6,996.6
8757,12,4,21,2021-12-31 21:00:00,-5.846220,-1.349706,10.5,996.7
8758,12,4,22,2021-12-31 22:00:00,-5.908847,-1.041889,10.4,996.9


In [12]:
# Join pollution with weather on the shared timestamp, shorten the long column
# names, order rows chronologically, and write the result without the timestamp.
short_names = {
    "MzWarAlNiepo-PM25-1g": "pm2_5",
    "Temperatura powietrza [°C]": "temperature",
    "Ciśnienie na poziomie stacji [hPa]": "pressure",
}
merged = pollution.merge(weather, how="inner", on="date-time")
merged = merged.rename(columns=short_names).sort_values("date-time")
merged.drop(columns=["date-time"]).to_csv("../data/processed/pollution_weather.csv", index=False)

### Target preparation

In [13]:
data = pd.read_csv("../data/processed/pollution_weather.csv", sep=",")
# The target starts as a plain copy of the PM2.5 readings, inserted at position 8
data.insert(8, "target", data["pm2_5"])
# Three independent copies of the dataset, one per target variant
target6, target12, target24 = (data.copy() for _ in range(3))
data

Unnamed: 0,pm2_5,month,weekday,hour,wind-east,wind-north,temperature,pressure,target
0,26.68,1,4,0,-0.984808,-0.173648,0.9,996.4,26.68
1,35.09,1,4,1,-1.985092,0.243739,1.0,996.5,35.09
2,43.55,1,4,2,0.999391,-0.034899,1.0,996.6,43.55
3,46.44,1,4,3,-1.963254,-0.381618,0.9,996.5,46.44
4,41.24,1,4,4,-2.000000,-0.000000,0.8,996.6,41.24
...,...,...,...,...,...,...,...,...,...
8755,9.03,12,4,19,-5.889763,-1.144854,11.2,996.6,9.03
8756,8.88,12,4,20,-6.871390,-1.335663,10.6,996.6,8.88
8757,8.10,12,4,21,-5.846220,-1.349706,10.5,996.7,8.10
8758,8.65,12,4,22,-5.908847,-1.041889,10.4,996.9,8.65


In [14]:
# Shifting target column by 6h, 12h, 24h
# The target is derived from pm2_5 (which it was copied from) rather than from
# itself, so re-running this cell does not shift the values a second time.
# NOTE(review): shift() moves by rows, not by time — this equals hours only if
# the rows are consecutive hourly records with no gaps; confirm for the merged data.
for frame, horizon in ((target6, 6), (target12, 12), (target24, 24)):
    frame["target"] = frame["pm2_5"].shift(-horizon)

In [15]:
# Saving data with targets to CSV files, dropping entries with NULL
target_files = {
    "../data/processed/target06.csv": target6,
    "../data/processed/target12.csv": target12,
    "../data/processed/target24.csv": target24,
}
for csv_path, frame in target_files.items():
    frame.dropna().to_csv(csv_path, index=False)