# Spracovanie dát

Tento notebook slúži na kontrolu kvality surových dát (cyklotrasy v Bratislave a počasie), ich predspracovanie a spojenie do finálneho datasetu.

In [232]:
import pandas as pd

In [233]:
# Read both raw inputs: the Bratislava cycle-path counts and the weather data.
cycle_data_raw, weather_data_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/cyklotrasy_data_bratislava_raw.csv',
        '../Data/pocasie_data_raw.csv',
    )
)

## Kvalita

In [234]:
# Restrict the cycle data to the time range covered by the weather data.

# Parse the cycle timestamps as UTC, convert them to Europe/Bratislava local
# time and then drop the timezone so they compare directly with weather dates.
cycle_data_raw["datetime"] = (
    pd.to_datetime(cycle_data_raw["attributes.DATUM_A_CAS"], utc=True)
      .dt.tz_convert("Europe/Bratislava")
      .dt.tz_localize(None)
)

weather_data_raw["date"] = pd.to_datetime(weather_data_raw["date"])

# Half-open window [first weather date, last weather date + 1 day): it covers
# the whole final day exactly, without approximating "end of day" by a
# fractional-day offset that would drop timestamps in the day's last seconds.
weather_start = weather_data_raw["date"].min()
weather_end = weather_data_raw["date"].max() + pd.Timedelta(days=1)

in_weather_range = (
    (cycle_data_raw["datetime"] >= weather_start)
    & (cycle_data_raw["datetime"] < weather_end)
)

# .copy() detaches the filtered frame so later column assignments do not
# operate on a view of the unfiltered data.
cycle_data_raw = cycle_data_raw[in_weather_range].copy()

In [235]:
# Non-null counts per column: cycle data first, then weather data.
display(
    cycle_data_raw.count(),
    weather_data_raw.count(),
)

attributes.NAZOV              450301
attributes.ZEMEPISNA_SIRKA    450301
attributes.ZEMEPISNA_DLZKA    450301
attributes.SMER_DO            450301
attributes.SMER_Z             450301
attributes.DATUM_A_CAS        450301
attributes.POCET_DO           450301
attributes.POCET_Z            450301
attributes.ObjectId           450301
datetime                      450301
dtype: int64

date    1085
tavg    1085
tmin    1085
tmax    1085
prcp    1084
snow       0
wdir       0
wspd    1085
wpgt    1085
pres    1085
tsun     748
dtype: int64

In [236]:
# Summary statistics of the cycle data after restricting it to the weather date range.
cycle_data_raw.describe()

Unnamed: 0,attributes.ZEMEPISNA_SIRKA,attributes.ZEMEPISNA_DLZKA,attributes.POCET_DO,attributes.POCET_Z,attributes.ObjectId,datetime
count,450301.0,450301.0,450301.0,450301.0,450301.0,450301
mean,48.149043,17.092158,12.107348,7.35377,318598.803971,2024-08-17 03:39:18.950124544
min,48.110442,16.969,0.0,0.0,1.0,2023-01-01 00:00:00
25%,48.136692,17.073272,0.0,0.0,115496.0,2023-12-08 04:00:00
50%,48.140819,17.116058,2.0,1.0,233859.0,2024-09-01 06:00:00
75%,48.162144,17.127928,14.0,6.0,580864.0,2025-05-10 01:00:00
max,48.21065,17.138556,583.0,645.0,693456.0,2025-12-20 23:00:00
std,0.025676,0.052929,23.829372,19.27489,232033.702638,


In [237]:
# Summary statistics of the raw weather data.
weather_data_raw.describe()

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
count,1085,1085.0,1085.0,1085.0,1084.0,0.0,0.0,1085.0,1085.0,1085.0,748.0
mean,2024-06-26 00:00:00,12.521935,8.181659,16.956866,1.666697,,,12.416313,30.272258,1016.847926,267.161765
min,2023-01-01 00:00:00,-5.7,-9.1,-3.4,0.0,,,1.9,11.1,990.1,0.0
25%,2023-09-29 00:00:00,5.7,2.0,9.2,0.0,,,8.3,24.1,1011.7,80.0
50%,2024-06-26 00:00:00,12.3,8.2,16.6,0.0,,,11.4,29.6,1016.4,237.0
75%,2025-03-24 00:00:00,19.2,14.1,24.7,1.025,,,15.5,35.2,1021.8,419.25
max,2025-12-20 00:00:00,28.9,23.9,36.0,37.3,,,34.4,72.2,1043.3,792.0
std,,8.104544,7.340852,9.297833,4.167819,,,5.387349,8.84539,8.208956,211.41372


In [238]:
# Per-column number of missing values in each raw dataset.
cycle_null_counts = cycle_data_raw.isnull().sum()
weather_null_counts = weather_data_raw.isnull().sum()

print('Null values in cycle path data:', cycle_null_counts)
print('Null values in weather data:', weather_null_counts)

Null values in cycle path data: attributes.NAZOV              0
attributes.ZEMEPISNA_SIRKA    0
attributes.ZEMEPISNA_DLZKA    0
attributes.SMER_DO            0
attributes.SMER_Z             0
attributes.DATUM_A_CAS        0
attributes.POCET_DO           0
attributes.POCET_Z            0
attributes.ObjectId           0
datetime                      0
dtype: int64
Null values in weather data: date       0
tavg       0
tmin       0
tmax       0
prcp       1
snow    1085
wdir    1085
wspd       0
wpgt       0
pres       0
tsun     337
dtype: int64


In [239]:
# Flag rows whose key repeats an earlier row: ObjectId for the cycle data,
# the date for the weather data.
# NOTE(review): ObjectId looks like a per-row identifier, so this check may not
# catch repeated (counter, timestamp) measurements — confirm whether such a
# check is needed as well.
cycle_id_repeats = cycle_data_raw.duplicated(subset=["attributes.ObjectId"])
weather_date_repeats = weather_data_raw.duplicated(subset=["date"])

dupes_cycle = cycle_id_repeats.sum()
dupes_weather = weather_date_repeats.sum()

print("Duplicated cycle path rows:", dupes_cycle)
print("Duplicated weather rows:", dupes_weather)

Duplicated cycle path rows: 0
Duplicated weather rows: 0


## Predspracovanie

### Cyklotrasy

In [240]:
# Build the working cycle frame: discard ObjectId and the original timestamp
# column (the derived 'datetime' column is kept instead), then strip the
# 'attributes.' prefix by renaming the remaining columns to short lowercase names.
cycle_data = (
    cycle_data_raw
    .drop(columns=['attributes.ObjectId', 'attributes.DATUM_A_CAS'])
    .rename(columns={
        'attributes.NAZOV': 'nazov',
        'attributes.ZEMEPISNA_SIRKA': 'zemepisna_sirka',
        'attributes.ZEMEPISNA_DLZKA': 'zemepisna_dlzka',
        'attributes.SMER_DO': 'smer_do',
        'attributes.SMER_Z': 'smer_z',
        'attributes.POCET_DO': 'pocet_do',
        'attributes.POCET_Z': 'pocet_z',
    })
)

In [241]:
def season_from_month(m):
    """Return the season name for a month number.

    Args:
        m: Month number; 12, 1, 2 map to "winter", 3-5 to "spring",
            6-8 to "summer".

    Returns:
        One of "winter", "spring", "summer" or "autumn". Any value not listed
        above (including 9, 10, 11) yields "autumn".
    """
    season_months = (
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
    )
    for season, months in season_months:
        if m in months:
            return season
    # Fallback for every remaining value.
    return "autumn"

In [242]:
# Derive the analysis columns. Every time-based feature is computed from
# cycle_data's own "datetime" column, so nothing depends on index alignment
# with a different frame.
timestamps = cycle_data["datetime"]

# Total passages in both directions.
cycle_data["pocet_total"] = cycle_data["pocet_do"] + cycle_data["pocet_z"]
# Timestamp truncated to midnight.
cycle_data["date"] = timestamps.dt.normalize()
# Monday = 0 ... Sunday = 6, so 5 and 6 mark the weekend.
cycle_data["weekday"] = timestamps.dt.weekday
cycle_data["is_weekend"] = cycle_data["weekday"].isin([5, 6]).astype(int)
cycle_data["month"] = timestamps.dt.month
cycle_data["year"] = timestamps.dt.year
cycle_data["season"] = cycle_data["month"].apply(season_from_month)

### Počasie

In [243]:
# Drop the 'snow' and 'wdir' columns and replace missing precipitation
# values with 0. Other columns are passed through unchanged.
weather_data = (
    weather_data_raw
    .drop(columns=['snow', 'wdir'])
    .assign(prcp=lambda frame: frame['prcp'].fillna(0))
)

## Spojenie datasetov

In [244]:
# Attach the weather values to every cycle row by joining on "date" (in the
# cycle data this is the timestamp truncated to midnight).
# validate='many_to_one' makes the merge raise if a date occurs more than once
# in the weather data, instead of silently duplicating cycle rows.
data = cycle_data.merge(
    weather_data,
    on='date',
    how='left',
    validate='many_to_one',
)

# Sanity checks: resulting shape, and the number of rows left without a
# temperature value after the left join.
print(data.shape)
print(data["tavg"].isna().sum())

(450301, 23)
0


In [245]:
# Summary statistics of the merged cycle + weather dataset.
data.describe()

Unnamed: 0,zemepisna_sirka,zemepisna_dlzka,pocet_do,pocet_z,datetime,pocet_total,date,weekday,is_weekend,month,year,tavg,tmin,tmax,prcp,wspd,wpgt,pres,tsun
count,450301.0,450301.0,450301.0,450301.0,450301,450301.0,450301,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,450301.0,333433.0
mean,48.149043,17.092158,12.107348,7.35377,2024-08-17 03:39:18.950124544,19.461118,2024-08-16 16:09:14.381180160,3.000486,0.285673,6.635426,2024.117162,12.863284,8.478058,17.350617,1.659009,12.37776,30.165796,1016.761736,270.738214
min,48.110442,16.969,0.0,0.0,2023-01-01 00:00:00,0.0,2023-01-01 00:00:00,0.0,0.0,1.0,2023.0,-5.7,-9.1,-3.4,0.0,1.9,11.1,990.1,0.0
25%,48.136692,17.073272,0.0,0.0,2023-12-08 04:00:00,0.0,2023-12-08 00:00:00,1.0,0.0,4.0,2023.0,6.1,2.5,10.1,0.0,8.3,24.1,1011.7,82.0
50%,48.140819,17.116058,2.0,1.0,2024-09-01 06:00:00,4.0,2024-09-01 00:00:00,3.0,0.0,7.0,2024.0,12.6,8.8,17.3,0.0,11.5,29.6,1016.2,242.0
75%,48.162144,17.127928,14.0,6.0,2025-05-10 01:00:00,23.0,2025-05-10 00:00:00,5.0,1.0,10.0,2025.0,19.4,14.3,24.9,0.9,15.5,35.2,1021.7,423.0
max,48.21065,17.138556,583.0,645.0,2025-12-20 23:00:00,852.0,2025-12-20 00:00:00,6.0,1.0,12.0,2025.0,28.9,23.9,36.0,37.3,34.4,72.2,1043.3,792.0
std,0.025676,0.052929,23.829372,19.27489,,38.803733,,1.999122,0.451735,3.337959,0.802385,8.010492,7.251459,9.210947,4.188012,5.294897,8.692311,7.978726,212.056414


In [246]:
# Per-column count of missing values in the merged dataset.
data.isnull().sum()


nazov                   0
zemepisna_sirka         0
zemepisna_dlzka         0
smer_do                 0
smer_z                  0
pocet_do                0
pocet_z                 0
datetime                0
pocet_total             0
date                    0
weekday                 0
is_weekend              0
month                   0
year                    0
season                  0
tavg                    0
tmin                    0
tmax                    0
prcp                    0
wspd                    0
wpgt                    0
pres                    0
tsun               116868
dtype: int64

In [247]:
# Save the final merged dataset as CSV, without writing the row index.
data.to_csv('../Data/final_data.csv', index=False)