# Pandas time series

Loading and optimizing data

In [1]:
import pandas as pd

df = pd.read_parquet('daily_weather.parquet')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27635763 entries, 0 to 24220
Data columns (total 14 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   station_id              category      
 1   city_name               category      
 2   date                    datetime64[us]
 3   season                  category      
 4   avg_temp_c              float64       
 5   min_temp_c              float64       
 6   max_temp_c              float64       
 7   precipitation_mm        float64       
 8   snow_depth_mm           float64       
 9   avg_wind_dir_deg        float64       
 10  avg_wind_speed_kmh      float64       
 11  peak_wind_gust_kmh      float64       
 12  avg_sea_level_pres_hpa  float64       
 13  sunshine_total_min      float64       
dtypes: category(3), datetime64[us](1), float64(10)
memory usage: 2.6 GB


In [3]:
df.head(5)

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
0,41515,Asadabad,1957-07-01,Summer,27.0,21.1,35.6,0.0,,,,,,
1,41515,Asadabad,1957-07-02,Summer,22.8,18.9,32.2,0.0,,,,,,
2,41515,Asadabad,1957-07-03,Summer,24.3,16.7,35.6,1.0,,,,,,
3,41515,Asadabad,1957-07-04,Summer,26.6,16.1,37.8,4.1,,,,,,
4,41515,Asadabad,1957-07-05,Summer,30.8,20.0,41.7,0.0,,,,,,


In [4]:
# Conversion of datatypes
float_cols = df.select_dtypes(include=['float']).columns
df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float') # Optimize floats

In [5]:
df.dtypes # Rest of the columns are optimized, category is perfect for city_name, station_id and the season, because these column values are repeating

station_id                      category
city_name                       category
date                      datetime64[us]
season                          category
avg_temp_c                       float32
min_temp_c                       float32
max_temp_c                       float32
precipitation_mm                 float32
snow_depth_mm                    float32
avg_wind_dir_deg                 float32
avg_wind_speed_kmh               float32
peak_wind_gust_kmh               float32
avg_sea_level_pres_hpa           float32
sunshine_total_min               float32
dtype: object

konwersja danych do DataTime type, za pomoca **pd.to_datetime(df['kolumna'], format='%d.%m.%Y')**, dziala to tylko na stringu. Przy podaniu jawnego formatu, jest szybsze oraz mozemy podawac rozne formaty daty

In [6]:
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y') # Tutaj akurat nie zadziala bo kolumna date juz jest formatu daty

In [7]:
df

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
0,41515,Asadabad,1957-07-01,Summer,27.000000,21.100000,35.599998,0.0,,,,,,
1,41515,Asadabad,1957-07-02,Summer,22.799999,18.900000,32.200001,0.0,,,,,,
2,41515,Asadabad,1957-07-03,Summer,24.299999,16.700001,35.599998,1.0,,,,,,
3,41515,Asadabad,1957-07-04,Summer,26.600000,16.100000,37.799999,4.1,,,,,,
4,41515,Asadabad,1957-07-05,Summer,30.799999,20.000000,41.700001,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24216,67975,Masvingo,2023-09-01,Spring,19.500000,9.600000,28.400000,,,180.0,4.6,,,
24217,67975,Masvingo,2023-09-02,Spring,21.299999,10.500000,31.400000,,,146.0,6.3,,,
24218,67975,Masvingo,2023-09-03,Spring,22.100000,13.000000,31.500000,,,147.0,8.2,,,
24219,67975,Masvingo,2023-09-04,Spring,21.500000,13.100000,29.700001,,,155.0,10.2,,,


Dostep do elementow daty, poprzez **df['kolumna_z_formatem_daty'].dt.x**, gdzie mozemy uzyc, jako x:
-   year # rok
-   month # miesiac
-   day # dzien
-   weekday # dzien tygodnia 
-   date # tylko data bez godziny
-   floor('D') # zaokraglenie daty w dol

Czestotliwosci do floor()
![image.png](attachment:image.png)

In [8]:
df['date'].dt.year.head(5)

0    1957
1    1957
2    1957
3    1957
4    1957
Name: date, dtype: int32

In [9]:
df['date'].dt.month.head()

0    7
1    7
2    7
3    7
4    7
Name: date, dtype: int32

In [10]:
df['date'].dt.month.unique()
months = {1: 'Styczen',
          2: 'Luty',
          3: "Marzec",
          4: 'Kwiecien',
          5: 'Maj',
          6: 'Czerwiec',
          7: 'Lipiec',
          8: 'Sierpien',
          9: 'Wrzesien',
          10: 'Pazdziernik',
          11: 'Listopad',
          12: 'Grudzien'}
df['date'].dt.month.map(months).head()



0    Lipiec
1    Lipiec
2    Lipiec
3    Lipiec
4    Lipiec
Name: date, dtype: object

In [11]:
df['date'].dt.day

0        1
1        2
2        3
3        4
4        5
        ..
24216    1
24217    2
24218    3
24219    4
24220    5
Name: date, Length: 27635763, dtype: int32

In [12]:
days = {0: 'Poniedzialek',
        1: 'Wtorek',
        2: 'Sroda',
        3: 'Czwartek',
        4: 'Piatek',
        5: 'Sobota',
        6: 'Niedziela'}

df['date'].dt.weekday.map(days).head()

0    Poniedzialek
1          Wtorek
2           Sroda
3        Czwartek
4          Piatek
Name: date, dtype: object

In [13]:
df['date'].dt.date.head()

0    1957-07-01
1    1957-07-02
2    1957-07-03
3    1957-07-04
4    1957-07-05
Name: date, dtype: object

In [14]:
df['date'].dt.floor('D') 

0       1957-07-01
1       1957-07-02
2       1957-07-03
3       1957-07-04
4       1957-07-05
           ...    
24216   2023-09-01
24217   2023-09-02
24218   2023-09-03
24219   2023-09-04
24220   2023-09-05
Name: date, Length: 27635763, dtype: datetime64[us]

Filtrowanie po dacie

Pierwszy sposob to uzycie nawiasow klamrowych w sposob **df[df['kolumna_z_data'] {wartosc logiczna}]**

In [15]:
df[df['date'] >= '2020-01-01'].head()

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
8309,41515,Asadabad,2020-01-01,Winter,5.2,,,,,,,,,
8310,41515,Asadabad,2020-01-02,Winter,2.7,,,18.0,,,,,,
8311,41515,Asadabad,2020-01-03,Winter,1.6,,,130.0,,,,,,
8312,41515,Asadabad,2020-01-04,Winter,3.8,,,57.900002,,,,,,
8313,41515,Asadabad,2020-01-05,Winter,3.2,,,0.0,,,,,,


In [16]:
df[(df['date'] >= '1960-01-01') & (df['date'] < '1970-01-01')].head()

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
549,41515,Asadabad,1961-02-23,Winter,6.7,1.7,15.6,0.0,,,,,,
550,41515,Asadabad,1961-03-25,Spring,10.0,4.4,19.4,0.3,,,,,,
551,41515,Asadabad,1961-03-27,Spring,13.6,6.1,22.799999,0.0,,,,,,
552,41515,Asadabad,1961-04-19,Spring,12.8,10.0,16.1,7.1,,,,,,
553,41515,Asadabad,1961-04-20,Spring,12.5,8.3,17.200001,6.1,,,,,,


Drugi sposob to uzycie query

In [17]:
df.query("date >= '2021-01-01'")

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
8602,41515,Asadabad,2021-01-01,Winter,5.300000,,,,,,,,,
8603,41515,Asadabad,2021-01-02,Winter,5.600000,1.6,9.700000,0.0,,81.0,5.3,,1026.400024,
8604,41515,Asadabad,2021-01-03,Winter,5.300000,1.7,9.700000,0.0,,69.0,3.8,,1023.299988,
8605,41515,Asadabad,2021-01-04,Winter,4.500000,1.5,7.500000,0.0,,60.0,1.3,,1024.300049,
8606,41515,Asadabad,2021-01-05,Winter,4.900000,1.7,7.500000,0.0,,75.0,2.0,,1020.099976,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24216,67975,Masvingo,2023-09-01,Spring,19.500000,9.6,28.400000,,,180.0,4.6,,,
24217,67975,Masvingo,2023-09-02,Spring,21.299999,10.5,31.400000,,,146.0,6.3,,,
24218,67975,Masvingo,2023-09-03,Spring,22.100000,13.0,31.500000,,,147.0,8.2,,,
24219,67975,Masvingo,2023-09-04,Spring,21.500000,13.1,29.700001,,,155.0,10.2,,,


In [18]:
df.query("date >= '1980-01-01' and date < '1990-01-01'")

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
1081,41515,Asadabad,1980-01-01,Winter,4.400000,1.0,8.000000,2.0,,,,,,
1082,41515,Asadabad,1980-01-02,Winter,3.800000,1.0,,2.0,,,,,,
1083,41515,Asadabad,1980-01-03,Winter,2.500000,,9.000000,1.0,,,,,,
1084,41515,Asadabad,1980-01-04,Winter,5.000000,-1.0,9.000000,0.0,,,,,,
1085,41515,Asadabad,1980-01-08,Winter,8.300000,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13369,67975,Masvingo,1989-12-27,Summer,21.799999,16.9,28.100000,0.0,,,,,,
13370,67975,Masvingo,1989-12-28,Summer,25.500000,16.9,30.299999,0.0,,,,,,
13371,67975,Masvingo,1989-12-29,Summer,23.799999,17.0,30.700001,0.0,,,,,,
13372,67975,Masvingo,1989-12-30,Summer,25.299999,16.0,31.299999,0.0,,,,,,


Grupowanie i resampling

Resampling, to metoda zmiany czestotliwosci danych czasowych - z danych dziennych -- (mozemy przejsc) -> na dane miesieczne itp. i do tego mozemy zastosowac agregacje danych, jak funkcje min, max itp. Uzywamy tu funkcji **df.resample()**.

Resample potrzebuje indeksu typu datetime, dlatego trzeba uzywac *.set_index*

In [19]:
df_resampled_mean_max_temp = df.query("date >= '2020-01-01'").set_index('date').resample('M')['max_temp_c'].mean()

  df_resampled_mean_max_temp = df.query("date >= '2020-01-01'").set_index('date').resample('M')['max_temp_c'].mean()


In [20]:
df_resampled_mean_max_temp.head() # Per month

date
2020-01-31    15.601944
2020-02-29    17.612049
2020-03-31    20.038639
2020-04-30    22.773127
2020-05-31    25.639099
Freq: ME, Name: max_temp_c, dtype: float32

In [21]:
df_resampled = df.query("date >= '2020-01-01'").set_index('date').resample('YE')[['avg_temp_c', 'min_temp_c', 'max_temp_c']].mean()

In [22]:
df_resampled # Temperatures avg per year

Unnamed: 0_level_0,avg_temp_c,min_temp_c,max_temp_c
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-31,17.967144,12.607007,22.521397
2021-12-31,17.813904,12.943479,23.124237
2022-12-31,17.897158,12.931993,23.247152
2023-12-31,18.761549,13.733894,24.126553


Tworzenie zakresu dat

funkcja **pd.date_range(start='date', end='date', freq='x')** sluzy do tworzenia zakresu dat, od jakies daty do jakiejs konkretnej daty z czestotliwoscia x

In [23]:
pd.date_range(start='2022-01-01', end='2024-01-01', freq='ME') # Daty co miesiac

DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31'],
              dtype='datetime64[ns]', freq='ME')

In [24]:
pd.date_range(start='2025-01-01', periods=10, freq='D') # periods okresla liczbe generowanych dat, w tym przypadku 10

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06', '2025-01-07', '2025-01-08',
               '2025-01-09', '2025-01-10'],
              dtype='datetime64[ns]', freq='D')

Typowe zastosowanie to:
- uzupelnianie dat, ktore sa puste, Nan.
- symulacja
- tworzenie wykresow czasowych bez realnego zbioru danych (symulacje)

In [25]:
import numpy as np
import pandas as pd

num_of_data = 100

df_simulated = pd.DataFrame({
    'Data': pd.date_range(start='2025-01-01', periods=num_of_data, freq='D'),
    'Price': np.random.random(size=num_of_data) * 100
})

In [26]:
df_simulated

Unnamed: 0,Data,Price
0,2025-01-01,60.906778
1,2025-01-02,0.521983
2,2025-01-03,27.938428
3,2025-01-04,24.128658
4,2025-01-05,88.797515
...,...,...
95,2025-04-06,86.817298
96,2025-04-07,1.149967
97,2025-04-08,56.720737
98,2025-04-09,88.650941


Roznice czasowe 

In [27]:
data = {
    'uzytkownik': ['Anna', 'Bartek', 'Cezary', 'Daria', 'Ewa'],
    'start': [
        '2024-01-01 08:00:00',
        '2024-01-01 09:15:00',
        '2024-01-01 10:30:00',
        '2024-01-01 11:00:00',
        '2024-01-01 12:45:00',
    ],
    'koniec': [
        '2024-01-01 09:30:00',
        '2024-01-01 10:00:00',
        '2024-01-01 11:45:00',
        '2024-01-01 11:30:00',
        '2024-01-01 14:00:00',
    ]
}

df = pd.DataFrame(data)

In [28]:
df['start'] = pd.to_datetime(df['start'])
df['koniec'] = pd.to_datetime(df['koniec'])

In [29]:
df['czas (sekundy)'] = (df['koniec'] - df['start']).dt.total_seconds()# dt.total_seconds() zwraca ilosc sekund

In [30]:
df['czas'] = pd.to_timedelta(df['czas (sekundy)'], unit="seconds") # unit = okresla jaka jest jednostka w kolumnie, a sama funkcja
# Przeksztalca 

In [31]:
pd.Timedelta('2 hours') # Time delta, moze takze okreslac na podstawie stringow czas, tutaj rozpoznaje hours jako godziny itp

Timedelta('0 days 02:00:00')

In [32]:
pd.Timedelta('2 days')

Timedelta('2 days 00:00:00')

In [33]:
df['czas (min)'] = (df['koniec'] - df['start']).dt.total_seconds() / 60 

In [34]:
df

Unnamed: 0,uzytkownik,start,koniec,czas (sekundy),czas,czas (min)
0,Anna,2024-01-01 08:00:00,2024-01-01 09:30:00,5400.0,0 days 01:30:00,90.0
1,Bartek,2024-01-01 09:15:00,2024-01-01 10:00:00,2700.0,0 days 00:45:00,45.0
2,Cezary,2024-01-01 10:30:00,2024-01-01 11:45:00,4500.0,0 days 01:15:00,75.0
3,Daria,2024-01-01 11:00:00,2024-01-01 11:30:00,1800.0,0 days 00:30:00,30.0
4,Ewa,2024-01-01 12:45:00,2024-01-01 14:00:00,4500.0,0 days 01:15:00,75.0


In [35]:
df['czas (min)'].mean() # Sredni czas w minutach

63.0