### ▶️ Now lets import some python-libraries

In [2]:
import pandas as pd
import datetime
from IPython.display import display, HTML

### ▶️ Reading the prepared data from Google-Drive

#### This takes about 50 !! seconds to load... 
---

- Dataset includes a time period from 20-01-2014 till 20-01-2024
- It contents measurements for "Mittlere Temperatur", "Sonnenscheindauer" and "Windgeschwindigkeit"
- To made this a real world project with big data, it includes ALL german weather stations
- In summary the .csv-file contains over 3 Million datapoints and has a size of nearly 200 MegaBytes

In [3]:
# Import the .csv-file with the weather datasets
df = pd.read_csv('https://drive.usercontent.google.com/download?id=11DYEiMTRVhMySsH5r3cDfCWBU2mqlNqO&export=download&authuser=0&confirm=t&uuid=1aa8951f-3209-4a18-986c-229dfc0f29e2&at=APZUnTVUwg__SVFWwQUEz9ft2zsy:1705864211167', index_col=False)

df.head()

Unnamed: 0,Zeitstempel,Wert,Kategorie,Station
0,2014-01-20,-1.2,Mittlere Temperatur,Doberlug-Kirchhain
1,2014-01-21,-2.0,Mittlere Temperatur,Doberlug-Kirchhain
2,2014-01-22,-4.4,Mittlere Temperatur,Doberlug-Kirchhain
3,2014-01-23,-4.5,Mittlere Temperatur,Doberlug-Kirchhain
4,2014-01-24,-5.3,Mittlere Temperatur,Doberlug-Kirchhain


### ▶️ We have to convert our datatypes first

In [4]:

# Convert "Zeitstempel" to dateTime
df['Zeitstempel'] = pd.to_datetime(df.Zeitstempel)

# Convert "Station" to String
df['Station'] = df['Station'].astype("string")

# Convert "Produkt_Titel" to String
df['Kategorie'] = df['Kategorie'].astype("string")


### Some filters

In [None]:
# Filter by single day
#df_filtered = df1[df1['Zeitstempel'].dt.strftime('%Y-%m-%d') == '2020-01-01']

# Filter by single month
#df_filtered = df1[df1['Zeitstempel'].dt.strftime('%Y-%m') == '2020-01']

# Filter by single month
#df_filtered = df.loc[(df['Zeitstempel'].dt.strftime('%Y') == '2022') & (['Station'] == "Alsfeld")]

#df_filtered = df.loc[((df['Kategorie'] == "Mittlere Temperatur") & (df['Station'] == "Doberlug-Kirchhain") & (df['Zeitstempel'].dt.strftime('%Y') == '2022'))]

#df_filtered = df.loc[((df['Kategorie'] == "Sonnenscheindauer") & (df['Station'] == "Doberlug-Kirchhain") & (df['Zeitstempel'].dt.strftime('%Y') == '2017'))]


### ▶️ Now we can start to filter our data

In [5]:
pd.set_option('display.max_colwidth', None)

def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions
    """
    output = ""
    combined = dict(zip(captions, dfs))
    for caption, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
        output += "\xa0\xa0\xa0\xa0\xa0"
    display(HTML(output))

# We take the data for "Sonnenscheindauer"
df_sunshine = df.loc[(df['Kategorie'] == "Sonnenscheindauer")].reset_index(drop=True)
df_sunshine_2 = df_sunshine.groupby('Station')

# We group our pre-chosen "Sonnenscheindauer" data by station, then we aggregate Minimum, Maximum, Mean and Variance

# Sub-set is finally sorted by Minimum in a ascending manner
df_sunshine_min = df_sunshine.groupby('Station')[['Wert']].agg(['min']).sort_values(by=('Wert','min'),ascending=True).reset_index()

# Sub-set is finally sorted by Maximum in a ascending manner
df_sunshine_max = df_sunshine.groupby('Station')[['Wert']].agg(['max']).sort_values(by=('Wert','max'),ascending=False).reset_index()

# Sub-result is finally sorted by Maximum in a ascending manner
df_sunshine_var = df_sunshine.groupby('Station')[['Wert']].agg(['var']).sort_values(by=('Wert','var'),ascending=True).reset_index()

# Sub-result is finally sorted by Mean in a ascending manner
df_sunshine_mean = df_sunshine.groupby('Station')[['Wert']].agg(['mean']).sort_values(by=('Wert','mean'),ascending=False).reset_index()
#df_sunshine_mean = df_sunshine_mean.loc[(df['Wert'] >= 2.0)]

# We resolve the mulitindex to a single index, and give the columns a nicer name
df_sunshine_max.columns = ['Station', 'Maximalwerte']
df_sunshine_min.columns = ['Station', 'Minimalwerte']
df_sunshine_mean.columns = ['Station', 'Mittelwerte']
df_sunshine_var.columns = ['Station', 'Varianzen']

display_side_by_side([df_sunshine_min[0:10], df_sunshine_max[0:10], df_sunshine_mean[0:10], df_sunshine_var[0:10]], ['Minima absteigend', 'Maxima absteigend', 'Mittelwerte absteigend', 'Varianzen aufsteigend'])

Unnamed: 0,Station,Minimalwerte
0,Aachen-Orsbach,0.0
1,Neustadt am Kulm-Filchendorf,0.0
2,Neuruppin,0.0
3,Neunkirchen-Seelscheid-Krawinkel,0.0
4,Neuhütten/Spessart,0.0
5,Neuhaus am Rennweg,0.0
6,"Neuenahr, Bad-Ahrweiler",0.0
7,Neubulach-Oberhaugstett,0.0
8,Neu-Ulrichstein,0.0
9,Naumburg/Saale-Kreipitzsch,0.0

Unnamed: 0,Station,Maximalwerte
0,Leuchtturm Kiel,16.8
1,UFS TW Ems,16.7
2,Schleswig,16.7
3,Sankt Peter-Ording,16.7
4,Rostock-Warnemünde,16.7
5,Arkona,16.7
6,Elpersbüttel,16.6
7,Norderney,16.6
8,Karlshagen,16.6
9,Kiel-Holtenau,16.6

Unnamed: 0,Station,Mittelwerte
0,Ulm,5.896875
1,Kaufbeuren-Oberbeuren,5.763279
2,Greifswalder Oie,5.709145
3,Rheinfelden,5.620839
4,Leutkirch-Herlazhofen,5.592739
5,Arkona,5.582624
6,Stuttgart-Echterdingen,5.536237
7,Hohenpeißenberg,5.466153
8,Konstanz,5.464041
9,Renningen-Ihinger Hof,5.461595

Unnamed: 0,Station,Varianzen
0,Twistetal-Mühlhausen,14.700251
1,Lingen,15.623848
2,Arnsberg-Neheim,16.083763
3,Halle-Kröllwitz,16.143441
4,Glücksburg-Meierwik,16.283662
5,Oberstdorf,16.392853
6,Wittmundhafen,16.685039
7,Brilon-Thülen,16.703138
8,Garmisch-Partenkirchen,16.723962
9,Meppen,16.729907


### Ermitteln von Datensätzen der letzten 3 Jahre mit dem Temperatur-Mittelwert

In [46]:
df_mean = df.loc[((df['Kategorie'] == "Mittlere Temperatur") & (df['Zeitstempel'].dt.strftime('%Y') >= '2021'))]

print(df_mean.info())

<class 'pandas.core.frame.DataFrame'>
Index: 550988 entries, 2538 to 3634896
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Zeitstempel  550988 non-null  datetime64[ns]
 1   Wert         550988 non-null  float64       
 2   Kategorie    550988 non-null  string        
 3   Station      550988 non-null  string        
dtypes: datetime64[ns](1), float64(1), string(2)
memory usage: 21.0 MB
None


In [47]:
df_mean = df_mean.set_index('Zeitstempel')

Ermitteln der Varianz der mittleren Temperatur pro Station im Monat

In [72]:
print(df_mean)

#rolling = df_mean.groupby(pd.Grouper(freq="M"))[['Wert']].rolling("30D").mean()
rolling = df_mean.groupby('Station', as_index=False)[['Wert']].rolling(30).std()

print(rolling)

             Wert            Kategorie                  Station
Zeitstempel                                                    
2021-01-01    1.7  Mittlere Temperatur       Doberlug-Kirchhain
2021-01-02    0.0  Mittlere Temperatur       Doberlug-Kirchhain
2021-01-03    0.4  Mittlere Temperatur       Doberlug-Kirchhain
2021-01-04    0.4  Mittlere Temperatur       Doberlug-Kirchhain
2021-01-05    0.4  Mittlere Temperatur       Doberlug-Kirchhain
...           ...                  ...                      ...
2024-01-16   -2.2  Mittlere Temperatur  Dippoldiswalde-Reinberg
2024-01-17   -2.1  Mittlere Temperatur  Dippoldiswalde-Reinberg
2024-01-18   -2.4  Mittlere Temperatur  Dippoldiswalde-Reinberg
2024-01-19   -3.7  Mittlere Temperatur  Dippoldiswalde-Reinberg
2024-01-20   -3.2  Mittlere Temperatur  Dippoldiswalde-Reinberg

[550988 rows x 3 columns]


                             Wert
Zeitstempel Zeitstempel          
2021-01-31  2021-01-01   1.700000
            2021-01-01   3.000000
            2021-01-01   2.566667
            2021-01-01   2.650000
            2021-01-01   2.560000
...                           ...
2024-01-31  2024-01-20  -0.278174
            2024-01-20  -0.278085
            2024-01-20  -0.278077
            2024-01-20  -0.278659
            2024-01-20  -0.278955

[550988 rows x 1 columns]
