# Berlin Data

Time series for the Havel River inflow to the city (Konradshöhe, Messstellennummer 305) and the downstream station (Schleuse Spandau, Messstellennummer 320), DOC and TOC.

Two groundwater station are attached, only with quality, no DOC/TOC is measured here, but UV254 and other. The groundwater stations are not influenced by bank filtrate and represent near-natural conditions (for a city like Berlin).

In [5]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import statsmodels.tsa.seasonal as smt
from googletrans import Translator

# Define Paths

In [6]:
data_folder = os.path.join("..", "..", "data", "berlin")

raw_data_folder = os.path.join(data_folder, "raw_data")

ground_water_folder = os.path.join(raw_data_folder, "ground water")
surface_water_folder = os.path.join(raw_data_folder, "surface water")

# Load Data

## Ground Water

In [120]:
ts_gw_df = pd.read_csv(
    os.path.join(
        ground_water_folder, "time-series_ground-water_quality.csv"
    )
)

In [121]:
ts_gw_df

Unnamed: 0,Messstellennummer,Datum,Parameter,Einheit,Messwert
0,5130,2005-06-09,Temperatur (Luft),grd Celsius,17.00
1,5130,2005-06-09,pH-Wert (Feld),ohne Einheit,7.20
2,5130,2005-06-09,Temperatur (Wasser),grd C,11.70
3,5130,2005-06-09,Leitfähigkeit 25°C vor Ort,µS/cm,1372.70
4,5130,2005-06-09,Wasserst.(ROK) vor,m,2.50
...,...,...,...,...,...
7393,7285,2021-11-18,EDTA,µg/l,2.90
7394,7285,2021-11-18,NTA,µg/l,-1.00
7395,7285,2021-11-18,Summe Na+Cl,mg/l,92.40
7396,7285,2021-11-18,Trifluoressigsäure,µg/l,0.43


In [122]:
ts_gw_df.rename(
    columns={
        "Messstellennummer": "Station ID",
        "Datum": "Date",
        "Einheit": "Unit",
        "Messwert": "Value",
    },
    inplace=True,
)

### Inspect GW Dataset

In [37]:
translator = Translator()

In [38]:
parameters = ts_gw_df["Parameter"].unique()

In [39]:
parameters

array(['Temperatur (Luft)', 'pH-Wert (Feld)', 'Temperatur (Wasser)',
       'Leitfähigkeit 25°C vor Ort', 'Wasserst.(ROK) vor', 'Chlorid',
       'Fluorid', 'Hydrogenkarbonat', 'Nitrit (N)', 'Nitrat (N)',
       'Ortho-Phosphat (P)', 'Sulfid', 'Sulfat', 'Cyanide (ges.)',
       'Bromid', 'Nitrit', 'Nitrat', 'Ortho-Phosphat', 'Ammonium (N)',
       'Eisen-2', 'Eisen (ges.)', 'Kalium', 'Kalzium', 'Magnesium',
       'Natrium', 'Mangan', 'Ammonium', 'Leitfähigkeit /Lab. bei 25°C',
       'UV-Adsorption (254)', 'CSV (KMNO4)', 'Basenkap. bis 8.2',
       'Säure-Kap. bis 4.3', 'Kohlenstoff (organ.)', 'pH-Wert /Lab.',
       'Gesamthärte', 'Karbonathärte', 'AOX', 'Phenolindex (ges.)',
       'LHKW (Summe)', 'BTXE (Summe)', 'Arsen', 'Barium', 'Blei', 'Bor',
       'Cadmium', 'Chrom', 'Kupfer', 'Aluminium-gelöst', 'Nickel',
       'Quecksilber', 'Selen', 'Zink', 'Dichlormethan', 'Trichlormethan',
       'Tetrachlormethan', '1,2-Dichlorethan', 'cis-1,2-Dichloreth.',
       '1,1,1-Tri-Cl-Ethan', 

In [40]:
translator.translate('Aluminium', src="de", dest="en")

<googletrans.models.Translated at 0x320e461d0>

In [41]:
parameters_translated = [translator.translate(item, dest='en').text for item in parameters.tolist()]

In [42]:
"""Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
Dissolved oxygen
-Nitrate
-pH
Redox potential"""

'Cumulated rainfall\n-Environmental temperature\n-Water temperature\n-Conductivity\n-Flow river\nTurbidity\n-Absorbance 254 nm\n-Ammonium\nDissolved oxygen\n-Nitrate\n-pH\nRedox potential'

In [43]:
parameters_translated

['temperature (air)',
 'pH value (field)',
 'temperature (water)',
 'Conductivity 25°C on site',
 'Water level (ROK).',
 'Chloride',
 'fluoride',
 'Hydrogenkarbonat',
 'Nitrit (N)',
 'Nitrate (N)',
 'Ortho-Phosphat (P)',
 'Sulfide',
 'Sulfate',
 'Cyanide (ges.)',
 'Bromide',
 'Nitrite',
 'Nitrate',
 'Ortho-Phosphat',
 'Ammonium (N)',
 'Eisen-2',
 'iron (general)',
 'Potassium',
 'calcium',
 'Magnesium',
 'Sodium',
 'Mangan',
 'Ammonium',
 'Conductivity / Lab. at 25°C',
 'UV-Adsorption (254)',
 'CSV (KMNO4)',
 'base cap. until 8.2',
 'Acid cap. until 4.3',
 'Carbon (organic)',
 'pH-Wert /Lab.',
 'Total hardness',
 'Carbonate hardness',
 'AOX',
 'Phenolindex (ges.)',
 'LHKW (total)',
 'BTXE (total)',
 'Arsenic',
 'Barium',
 'Nappy',
 'Bor',
 'Cadmium',
 'Chrom',
 'copper',
 'Aluminum dissolved',
 'Nickel',
 'mercury',
 'Selenium',
 'Zink',
 'Dichlormethan',
 'Trichlormethan',
 'Tetrachlormethan',
 '1,2-Dichlorethan',
 'cis-1,2-Dichloreth.',
 '1,1,1-Tri-Cl-Ethan',
 'Trichlorethen',
 'Tetr

### Build Dataset per Station

In [123]:
variables = {
    'Temperatur (Luft)': 'Air Temperature',
    'Temperatur (Wasser)': 'Water Temperature',
    'UV-Adsorption (254)': 'Absorbance 254nm',
    'Leitfähigkeit 25°C vor Ort': 'Conductivity',
    'Ammonium (N)': 'Ammonium',
    'Nitrat (N)': 'Nitrate',
    'pH-Wert (Feld)': 'pH',
}

In [124]:
ground_df = ts_gw_df[ts_gw_df['Parameter'].isin(variables.keys())]

ground_df['Parameter'] = ground_df['Parameter'].map(variables)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [125]:
ground_df['Station ID'].unique()

array([5130, 7285])

In [127]:
ground_df['Date'] = pd.to_datetime(ground_df['Date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [128]:
stations_dict = {}
for station in ground_df['Station ID'].unique():
    station_df = ground_df[ground_df['Station ID'] == station]
    station_df = station_df.pivot_table(
        index=pd.Grouper('Date'),
        columns='Parameter',
        values='Value'
    )
    
    stations_dict[station] = station_df

### Analyze Stations

#### 5130 - Treptow-Köpenick

In [129]:
station_df = stations_dict[5130]

In [130]:
station_df.isna().sum() / station_df.shape[0]

Parameter
Absorbance 254nm     0.0
Air Temperature      0.0
Ammonium             0.0
Conductivity         0.0
Nitrate              0.0
Water Temperature    0.0
pH                   0.0
dtype: float64

In [131]:
# compute the frequency of the time series
station_df.index.to_series().diff().value_counts()

Date
158 days    2
182 days    2
175 days    2
192 days    2
166 days    1
147 days    1
169 days    1
196 days    1
134 days    1
239 days    1
173 days    1
191 days    1
207 days    1
189 days    1
198 days    1
127 days    1
168 days    1
231 days    1
203 days    1
130 days    1
227 days    1
143 days    1
224 days    1
Name: count, dtype: int64

In [133]:
# most of the time series have a frequency of 6 months

##### Time series

In [134]:
# plot the data
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 5130",
        labels={
            'Date': 'Date',
            column: column
        }
    )
    fig.show()

##### Boxplots

In [136]:
# boxplot of the data
for column in station_df.columns:
    
    fig = go.Figure()
    column_df = station_df[column]
    
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(
                y=column_df[column_df.index.year == year],
                name=year
            )
        )
    fig.update_layout(
        title=f"{column} at station 5130",
        xaxis_title="Year",
        yaxis_title=column
    )
    
    fig.show() 

#### 7285 - Steglitz-Zehlendorf

In [137]:
station_df = stations_dict[7285]

In [138]:
station_df.isna().sum() / station_df.shape[0]

Parameter
Absorbance 254nm     0.018519
Air Temperature      0.018519
Ammonium             0.018519
Conductivity         0.018519
Nitrate              0.018519
Water Temperature    0.000000
pH                   0.000000
dtype: float64

In [139]:
# compute the frequency of the time series
station_df.index.to_series().diff().value_counts()

Date
175 days    4
205 days    2
133 days    2
190 days    2
212 days    2
152 days    2
167 days    2
209 days    1
89 days     1
229 days    1
225 days    1
210 days    1
204 days    1
154 days    1
226 days    1
218 days    1
155 days    1
195 days    1
187 days    1
84 days     1
153 days    1
189 days    1
201 days    1
185 days    1
199 days    1
271 days    1
176 days    1
198 days    1
159 days    1
211 days    1
148 days    1
179 days    1
192 days    1
203 days    1
208 days    1
4 days      1
173 days    1
217 days    1
164 days    1
137 days    1
202 days    1
183 days    1
161 days    1
196 days    1
Name: count, dtype: int64

In [140]:
# most of the time series have a frequency of 6 months

##### Time series

In [141]:
# plot the data
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 7285",
        labels={
            'Date': 'Date',
            column: column
        }
    )
    fig.show()

##### Boxplots

In [142]:
# boxplot of the data
for column in station_df.columns:
    
    fig = go.Figure()
    column_df = station_df[column]
    
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(
                y=column_df[column_df.index.year == year],
                name=year
            )
        )
    fig.update_layout(
        title=f"{column} at station 7285",
        xaxis_title="Year",
        yaxis_title=column
    )
    
    fig.show() 

## Surface Water

In [50]:
ts_sw_df = pd.read_csv(
    os.path.join(
        surface_water_folder, "time-series_surface-water_quality.csv"
    )
)

In [51]:
flow_df = pd.read_csv(
    os.path.join(
        surface_water_folder, "time-series_surface-water_flow.csv"
    )
)

In [52]:
ts_sw_df

Unnamed: 0,Messstelle,Messstellennummer,Datum,Parameter,Entnahmetiefe [m],Messmethode,Vorzeichen,Wert,Einheit,Bestimmungsgrenze
0,Dämeritzsee - Seemitte,105,1986-03-31T22:00:00Z,Abfiltrierbare Stoffe,0.5,---,,9.40,mg/l,
1,Dämeritzsee - Seemitte,105,1986-03-31T22:00:00Z,Calcium,0.5,---,,89.00,mg/l,
2,Dämeritzsee - Seemitte,105,1986-03-31T22:00:00Z,Chlorid,0.5,---,,48.00,mg/l,
3,Dämeritzsee - Seemitte,105,1986-03-31T22:00:00Z,Eisen gesamt,0.5,---,,0.81,mg/l,
4,Dämeritzsee - Seemitte,105,1986-03-31T22:00:00Z,Kaliumpermanganatverbrauch als O2,0.5,---,,6.20,mg/l,
...,...,...,...,...,...,...,...,...,...,...
52405,Havel - Pichelsdorfer Gemünd,325,2024-02-26T09:20:00Z,Sulfat,0.5,DIN EN ISO 10304-1 (D20),,158.00,mg/l,1.0
52406,Havel - Pichelsdorfer Gemünd,325,2024-02-26T09:20:00Z,Lufttemperatur,0.5,DIN 38404 (C4),,8.00,°C,
52407,Havel - Pichelsdorfer Gemünd,325,2024-02-26T09:20:00Z,TOC (Organischer Kohlenstoff),0.5,DIN EN 1484 (H3),,10.00,mg/l,1.0
52408,Havel - Pichelsdorfer Gemünd,325,2024-02-26T09:20:00Z,Wassertemperatur,0.5,DIN 38404 (C4),,7.60,°C,


In [53]:
ts_sw_df.rename(
    columns={
        "Messstelle": "Station",
        "Messstellennummer": "Station ID",
        "Datum": "Date",
        "Einheit": "Unit",
        "Wert": "Value",
    },
    inplace=True,
)

ts_sw_df.drop(
    columns=[
        'Entnahmetiefe [m]',
        'Vorzeichen',
        'Bestimmungsgrenze',
        'Messmethode'
    ],
    inplace=True
)

### Inspect SW Dataset

In [54]:
translator = Translator()

In [55]:
parameters = ts_sw_df["Parameter"].unique()

In [56]:
parameters

array(['Abfiltrierbare Stoffe', 'Calcium', 'Chlorid', 'Eisen gesamt',
       'Kaliumpermanganatverbrauch als O2', 'Säurekapazitat pH 4,3',
       'Leitfähigkeit', 'Magnesium', 'Ammonium-Stickstoff',
       'Nitrit-Stickstoff', 'Nitrat-Stickstoff',
       'ortho-Phosphat-Phosphor', 'Sauerstoff-Gehalt', 'pH-Wert',
       'Gesamt-Phosphor (P) gesamt', 'Sulfat', 'Wassertemperatur',
       'CSB (Chem. Sauerstoffbedarf)', 'Mangan gesamt', 'Chlorophyll-a',
       'Coliforme B.', 'E.Coli', 'Phaeophytin', 'Phenole',
       'Glühverlust der abfiltrierbaren Stoffe',
       'Spektraler Absorptionskoeffizient (SAK) 254nm', 'Sichttiefe',
       'Lufttemperatur', 'DOC (Gelöster organischer Kohlenstoff)',
       'BSB1 (Biochem. Sauerstoffbedarf, 24h)',
       'AOX (Adsorbierbare organ. Halogenverbindungen)',
       'TOC (Organischer Kohlenstoff)', 'Silicium gelöst',
       'Gesamt-Stickstoff (N) gelöst', 'Gesamt-Stickstoff (N) gesamt',
       'Arsen gesamt', 'Cadmium gesamt', 'Chrom gesamt', 'Kalium',

In [57]:
parameters_translated = [translator.translate(item, dest='en').text for item in parameters.tolist()]

In [58]:
"""
Cumulated rainfall
-Environmental temperature
-Water temperature
-Conductivity
-Flow river
Turbidity
-Absorbance 254 nm
-Ammonium
-Dissolved oxygen
-Nitrate
-pH
Redox potential
"""

'\nCumulated rainfall\n-Environmental temperature\n-Water temperature\n-Conductivity\n-Flow river\nTurbidity\n-Absorbance 254 nm\n-Ammonium\n-Dissolved oxygen\n-Nitrate\n-pH\nRedox potential\n'

In [59]:
parameters_translated

['Filterable substances',
 'Calcium',
 'Chloride',
 'Total iron',
 'Potassium permanganate consumption as O2',
 'Acid capacity pH 4.3',
 'conductivity',
 'Magnesium',
 'Ammonium nitrogen',
 'Nitrite nitrogen',
 'Nitrate nitrogen',
 'ortho-Phosphat-Phosphor',
 'Oxygen content',
 'PH value',
 'Total phosphorus (P) total',
 'Sulfate',
 'Water temperature',
 'COD (Chemical Oxygen Demand)',
 'total manganese',
 'Chlorophyll-a',
 'Coliform B.',
 'E.Coli',
 'Phaeophytin',
 'Phenole',
 'Loss of ignition of the substances that can be filtered off',
 'Spectral absorption coefficient (SAK) 254nm',
 'depth of view',
 'air temperature',
 'DOC (Dissolved Organic Carbon)',
 'BSB1 (Biochemical oxygen demand, 24h)',
 'AOX (Adsorbable organic halogen compounds)',
 'TOC (Organic Carbon)',
 'Silicon dissolved',
 'Total nitrogen (N) dissolved',
 'Total nitrogen (N).',
 'Total arsenic',
 'Total cadmium',
 'Total chrome',
 'Potassium',
 'Sodium',
 'Stay together',
 'Boron total',
 'Total copper',
 'Hydrogenc

### Build Dataset per Station

In [60]:
# the parameters that are present for the moment are:
variables = {
    'Lufttemperatur': 'Air Temperature',
    'Wassertemperatur': 'Water Temperature',
    'Spektraler Absorptionskoeffizient (SAK) 254nm': 'Absorbance 254nm',
    'Leitfähigkeit': 'Conductivity',
    'Ammonium-Stickstoff': 'Ammonium',
    'Nitrat-Stickstoff': 'Nitrate',
    'pH-Wert': 'pH',
    'DOC (Gelöster organischer Kohlenstoff)': 'DOC',
    'TOC (Organischer Kohlenstoff)': 'TOC',
    'Sauerstoff-Gehalt': 'Dissolved Oxygen',
}

In [61]:
surface_df = ts_sw_df[ts_sw_df['Parameter'].isin(variables.keys())]

surface_df['Parameter'] = surface_df['Parameter'].map(variables)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surface_df['Parameter'] = surface_df['Parameter'].map(variables)


In [65]:
surface_df['Station ID'].unique()

array([105, 305, 325])

In [73]:
surface_df['Station'].unique()

array(['Dämeritzsee - Seemitte', 'Oberhavel - Konradshöhe',
       'Havel - Pichelsdorfer Gemünd'], dtype=object)

In [78]:
surface_df['Date'] = pd.to_datetime(surface_df['Date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [79]:
stations_dict = {}
for station in surface_df['Station ID'].unique():
    station_df = surface_df[surface_df['Station ID'] == station]
    station_df = station_df.pivot_table(
        index=pd.Grouper('Date'),
        columns='Parameter',
        values='Value'
    )
    
    stations_dict[station] = station_df

### Analyze Stations

#### 105 - Dämeritzsee-Seemitte

In [80]:
station_df = stations_dict[105]

In [81]:
station_df.isna().sum() / station_df.shape[0]

Parameter
Absorbance 254nm     0.993763
Air Temperature      0.239085
Ammonium             0.083160
Conductivity         0.008316
DOC                  0.284823
Dissolved Oxygen     0.010395
Nitrate              0.066528
TOC                  0.326403
Water Temperature    0.012474
pH                   0.010395
dtype: float64

In [88]:
# compute the frequency of the time series
station_df.index.to_series().diff().value_counts()

Date
14 days 00:00:00    106
28 days 00:00:00    103
15 days 00:00:00     18
28 days 01:00:00     17
13 days 00:00:00     15
                   ... 
40 days 22:40:00      1
27 days 00:30:00      1
71 days 10:50:00      1
42 days 01:00:00      1
28 days 00:48:00      1
Name: count, Length: 144, dtype: int64

In [89]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [84]:
# plot the data
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 105",
        labels={
            'Date': 'Date',
            column: column
        }
    )
    fig.show()

##### Boxplots

In [106]:
# boxplot of the data
for column in station_df.columns:
    
    fig = go.Figure()
    column_df = station_df[column]
    
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(
                y=column_df[column_df.index.year == year],
                name=year
            )
        )
    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column
    )
    
    fig.show() 

#### 305 - Oberhavel-Konradshöhe

In [108]:
station_df = stations_dict[305]

In [109]:
station_df.isna().sum() / station_df.shape[0]

Parameter
Air Temperature      0.002203
Ammonium             0.004405
Conductivity         0.241189
DOC                  0.299559
Dissolved Oxygen     0.007709
Nitrate              0.005507
TOC                  0.500000
Water Temperature    0.001101
pH                   0.007709
dtype: float64

In [110]:
# compute the frequency of the time series
station_df.index.to_series().diff().value_counts()

Date
28 days 00:00:00    224
14 days 00:00:00    150
21 days 00:00:00     85
35 days 00:00:00     75
28 days 01:00:00     24
                   ... 
42 days 23:00:00      1
38 days 00:00:00      1
41 days 23:00:00      1
39 days 23:00:00      1
28 days 00:36:00      1
Name: count, Length: 164, dtype: int64

In [111]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [112]:
# plot the data
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 105",
        labels={
            'Date': 'Date',
            column: column
        }
    )
    fig.show()

##### Boxplots

In [113]:
# boxplot of the data
for column in station_df.columns:
    
    fig = go.Figure()
    column_df = station_df[column]
    
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(
                y=column_df[column_df.index.year == year],
                name=year
            )
        )
    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column
    )
    
    fig.show() 

#### 325 - Havel-Pichelsdorfer Gemünd

In [114]:
station_df = stations_dict[325]

In [115]:
station_df.isna().sum() / station_df.shape[0]

Parameter
Air Temperature      0.001208
Ammonium             0.007246
Conductivity         0.259662
DOC                  0.326087
Dissolved Oxygen     0.002415
Nitrate              0.007246
TOC                  0.544686
Water Temperature    0.000000
pH                   0.001208
dtype: float64

In [116]:
# compute the frequency of the time series
station_df.index.to_series().diff().value_counts()

Date
28 days 00:00:00    286
21 days 00:00:00     87
35 days 00:00:00     71
14 days 00:00:00     29
28 days 01:00:00     26
                   ... 
41 days 22:55:00      1
27 days 21:25:00      1
28 days 12:10:00      1
33 days 00:00:00      1
28 days 00:53:00      1
Name: count, Length: 152, dtype: int64

In [117]:
# most of the time series have a frequency of 14 days or 1 month

##### Time series

In [118]:
# plot the data
for column in station_df.columns:
    fig = px.line(
        station_df,
        x=station_df.index,
        y=column,
        title=f"{column} at station 105",
        labels={
            'Date': 'Date',
            column: column
        }
    )
    fig.show()

##### Boxplots

In [119]:
# boxplot of the data
for column in station_df.columns:
    
    fig = go.Figure()
    column_df = station_df[column]
    
    for year in column_df.index.year.unique():
        fig.add_trace(
            go.Box(
                y=column_df[column_df.index.year == year],
                name=year
            )
        )
    fig.update_layout(
        title=f"{column} at station 105",
        xaxis_title="Year",
        yaxis_title=column
    )
    
    fig.show()

# UVA254 Raw Analysis

In [211]:
ts_uva254_df = ts_gw_df[
    ts_gw_df["Parameter"] == "UV-Adsorption (254)"
].copy()

In [212]:
ts_uva254_df

Unnamed: 0,Station ID,Date,Parameter,Unit,Value
28,5130,2005-06-09,UV-Adsorption (254),1/m,10.2
112,5130,2005-11-22,UV-Adsorption (254),1/m,10.1
204,5130,2006-06-02,UV-Adsorption (254),1/m,10.4
294,5130,2006-11-07,UV-Adsorption (254),1/m,11.9
387,5130,2007-03-30,UV-Adsorption (254),1/m,10.3
...,...,...,...,...,...
6937,7285,2019-10-15,UV-Adsorption (254),1/m,10.3
7036,7285,2020-05-14,UV-Adsorption (254),1/m,11.0
7135,7285,2020-10-13,UV-Adsorption (254),1/m,10.0
7234,7285,2021-05-06,UV-Adsorption (254),1/m,10.5


In [213]:
ts_uva254_df["Date"] = pd.to_datetime(
    ts_uva254_df["Date"], format="%Y-%m-%d", errors="coerce"
)

In [214]:
ts_uva254_df['Year'] = ts_uva254_df['Date'].dt.year
ts_uva254_df['Month'] = ts_uva254_df['Date'].dt.month

In [215]:
counts = ts_uva254_df['Station ID'].value_counts()

In [216]:
fig = px.line(
    ts_uva254_df,
    x="Date",
    y="Value",
    color="Station ID",
)

fig.update_layout(
    title={
        "text": "UV-Adsorption (254)",
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis_title="Date",
    yaxis_title="Value",
)

fig.show()

### Station 7285

In [217]:
station_7285_df = ts_uva254_df[ts_uva254_df['Station ID'] == 7285].copy()

In [218]:
station_7285_df["Season"] = station_7285_df["Month"].apply(
    lambda x: "Winter"
    if x in [12, 1, 2]
    else "Spring"
    if x in [3, 4, 5]
    else "Summer"
    if x in [6, 7, 8]
    else "Autumn"
)

In [223]:
# plot station 7285 with seasons as hue
fig = px.line(
    station_7285_df,
    x="Date",
    y="Value",
    color="Season",
)

fig.update_layout(
    title={
        "text": "UV-Adsorption (254) at station 7285",
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis_title="Date",
    yaxis_title="Value",
)

fig.show()

In [226]:
mean_station_7285_df = station_7285_df.groupby(["Year"]).agg({"Value": ["mean", "count"]}).reset_index().copy()

In [227]:
mean_station_7285_df

Unnamed: 0_level_0,Year,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,1995,8.6,1
1,1996,8.6,2
2,1997,7.8,2
3,1998,7.2,2
4,1999,6.8,2
5,2000,5.3,2
6,2001,5.5,2
7,2002,5.3,2
8,2003,6.05,2
9,2004,6.95,2


In [228]:
mean_station_7285_df = station_7285_df.groupby(["Season"]).agg({"Value": ["mean", "count"]}).reset_index().copy()

In [229]:
mean_station_7285_df

Unnamed: 0_level_0,Season,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,Autumn,8.253846,26
1,Spring,8.642857,21
2,Summer,8.04,5
3,Winter,6.5,1


In [230]:
# maggiorparte delle misurazioni in autunno e primavera, semestrali circa

In [232]:
ts = station_7285_df[['Date', 'Value']].copy()

result_7285 = smt.seasonal_decompose(
    ts.set_index('Date'), model="additive", period=2
)

In [238]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=result_7285.trend.index,
        y=result_7285.trend,
        mode="lines+markers",
        name="MA period=2",
        line=dict(color="blue"),
    )
)

fig.add_trace(
    go.Scatter(
        x=ts['Date'],
        y=ts['Value'],
        mode="lines+markers",
        name="Original",
        line=dict(color="red"),
    )
)

fig.show()

### Station 5130