## Libraries 

In [1]:
import pandas as pd
import numpy as np

import glob
import xarray as xr

import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing 

from sklearn.linear_model import LinearRegression

from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.utils.plotting import plot_series

import seaborn as sns

from standard_precip.spi import SPI
from standard_precip.utils import plot_index

In [2]:
#load df
feature_df = pd.read_csv('../data/feature_df.csv', header=0)
feature_df.index = range(1971, 2023)
feature_df.head(10)

Unnamed: 0,Canola_detrend,spi_4,spi_5,spi_6,spi_7,spi_8,spi_9,spi_10,avg_max_temp_4,avg_max_temp_5,avg_max_temp_6,avg_max_temp_7,avg_max_temp_8,avg_max_temp_9,avg_max_temp_10
1971,0.835685,1.55242,0.398286,1.608689,-1.138548,-0.619148,0.699883,1.150625,284.674011,291.586487,295.451691,297.247833,301.780426,292.659363,285.051239
1972,1.019651,0.310719,0.27251,-0.248256,-0.270669,-0.278103,-0.179716,-0.901951,282.146759,292.675629,296.55426,296.034912,299.861603,291.575226,282.966553
1973,3.180447,0.277076,-0.33409,0.603571,0.403538,0.782308,2.709394,0.711722,283.948242,291.120361,296.939117,298.817383,301.038696,291.306488,287.401154
1974,-0.681926,0.740812,1.373921,-1.551774,-0.435588,0.23275,-1.372053,-1.695853,281.231293,287.215179,296.320404,301.303864,296.313019,291.511688,288.25647
1975,-1.567468,2.002202,-0.518458,1.556986,0.509661,1.773032,0.895744,1.033422,275.753235,289.641205,294.908081,300.886871,296.02359,291.019562,285.494232
1976,0.52382,0.532937,-1.697363,0.961862,-1.446509,0.499695,-1.474134,-2.029217,283.979858,292.507629,296.083862,299.666107,300.495178,297.305878,283.504913
1977,0.591938,-1.05983,0.273647,-0.688714,-0.430323,-0.897585,1.339607,-1.14224,290.806793,297.221161,296.493164,300.909698,296.2453,290.97348,287.136871
1978,0.636888,-0.105057,0.681106,-1.141082,-0.218658,-0.842599,1.921957,-0.75634,283.612152,293.872833,297.419464,299.356598,300.507751,295.736267,287.023193
1979,-2.341333,0.840549,0.64693,-0.982525,0.189466,-1.968555,0.959279,-0.739282,276.0466,286.521606,296.374207,300.664185,297.919647,294.441315,284.963867
1980,-10.342722,-1.357788,-2.438554,-0.763792,-0.754718,2.122007,0.205853,0.806743,289.22403,296.703491,297.945221,301.438324,295.128296,292.04895,283.809509


## Weather data 

In [73]:
# load raw weather data with t2m, tp 
weather_raw = pd.read_csv('C:/Users/maris/python_notebooks/XAI_TS_Forecasting/notebooks/dist1_t2m_tp.csv', header=0)
weather_raw['time'] = pd.to_datetime(weather_raw['time'])
weather_raw = weather_raw.set_index('time')

# check for nan
contains_nan = weather_raw.isna().any().any()

weather_raw.head()

#weather_raw["tp"].values[1:100]

Unnamed: 0_level_0,t2m,tp,Canola
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1971-04-01 00:00:00,267.04202,0.010006,18.0
1971-04-01 01:00:00,266.61612,9e-06,18.0
1971-04-01 02:00:00,266.40143,1.7e-05,18.0
1971-04-01 03:00:00,266.30045,2.4e-05,18.0
1971-04-01 04:00:00,266.17166,3.1e-05,18.0


In [85]:
# function to calculate the longest consecutive true streak

def longest_consecutive_true_streak(series):
    
    # Convert the series to integers (True to 1, False to 0) for easier streak calculation
    as_ints = series.astype(int)
    # Calculate the difference to identify changes in streaks
    diff = as_ints.diff()
    # Start a new group every time there's a change from 0 to 1 (start of a new streak)
    groups = (diff == 1).cumsum()
    # Use the groups to isolate consecutive trues, then count them, keeping the max
    streak_lengths = as_ints.groupby(groups).sum()
    # Return the length of the longest streak
    return streak_lengths.max()

## Heat Wave Index 

In [68]:
# get max temp for one day 
dist1_df_hot = weather_raw 

# add extra columns containing the years and the months
dist1_df_hot['year'] = dist1_df_hot.index.year
dist1_df_hot['month'] = dist1_df_hot.index.month
dist1_df_hot['day'] = dist1_df_hot.index.day

# get variables year, month and canola from old data frame 
daily_df = weather_raw.resample('D').first()
daily_df = daily_df[["Canola","year","month"]]

# Resample the data to daily frequency and get the maximum temperature for each day
daily_max_temperature = dist1_df_hot['t2m'].resample('D').max()

#drop all nan ( values for month november-march)
daily_max_temperature = daily_max_temperature.dropna()

# divide in test and training data 
training_data = daily_max_temperature.loc[:'1989']
testing_data = daily_max_temperature.loc['1990':]

# calculate for every day the 90% quantile
quantile_90_series = training_data.groupby([training_data.index.month, training_data.index.day]).quantile(0.9)
quantile_90_series.index = quantile_90_series.index.map(lambda x: f"{x[0]:02d}-{x[1]:02d}")


test_df = testing_data.to_frame(name='value')
test_df['month_day'] = test_df.index.strftime('%m-%d')

# map the 90th percentile values from quantile_90_series to the test series
test_df['quantile_90'] = test_df['month_day'].apply(lambda x: quantile_90_series.get(x, pd.NA))

# compare each test value to its corresponding 90th percentile value
test_df['is_above_quantile_90'] = test_df['value'] > test_df['quantile_90']
test_df.drop(['month_day', 'quantile_90'], axis=1, inplace=True)

# Group the DataFrame by year, and apply the function to find the longest streak of True values
longest_streak_by_year = test_df.groupby(test_df.index.year)['is_above_quantile_90'].apply(longest_consecutive_true_streak)

#print(longest_streak_by_year)

## Cold Wave Index 

In [69]:
# Code from above with adjustments to the coldness

# get max temp for one day 
dist1_df_hot = weather_raw 

# add extra columns containing the years and the months
dist1_df_hot['year'] = dist1_df_hot.index.year
dist1_df_hot['month'] = dist1_df_hot.index.month
dist1_df_hot['day'] = dist1_df_hot.index.day

# get variables year, month and canola from old data frame 
daily_df = weather_raw.resample('D').first()
daily_df = daily_df[["Canola","year","month"]]

# Resample the data to daily frequency and get the maximum temperature for each day
daily_min_temperature = dist1_df_hot['t2m'].resample('D').min()

#drop all nan ( values for month november-march)
daily_min_temperature = daily_min_temperature.dropna()

# divide in test and training data 
training_data = daily_min_temperature.loc[:'1989']
testing_data = daily_min_temperature.loc['1990':]

# calculate for every day the 90% quantile
quantile_10_series = training_data.groupby([training_data.index.month, training_data.index.day]).quantile(0.1)
quantile_10_series.index = quantile_10_series.index.map(lambda x: f"{x[0]:02d}-{x[1]:02d}")


test_df_cold = testing_data.to_frame(name='value')
test_df_cold['month_day'] = test_df_cold.index.strftime('%m-%d')

# map the 90th percentile values from quantile_90_series to the test series
test_df_cold['quantile_10'] = test_df_cold['month_day'].apply(lambda x: quantile_10_series.get(x, pd.NA))

# compare each test value to its corresponding 90th percentile value
test_df_cold['is_under_quantile_10'] = test_df_cold['value'] < test_df_cold['quantile_10']
test_df_cold.drop(['month_day', 'quantile_10'], axis=1, inplace=True)


# Group the DataFrame by year, and apply the function to find the longest streak of True values
longest_streak_by_year_cold = test_df_cold.groupby(test_df.index.year)['is_under_quantile_10'].apply(longest_consecutive_true_streak)

#print(longest_streak_by_year_cold)

## Longest Dry Spell 

In [84]:
dist1_df_perci = weather_raw.resample('D').sum()

# use data frame without zero to calculate the 5% quantile (only!)
dist1_df_perci_wo0 = dist1_df_perci[dist1_df_perci['tp'] != 0]

# Threshhold is 1mm or 0.001m, here precipitation is likely measured in m 

# find days with less than 0.001m rain 
dist1_df_perci_wo0['less_than_0.001'] = dist1_df_perci_wo0['tp'] < 0.001

# apply function 
longest_dry_spell_per_year = dist1_df_perci_wo0.groupby(dist1_df_perci_wo0.index.year)['less_than_0.001'].apply(longest_consecutive_true_streak)

longest_dry_spell_per_year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dist1_df_perci_wo0['less_than_0.001'] = dist1_df_perci_wo0['tp'] < 0.001


Unnamed: 0_level_0,t2m,tp,Canola,less_than_0.001
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1971-04-01,6358.52070,0.011872,432.0,False
1971-04-02,6334.89989,0.000127,432.0,True
1971-04-03,6290.85753,0.000018,432.0,True
1971-04-04,6407.70077,0.003287,432.0,False
1971-04-05,6485.13387,0.000270,432.0,True
...,...,...,...,...
2022-10-27,6650.86452,0.000335,897.6,True
2022-10-28,6742.60452,0.000007,897.6,True
2022-10-29,6765.05091,0.000014,897.6,True
2022-10-30,6709.10049,0.000011,897.6,True


## Longest Wet Spell

In [86]:
dist1_df_perci = weather_raw.resample('D').sum()

# use data frame without zero to calculate the 5% quantile (only!)
dist1_df_perci_wo0 = dist1_df_perci[dist1_df_perci['tp'] != 0]

# Threshhold is 1mm or 0.001m, here precipitation is likely measured in m 

# find days with less than 0.001m rain 
dist1_df_perci_wo0['more_than_0.001'] = dist1_df_perci_wo0['tp'] > 0.001

# apply function 
longest_wet_spell_per_year = dist1_df_perci_wo0.groupby(dist1_df_perci_wo0.index.year)['more_than_0.001'].apply(longest_consecutive_true_streak)

longest_wet_spell_per_year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dist1_df_perci_wo0['more_than_0.001'] = dist1_df_perci_wo0['tp'] > 0.001


time
1971    14
1972    15
1973     9
1974    13
1975    17
1976    12
1977    27
1978    16
1979    23
1980    16
1981    19
1982    14
1983    10
1984    19
1985    12
1986    17
1987    21
1988    10
1989     9
1990    11
1991    11
1992    11
1993    14
1994    15
1995    25
1996    15
1997    22
1998    26
1999    12
2000    22
2001    12
2002    16
2003    16
2004    15
2005    30
2006    12
2007    12
2008    14
2009    14
2010    15
2011    11
2012    12
2013    15
2014    13
2015    18
2016    23
2017    14
2018    13
2019    18
2020    11
2021    11
2022    10
Name: more_than_0.001, dtype: int64