In [67]:
! pip install sktime
! pip install standard-precip
import pandas as pd
import numpy as np

from datetime import datetime
import glob
import xarray as xr
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing 

from sklearn.linear_model import LinearRegression

from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.utils.plotting import plot_series

from scipy.stats import gamma

from standard_precip.spi import SPI
from standard_precip.utils import plot_index

import seaborn as sns



In [2]:
#read dataframe 
canola_2 = df = pd.read_csv('/kaggle/input/rm-yields-data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)
canola_small = canola_2.iloc[:, [0, 2]].copy()

In [3]:
start_year = 1938
start_analysis = 1990
exclude_years = start_analysis - start_year
#cut 70s and 80s as well 
#cut of first 52 observations (NAs)
canola_small.drop(canola_small.index[:exclude_years], inplace=True)

#filter out every observation that contains NAs
canola_filtered = canola_small.groupby('RM').filter(lambda group: not group['Canola'].isnull().any())

# how may districts? 148
num_districts = canola_filtered.groupby('RM').ngroups
print(num_districts)
#excluding 70s and 80s lead to 36 more colmplete districts 

184


In [4]:
# Group by 'RM' and check if 'Canola' has any missing values in each group
districts_with_full_data = canola_filtered.groupby('RM')['Canola'].apply(lambda group: not group.isnull().any())

# Extract the list of districts with full data
districts_with_full_data_list = districts_with_full_data[districts_with_full_data].index.tolist()

In [5]:
# select weather data
#open only the years from 1990 til 2022

# Define the directory path and pattern for the NetCDF files
directory_path = '/kaggle/input/copernicus-data/'
file_pattern = '*.nc'

# Get a list of files matching the pattern
files_to_open = glob.glob(os.path.join(directory_path, file_pattern))

# Open only the files for the years 1990 to 2022
years_to_open = list(map(str, range(start_analysis, 2023)))
files_to_open = [file for file in files_to_open if any(year in file for year in years_to_open)]

# Use open_mfdataset to open the selected files
cop_all_90 = xr.open_mfdataset(files_to_open, combine='by_coords')

In [6]:
print(cop_all_90)

<xarray.Dataset> Size: 5GB
Dimensions:    (longitude: 88, latitude: 41, time: 169488)
Coordinates:
  * longitude  (longitude) float32 352B -110.0 -109.9 -109.8 ... -101.4 -101.3
  * latitude   (latitude) float32 164B 53.0 52.9 52.8 52.7 ... 49.2 49.1 49.0
  * time       (time) datetime64[ns] 1MB 1990-04-01 ... 2022-10-31T23:00:00
Data variables:
    t2m        (time, latitude, longitude) float32 2GB dask.array<chunksize=(5136, 41, 88), meta=np.ndarray>
    tp         (time, latitude, longitude) float32 2GB dask.array<chunksize=(5136, 41, 88), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2024-01-23 18:32:47 GMT by grib_to_netcdf-2.24.0: /opt/ecmw...


In [7]:
# center points for regions
df_regions = pd.read_csv('/kaggle/input/cgn-sk-csv-eng/cgn_sk_csv_eng.csv')
df_rms = df_regions[['Geographical Name','Latitude', 'Longitude']][df_regions['Generic Term'] == 'Rural Municipality']
df_rms['region_index'] = df_rms['Geographical Name'].str.split(' ').str[-1].astype(int)

In [8]:
print(df_rms)

           Geographical Name   Latitude   Longitude  region_index
11          Aberdeen No. 373  52.429385 -106.293061           373
14         Abernethy No. 186  50.639074 -103.485159           186
238    Antelope Park No. 322  51.750704 -109.847626           322
241            Antler No. 61  49.577332 -101.799342            61
270       Arborfield No. 456  53.069285 -103.518643           456
...                      ...        ...         ...           ...
12864       Wolseley No. 155  50.517552 -103.173299           155
12872      Wolverine No. 340  52.012996 -105.210165           340
12881     Wood Creek No. 281  51.499659 -105.585912           281
12898      Wood River No. 74  49.650017 -106.597794            74
12937        Wreford No. 280  51.499430 -105.139404           280

[297 rows x 4 columns]


In [9]:
def get_center(region):
    avg_lat = df_rms['Latitude'][df_rms['region_index'] == region].item()
    avg_long = df_rms['Longitude'][df_rms['region_index'] == region].item()
    return avg_lat, avg_long

def detrend_ts(df_region):
    # linear detrending
    forecaster = PolynomialTrendForecaster(degree=2)
    transformer = Detrender(forecaster=forecaster)
    yt = transformer.fit_transform(df_region['Canola'])
    return yt

In [10]:
def merge_canola_weather_data(region = 310):
    # select data from region with center point
    center_lat, center_long = get_center(region)
    cropped_data_tmp = cop_all_90.sel(longitude=center_long, latitude=center_lat,method='nearest')

    # get residuals for canola yield
    df_tmp = canola_filtered[canola_filtered['RM'] == region]
    residuals = detrend_ts(df_tmp)

    # merge weather data and canola residuals
    df_weather_region = cropped_data_tmp.to_dataframe()

    df_weather_region['region'] = region

    column_to_append = residuals.tolist()
    years = df_weather_region.index.year
    df_weather_region['Canola_detrended'] = [column_to_append[year - start_analysis] for year in years]
    df_weather_region.drop(['longitude','latitude'],axis=1,inplace=True)

    return df_weather_region, pd.DataFrame(residuals)

In [63]:
def calculate_spi(prcp_data, scale=1):
    # Step 1: Calculate L-moments
    n = len(prcp_data)
    prcp_data_sorted = np.sort(prcp_data)
    
    # L-moment ratio
    l_moment_1 = np.sum(prcp_data_sorted) / n
    l_moment_2 = np.sum((2 * np.arange(1, n + 1) - 1 - n) * prcp_data_sorted) / (n ** 2)
    
    # Step 2: Estimate parameters of gamma distribution
    k = l_moment_1 / l_moment_2
    theta = l_moment_2 / k
    
    # Step 3: Calculate SPI values
    spi_values = gamma.ppf((np.arange(1, n + 1) - 0.35) / (n + 0.3), a=k, scale=theta * scale)
    
    return spi_values

In [11]:
def calc_temp_features(df_weather_region, df_year):
    for month in range(4,11):    
        daily_max_temperatures = df_weather_region.resample('D').max()
        monthly_avg_max_temperatures = daily_max_temperatures.resample('MS').mean()
        
    #     dist1_df_month = dist1_df.resample('MS').mean()
        month_data = monthly_avg_max_temperatures[monthly_avg_max_temperatures.index.month == month]
        column_to_append = month_data['t2m'].tolist()
        df_year.loc[:, f'average_max_temp_in_{month}'] = column_to_append
    return df_year

In [80]:
def calc_spi_features(df_weather_region, df_year):
    for month in range(4, 11):
        # tried resampling in various ways but none worked
        tp_in_month = df_weather_region[df_weather_region.index.month == month]

        spi = SPI()

        # Assuming spi.calculate is the SPI calculation function
        spi_values = spi.calculate(
            tp_in_month.reset_index(),
            'time',
            'tp',
            freq="M",
            scale=1,
            fit_type="lmom",
            dist_type="gam"
        )
        
        # may have to aggregate here somehow

        # Add each SPI column separately
        for col_name in spi_values.columns:
            df_year[f'SPI_in_{month}_{col_name}'] = spi_values[col_name]
            
    return df_year

In [13]:
available_regions = [region for region in districts_with_full_data_list if region in df_rms['region_index'].to_list()]

In [81]:
dfs_of_years = []
for region in available_regions:
    print(region)
    df_weather_region, df_year = merge_canola_weather_data(region)
    df_year.index = df_year.index.year
    df_year['region'] = region
    df_year = calc_temp_features(df_weather_region, df_year)
    df_year = calc_spi_features(df_weather_region, df_year)
    dfs_of_years.append(df_year)

1
2
3
31
32
33
34
61
63
64
65
66
91
92
93
95
96
121
122
123
124
125
126
127
131
151
152
153
154
155
156
157
158
181
183
184
185
186
189
190
194
211
213
214
216
217
218
219
220
221
222
223
224
225
241
243
244
245
246
247
248
250
251
252
253
254
255
256
271
273
276
277
278


IndexError: list index out of range

In [82]:
df_full = pd.concat(dfs_of_years)

In [83]:
df_full

Unnamed: 0_level_0,Canola,region,average_max_temp_in_4,average_max_temp_in_5,average_max_temp_in_6,average_max_temp_in_7,average_max_temp_in_8,average_max_temp_in_9,average_max_temp_in_10,SPI_in_4_time,...,SPI_in_7_tp_calculated_index,SPI_in_8_time,SPI_in_8_tp,SPI_in_8_tp_calculated_index,SPI_in_9_time,SPI_in_9_tp,SPI_in_9_tp_calculated_index,SPI_in_10_time,SPI_in_10_tp,SPI_in_10_tp_calculated_index
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990,0.127132,1,284.965759,290.526489,297.082458,299.412781,300.752075,296.791382,286.036377,1992-04-23 22:00:00,...,0.532586,1992-08-21 22:00:00,0.000073,0.492898,1992-09-23 22:00:00,-1.862645e-09,-0.236714,1992-10-21 22:00:00,-1.862645e-09,-0.380289
1991,2.520378,1,287.648346,291.984650,296.854401,297.652344,299.938751,292.543549,282.321960,1992-04-23 23:00:00,...,0.532586,1992-08-21 23:00:00,0.000073,0.492898,1992-09-23 23:00:00,-1.862645e-09,-0.236714,1992-10-21 23:00:00,-1.862645e-09,-0.380289
1992,-6.339489,1,283.242584,293.701874,295.445709,294.548126,297.033264,292.229004,285.503967,1992-04-24 00:00:00,...,0.532586,1992-08-22 00:00:00,0.000332,0.734092,1992-09-24 00:00:00,-1.862645e-09,-0.236714,1992-10-22 00:00:00,-1.862645e-09,-0.380289
1993,4.147971,1,284.697113,292.762695,293.425385,294.037415,296.177429,291.031281,284.688232,1992-04-24 01:00:00,...,-0.464812,1992-08-22 01:00:00,0.000159,0.606211,1992-09-24 01:00:00,-1.862645e-09,-0.236714,1992-10-22 01:00:00,-1.862645e-09,-0.380289
1994,2.081733,1,285.144440,293.432678,294.774536,297.061951,297.072021,294.903839,286.052094,1992-04-24 02:00:00,...,-0.464812,1992-08-22 02:00:00,0.000179,0.625827,1992-09-24 02:00:00,-1.862645e-09,-0.236714,1992-10-22 02:00:00,-1.862645e-09,-0.380289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0.027410,277,277.149414,294.338806,295.337128,297.314148,298.153961,286.258423,279.045258,1992-04-25 02:00:00,...,-0.700221,1992-08-23 02:00:00,0.000367,0.638719,1992-09-25 02:00:00,1.048520e-04,0.587394,1992-10-23 02:00:00,-1.862645e-09,-0.538109
2019,7.202161,277,283.645691,288.994080,294.978119,296.750946,294.959564,289.805023,278.502686,1992-04-25 03:00:00,...,-0.700221,1992-08-23 03:00:00,0.000623,0.754503,1992-09-25 03:00:00,1.057480e-04,0.588694,1992-10-23 03:00:00,-1.862645e-09,-0.538109
2020,0.338078,277,280.380188,290.371033,294.232025,297.146240,297.938080,290.888153,279.097870,1992-04-25 04:00:00,...,-0.700221,1992-08-23 04:00:00,0.000751,0.799938,1992-09-25 04:00:00,1.057480e-04,0.588694,1992-10-23 04:00:00,-1.862645e-09,-0.538109
2021,-14.767915,277,282.236755,289.601898,297.171356,300.447906,296.123688,294.560272,285.297791,1992-04-25 05:00:00,...,-0.700221,1992-08-23 05:00:00,0.000777,0.808475,1992-09-25 05:00:00,1.057480e-04,0.588694,1992-10-23 05:00:00,-1.862645e-09,-0.538109
