In [101]:
import pandas as pd
import numpy as np

from datetime import datetime

import glob
import xarray as xr
import os

import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing 

from sklearn.linear_model import LinearRegression

from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.utils.plotting import plot_series

import seaborn as sns

In [102]:
#read dataframe 
canola_2 = df = pd.read_csv('../data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)
canola_small = canola_2.iloc[:, [0, 2]].copy()

In [103]:
#read dataframe 
canola_2 = df = pd.read_csv('../data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)
canola_small = canola_2.iloc[:, [0, 2]].copy()

start_year = 1938
start_analysis = 1990
exclude_years = start_analysis - start_year
#cut 70s and 80s as well 
#cut of first 52 observations (NAs)
canola_small.drop(canola_small.index[:exclude_years], inplace=True)

#filter out every observation that contains NAs
canola_filtered = canola_small.groupby('RM').filter(lambda group: not group['Canola'].isnull().any())

# how may districts? 148
num_districts = canola_filtered.groupby('RM').ngroups
print(num_districts)
#excluding 70s and 80s lead to 36 more colmplete districts 

184


In [104]:
# Group by 'RM' and check if 'Canola' has any missing values in each group
districts_with_full_data = canola_filtered.groupby('RM')['Canola'].apply(lambda group: not group.isnull().any())

# Extract the list of districts with full data
districts_with_full_data_list = districts_with_full_data[districts_with_full_data].index.tolist()

In [105]:
# select weather data
#open only the years from 1990 til 2022

# Define the directory path and pattern for the NetCDF files
directory_path = '../data/all_raw_data/'
file_pattern = 'data_*.nc'

# Get a list of files matching the pattern
files_to_open = glob.glob(os.path.join(directory_path, file_pattern))

# Open only the files for the years 1990 to 2022
years_to_open = list(map(str, range(start_analysis, 2023)))
files_to_open = [file for file in files_to_open if any(year in file for year in years_to_open)]

# Use open_mfdataset to open the selected files
cop_all_90 = xr.open_mfdataset(files_to_open, combine='by_coords')

In [106]:
# center points for regions
df_regions = pd.read_csv(r'../data/cgn_sk_csv_eng.csv')
df_rms = df_regions[['Geographical Name','Latitude', 'Longitude']][df_regions['Generic Term'] == 'Rural Municipality']
df_rms['region_index'] = df_rms['Geographical Name'].str.split(' ').str[-1].astype(int)

297

In [107]:
def get_center(region):
    avg_lat = df_rms['Latitude'][df_rms['region_index'] == region].item()
    avg_long = df_rms['Longitude'][df_rms['region_index'] == region].item()
    return avg_lat, avg_long

def detrend_ts(df_region):
    # linear detrending
    forecaster = PolynomialTrendForecaster(degree=2)
    transformer = Detrender(forecaster=forecaster)
    yt = transformer.fit_transform(df_region['Canola'])
    return yt

In [108]:
def merge_canola_weather_data(region = 310):
    # select data from region with center point
    center_lat, center_long = get_center(region)
    cropped_data_tmp = cop_all_90.sel(longitude=center_long, latitude=center_lat,method='nearest')

    # get residuals for canola yield
    df_tmp = canola_filtered[canola_filtered['RM'] == region]
    residuals = detrend_ts(df_tmp)

    # merge weather data and canola residuals
    df_weather_region = cropped_data_tmp.to_dataframe()

    df_weather_region['region'] = region

    column_to_append = residuals.tolist()
    years = df_weather_region.index.year
    df_weather_region['Canola_detrended'] = [column_to_append[year - start_analysis] for year in years]
    df_weather_region.drop(['longitude','latitude'],axis=1,inplace=True)

    return df_weather_region, pd.DataFrame(residuals)

In [109]:
def calc_temp_features(df_weather_region, df_year):
    for month in range(4,11):    
        daily_max_temperatures = df_weather_region.resample('D').max()
        monthly_avg_max_temperatures = daily_max_temperatures.resample('MS').mean()
        
    #     dist1_df_month = dist1_df.resample('MS').mean()
        month_data = monthly_avg_max_temperatures[monthly_avg_max_temperatures.index.month == month]
        column_to_append = month_data['t2m'].tolist()
        df_year.loc[:, f'average_max_temp_in_{month}'] = column_to_append
    return df_year

In [110]:
available_regions = [region for region in districts_with_full_data_list if region in df_rms['region_index'].to_list()]
len(available_regions)

183

In [111]:
available_regions.remove(278) 

In [112]:
dfs_of_years = []
for region in available_regions:
    #print(region)
    df_weather_region, df_year = merge_canola_weather_data(region)
    df_year.index = df_year.index.year
    df_year['region'] = region
    df_year = calc_temp_features(df_weather_region,df_year)
    dfs_of_years.append(df_year)

ValueError: Need at least 3 dates to infer frequency

In [93]:
dfs_of_years = []

region = 278
#for region in available_regions:
#print(region)
df_weather_region, df_year = merge_canola_weather_data(region)
df_year.index = df_year.index.year
df_year['region'] = region
df_year = calc_temp_features(df_weather_region,df_year)
dfs_of_years.append(df_year)

IndexError: list index out of range

In [94]:
available_regions

[1,
 2,
 3,
 31,
 32,
 33,
 34,
 61,
 63,
 64,
 65,
 66,
 91,
 92,
 93,
 95,
 96,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 131,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 181,
 183,
 184,
 185,
 186,
 189,
 190,
 194,
 211,
 213,
 214,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 241,
 243,
 244,
 245,
 246,
 247,
 248,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 271,
 273,
 276,
 277,
 278,
 280,
 281,
 282,
 283,
 284,
 288,
 304,
 305,
 307,
 308,
 309,
 310,
 312,
 313,
 314,
 315,
 316,
 317,
 320,
 331,
 333,
 334,
 335,
 336,
 337,
 338,
 339,
 340,
 341,
 342,
 343,
 344,
 345,
 346,
 347,
 349,
 350,
 351,
 352,
 366,
 367,
 368,
 369,
 370,
 371,
 372,
 373,
 376,
 377,
 378,
 379,
 380,
 381,
 382,
 394,
 395,
 397,
 398,
 399,
 400,
 401,
 402,
 403,
 404,
 405,
 406,
 409,
 410,
 411,
 426,
 427,
 428,
 429,
 430,
 431,
 435,
 436,
 437,
 438,
 439,
 440,
 442,
 456,
 457,
 458,
 459,
 460,
 461,
 463,
 464,
 466,
 467,
 468,
 471,
 472,
 486,
 487,


In [113]:
df_full = pd.concat(dfs_of_years)

In [114]:
df_full

unique_region_count = df_full['region'].nunique()
print(unique_region_count)

179


In [115]:
df_1990 = df_full.loc[1990]

df_1990.rename(columns={'region': 'region_index'}, inplace=True)

df_1990_coord = pd.merge(df_1990, df_rms, on='region_index')

df_1990_coord
#df_1990_coord.to_csv('C:/Users/maris/python_notebooks/XAI_TS_Forecasting/data/df_1990_coord.csv', sep=',', index=True, header=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1990.rename(columns={'region': 'region_index'}, inplace=True)


Unnamed: 0,Canola,region_index,average_max_temp_in_4,average_max_temp_in_5,average_max_temp_in_6,average_max_temp_in_7,average_max_temp_in_8,average_max_temp_in_9,average_max_temp_in_10,Geographical Name,Latitude,Longitude
0,0.127132,1,284.965759,290.526489,297.082458,299.412781,300.752075,296.791382,286.036377,Argyle No. 1,49.154052,-101.479393
1,-4.250299,2,285.097748,290.557587,296.971375,299.475037,300.810242,296.785919,286.065613,Mount Pleasant No. 2,49.134400,-101.814863
2,-4.254749,3,284.934357,290.402344,296.612885,299.007629,300.521759,296.678101,285.860657,Enniskillen No. 3,49.162144,-102.206941
3,-3.237970,31,284.553070,290.128845,296.719269,298.715149,300.327728,296.484863,285.571869,Storthoaks No. 31,49.421968,-101.535339
4,-0.365171,32,284.487640,290.010986,296.416443,298.459534,300.064056,296.286194,285.277344,Reciprocity No. 32,49.412910,-101.873375
...,...,...,...,...,...,...,...,...,...,...,...,...
174,8.047634,497,283.672668,289.998840,295.684509,295.920105,296.795441,293.605438,280.930847,Medstead No. 497,53.437744,-108.121150
175,-0.579032,499,284.496735,290.571686,296.169250,296.506378,297.331329,294.314758,281.821869,Mervin No. 499,53.507469,-108.820647
176,5.553036,501,284.257782,290.010315,295.045013,296.041473,296.657806,293.918488,281.664215,Frenchman Butte No. 501,53.584865,-109.647239
177,9.416794,502,284.265106,290.014374,294.966156,296.034210,296.618378,293.906158,281.671234,Britannia No. 502,53.504863,-109.697586
