# Current State

## Domain Background

- Canola life cycle: May (seeding), late June - early July (bolting, flowering), July (peak flowering), mid August - early September (swathing), September (harvest)

- Minimum growth temperature: -5 °C (~268 K)

## Hypothesis Propositions

In [None]:
# I. High temperatures (low precipitation) have a negative impact on Canola yield in Saskatchewan.
## avg. temp in growing season, cum. heat days, heatwaves (X consecutive days above threshold), heat stress index

# II. Greater-than-average precipitation has a positive impact on Canola yield in Saskatchewan.
## heavy tp frequency

# III. Cooler-than-average nocturnal temperatures have a positive impact on Canola yield in Saskatchewan.
## avg nocturnal temperature

# IV. The beginning of July, i.e. the early flowering period, is the critical time period for the effects in (I).

# V. The critical threshold for temperature-caused yield loss is 30 degrees celsius.

# VI. Increased precipitation may offset the negative impact of high temperatures on flowering canola.

# VII. Canola crop yield increases (decreases) with increasing average temperatures in June and August (in July).

# VIII. The crucial time for high precipitation benefits is the month of July, i.e. peak flowering season.

## Preparation

### Libraries

In [None]:
import pandas as pd
import numpy as np

import glob
import xarray as xr

import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing 

from sklearn.linear_model import LinearRegression

from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.utils.plotting import plot_series

import seaborn as sns

## Data Preparations 

### Canola yield

In [None]:
# Read the RM-level yield table; the first CSV column becomes the (date-parsed) index.
canola_2 = df = pd.read_csv('../data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)
# Keep only the 1st and 3rd data columns (used below under the names 'RM' and 'Canola').
canola_small = canola_2.iloc[:, [0, 2]].copy()

# Cut off the first 33 observations (NAs).
# NOTE(review): drop() removes by index *label*, so every row that shares one
# of these 33 labels is removed, not only the first 33 rows -- confirm intended.
canola_small.drop(canola_small.index[:33], inplace=True)

# Keep only the districts whose 'Canola' series contains no missing value.
canola_filtered = canola_small.groupby('RM').filter(lambda group: not group['Canola'].isnull().any())

# How many districts remain? (148 according to the original note)
num_districts = canola_filtered.groupby('RM').ngroups

# Extract only the first district. An explicit copy is taken because later
# cells add columns to this frame in place; doing that on a filtered slice
# triggers pandas' SettingWithCopyWarning.
canola_dist1 = canola_filtered[canola_filtered['RM'] == 1].copy()

In [None]:
# Reshape to wide form: one row per 'Year', one column per district ('RM').
df_reset = canola_filtered.reset_index()
pivot_df = df_reset.pivot(index='Year', columns='RM', values='Canola')

# Pairwise correlation of the yield series across all districts.
correlation_matrix = pivot_df.corr()

# Annotated heatmap for the first 20 districts only.
selected_columns = pivot_df.columns[:20]
correlation_matrix_subset = pivot_df.loc[:, selected_columns].corr()
sns.heatmap(data=correlation_matrix_subset, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()


In [None]:
# One boolean per district: True when its 'Canola' series has no missing value.
districts_with_full_data = (
    canola_filtered
    .groupby('RM')['Canola']
    .apply(lambda yields: not yields.isnull().any())
)

# District ids for which the flag is True.
districts_with_full_data_list = districts_with_full_data.loc[districts_with_full_data].index.tolist()

# Report the ids and how many there are.
print("Districts with full data:", districts_with_full_data_list)
print(len(districts_with_full_data_list))

### Weather Data 

In [None]:
# Open every NetCDF file under ../data/raw_data as a single dataset, combined by coordinates.
cop_all = xr.open_mfdataset(paths='../data/raw_data/*.nc', combine='by_coords')

In [None]:
# Inspect the dataset. The latitude/longitude dims deviate only marginally,
# which is why their mean is used further down.
# NOTE: `.items` is referenced without being called, so the cell output is the
# repr of the bound method.
cop_all.items

In [None]:
# Collapse the spatial dimensions by averaging over latitude and longitude.
cop_all_centralized = cop_all.mean(dim=['latitude', 'longitude'])

# Bound method referenced without a call; the cell output is its repr.
cop_all_centralized.items

In [None]:
# Attach to every weather timestamp the canola yield of its calendar year.
column_to_append = canola_dist1['Canola'].tolist()

# Look the yield up by year instead of by list position (`year - 1971`): the
# positional form silently wraps around for years before 1971 (negative list
# index) and hard-codes the first year of the yield series.
# NOTE(review): assumes canola_dist1 holds one row per year -- confirm.
yield_by_year = pd.Series(canola_dist1['Canola'].to_numpy(), index=canola_dist1.index.year)

dist1_df = cop_all_centralized.to_dataframe()

# Years without a yield observation become NaN instead of a wrong value.
years = dist1_df.index.year
dist1_df['Canola'] = yield_by_year.reindex(years).to_numpy()

dist1_df.tail()


### Visualization

In [None]:
# Yield series of district 1. An explicit copy is taken because later cells
# replace this frame's index and add columns in place; on a filtered slice
# that triggers pandas' SettingWithCopyWarning.
canola_dist1 = canola_filtered[canola_filtered['RM'] == 1].copy()

# Line plot of the raw yield series.
plt.figure(figsize=(8, 6))
plt.plot(canola_dist1['Canola'], label='Canola in district 1')
plt.title('Canola in District 1')
plt.legend()
plt.show()

### Differencing 

In [None]:
# Differencing orders (confirms that d is likely 0)
fig, (ax1, ax2, ax3) = plt.subplots(3)

yield_series = canola_dist1.Canola
# (axis, data, title, hide the x axis?) -- only the bottom panel keeps its x axis.
panels = [
    (ax1, yield_series, 'Original Series', True),
    (ax2, yield_series.diff(), '1st Order Differencing', True),
    (ax3, yield_series.diff().diff(), '2nd Order Differencing', False),
]
for axis, values, title, hide_x in panels:
    axis.plot(values)
    axis.set_title(title)
    if hide_x:
        axis.axes.xaxis.set_visible(False)
plt.show()

### Stationarity 

In [None]:
# Hold out the last 10 observations as test data; everything before is training data.
training_set = canola_dist1.iloc[:-10]
test_set = canola_dist1.iloc[-10:]

# Augmented Dickey-Fuller (ADF) stationarity test, first on the whole series,
# then on the training part only.
for yields in (canola_dist1['Canola'], training_set['Canola']):
    result = adfuller(yields)
    print(f'ADF Statistic: {result[0]}')
    print(f'p-value: {result[1]}')



### Detrending 

In [None]:
# Convert the index to annual periods ('A').
canola_dist1.index = canola_dist1.index.to_period('A')

# Read back the frequency of the converted index and show it.
frequency = canola_dist1.index.freq

print(frequency)

# Quadratic detrending (degree=2): the Detrender fits the polynomial trend and
# fit_transform returns the detrended series.
forecaster = PolynomialTrendForecaster(degree=2)
transformer = Detrender(forecaster=forecaster)
yt = transformer.fit_transform(canola_dist1['Canola'])


# Fit the same quadratic trend again to obtain the fitted trend line for plotting.
# fh_ins holds the non-positive horizons 0, -1, ..., -(n-1).
# NOTE(review): presumably these are sktime's in-sample relative horizons -- confirm.
forecaster = PolynomialTrendForecaster(degree=2)
fh_ins = -np.arange(len(canola_dist1['Canola'])) 
y_pred = forecaster.fit(canola_dist1['Canola']).predict(fh=fh_ins)

plot_series(canola_dist1['Canola'], y_pred, yt, labels=["y_train", "fitted quadratic trend", "residuals"]);

# The detrended series is what is tested for stationarity below.
residuals = yt

# ADF test on the detrended series.
result = adfuller(residuals)
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')


In [None]:
# Add the detrended yield series to the district frame as 'Canola_detrend'.
canola_dist1.loc[:,'Canola_detrend'] = yt

In [None]:
dist1_df.head()

# Feature Extraction 

### Average Temperature

In [None]:
# Annual mean of the weather frame ('A' resampling); its 't2m' column is the
# average temperature per year.
dist1_df_annual = dist1_df.resample('A').mean()

# Append the yearly means positionally as a new feature column.
column_to_append = list(dist1_df_annual['t2m'])
canola_dist1.loc[:, 'average_temp_in_year'] = column_to_append

canola_dist1.head()

In [None]:
# Average temperature per month (April-October): one feature column per month.

# The monthly resample does not depend on the loop variable, so it is computed
# once instead of once per month.
dist1_df_month = dist1_df.resample('MS').mean()

for month in range(4, 11):
    # Rows of this calendar month, one per year.
    month_data = dist1_df_month[dist1_df_month.index.month == month]

    column_to_append = month_data['t2m'].tolist()
    canola_dist1.loc[:, f'average_temp_in_{month}'] = column_to_append

canola_dist1.head()

In [None]:
dist1_df.head()

### Number of hot days 

In [None]:
# Number of hot days per year and per month (May-August).

# A day counts as "hot" when its maximum 't2m' exceeds this value.
# NOTE(review): 303 is presumably Kelvin (~30 degrees Celsius) -- confirm units.
HOT_DAY_THRESHOLD = 303

# NOTE: this is an alias, not a copy -- the helper columns below are added to
# dist1_df itself.
dist1_df_hot = dist1_df

# Add helper columns holding year, month and day of every timestamp.
dist1_df_hot['year'] = dist1_df_hot.index.year
dist1_df_hot['month'] = dist1_df_hot.index.month
dist1_df['day'] = dist1_df_hot.index.day

# Daily frame carrying yield, year and month (first record of each day).
daily_df = dist1_df.resample('D').first()
daily_df = daily_df[["Canola", "year", "month"]].copy()

# Maximum temperature of each day.
daily_max_temperature = dist1_df_hot['t2m'].resample('D').max()
daily_df['max_temp'] = daily_max_temperature
print(len(daily_df))

# Every year covered by the weather data; used to give years without any hot
# day an explicit 0. Computed once because it does not change inside the loop.
all_years = range(dist1_df_hot['year'].min(), dist1_df_hot['year'].max() + 1)

# Hot days per year.
hot_days_by_year = daily_df[daily_df["max_temp"] > HOT_DAY_THRESHOLD].groupby('year').size()
hot_days_by_year = hot_days_by_year.reindex(all_years, fill_value=0)

# Append the yearly counts as a feature column.
column_to_append = hot_days_by_year.tolist()
canola_dist1.loc[:, 'hot_days_in_year'] = column_to_append

# Hot days per year within each month from May to August.
for month in range(5, 9):
    month_data = daily_df[daily_df['month'] == month]

    hot_days_by_month = month_data[month_data["max_temp"] > HOT_DAY_THRESHOLD].groupby('year').size()
    hot_days_by_month = hot_days_by_month.reindex(all_years, fill_value=0)

    column_to_append = hot_days_by_month.tolist()
    canola_dist1.loc[:, f'hot_days_in_{month}'] = column_to_append




### Days without rain

In [None]:
# Daily precipitation totals.
dist1_df_perci = dist1_df.resample('D').sum()

print(dist1_df_perci)

# Known limitation (original author's note): resample() also creates the days
# from November to March, which have no records, and the sum of an all-NaN day
# is 0. Real observations contain zeros too, so dropping every zero also drops
# genuine observations. The zero-free frame is therefore used ONLY to estimate
# the quantile.
# NOTE(review): `.sum(min_count=1)` keeps all-NaN days as NaN and may remove
# the need for this workaround -- consider it.
dist1_df_perci_wo0 = dist1_df_perci[dist1_df_perci['tp'] != 0]

# Open question from the author: does one overall quantile for all months make sense?
quantile_5 = dist1_df_perci_wo0['tp'].quantile(0.05)
print("5% Quantile for 'tp':", quantile_5)

daily_df['sum_percipitation'] = dist1_df_perci['tp']

# A day counts as "without rain" when its total is at or below the 5% quantile.
# The same comparison is used for the yearly and the monthly counts (the
# original mixed `<=` and `<`).
is_dry_day = daily_df["sum_percipitation"] <= quantile_5

# Every year covered by the weather data; reindexing against it gives years
# (or months) without any dry day an explicit 0, so the list always has one
# entry per year when it is appended positionally below.
all_years = range(dist1_df_hot['year'].min(), dist1_df_hot['year'].max() + 1)

# Dry days per year.
days_without_rain_year = daily_df[is_dry_day].groupby('year').size()
days_without_rain_year = days_without_rain_year.reindex(all_years, fill_value=0)

column_to_append = days_without_rain_year.tolist()
canola_dist1.loc[:, 'days_without_rain_year'] = column_to_append

# Dry days per year within each month from April to October.
for month in range(4, 11):
    month_data = daily_df[is_dry_day & (daily_df['month'] == month)]

    days_without_rain_month = month_data.groupby('year').size()
    days_without_rain_month = days_without_rain_month.reindex(all_years, fill_value=0)

    column_to_append = days_without_rain_month.tolist()
    canola_dist1.loc[:, f'days_without_rain_{month}'] = column_to_append


canola_dist1.columns

### Precipitation in one year/month

In [None]:
# Total precipitation per year.
dist1_df_precipitation = dist1_df.resample('A').sum()

column_to_append = dist1_df_precipitation['tp'].tolist()
canola_dist1.loc[:, 'precipitation_in_year'] = column_to_append

# Total precipitation per month (April-October) in each year.
# The monthly resample does not depend on the loop variable, so it is computed
# once instead of once per month.
dist1_df_month = dist1_df.resample('MS').sum()

for month in range(4, 11):
    # Rows of this calendar month, one per year.
    month_data = dist1_df_month[dist1_df_month.index.month == month]

    column_to_append = month_data['tp'].tolist()
    canola_dist1.loc[:, f'precipitation_in_{month}'] = column_to_append

canola_dist1.head()


### Linear Regression model 

In [None]:
# Regression table: everything from the third column of canola_dist1 onwards.
# NOTE(review): the cells below read column 0 of this frame as the target
# ('Canola_detrend'), so the first two columns are presumably 'RM' and
# 'Canola' -- confirm the column order.
canola_lr = canola_dist1.iloc[:, 2:]

# Show the available columns.
canola_lr.columns

In [None]:
# OLS of the first column of canola_lr on all remaining columns, with intercept.
y = canola_lr.iloc[:, 0]
X = sm.add_constant(canola_lr.iloc[:, 1:])

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

# Multicollinearity diagnostics, part 1: correlation matrix of target and covariates.
correlation_matrix = canola_lr.corr()
print(correlation_matrix)

sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()

# Multicollinearity diagnostics, part 2: variance inflation factor for every
# column of the design matrix (including the constant).
from statsmodels.stats.outliers_influence import variance_inflation_factor

design_values = X.values
vif = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(design_values, col) for col in range(X.shape[1])],
})
print(vif)

In [None]:
# PCA to tackle multicollinearity

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# All covariates (everything except the target in column 0).
X = canola_lr.iloc[:, 1:]

# Standardise before PCA. The covariates mix temperatures, day counts and
# precipitation sums, and the original fitted the scaler but never applied it,
# so PCA ran on the unscaled columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full PCA on the standardised covariates; X_pca_new holds the component scores.
pca = PCA()
pca.fit(X_scaled)
X_pca_new = pca.transform(X_scaled)

def myplot(score,coeff,labels=None):
    """Draw a biplot on the current matplotlib figure.

    Scatters the first two columns of ``score`` (coloured by the module-level
    ``y``) and, for every row ``i`` of ``coeff``, draws an arrow from the
    origin to ``(coeff[i, 0], coeff[i, 1])`` with a text label next to it.
    The label is ``labels[i]`` when ``labels`` is given, otherwise "Var<i+1>".
    """
    plt.scatter(score[:, 0], score[:, 1], c=y)  # without scaling

    for row in range(coeff.shape[0]):
        tip_x, tip_y = coeff[row, 0], coeff[row, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)

        caption = "Var" + str(row + 1) if labels is None else labels[row]
        # The text sits slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption, color='g', ha='center', va='center')

# Axis labels and grid for the biplot drawn by myplot() below.
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()

# myplot() reads coeff[i, 0] / coeff[i, 1] as the coordinates of variable i on
# the two plotted axes, i.e. it needs one row per feature. pca.components_ has
# one row per *component* (shape n_components x n_features), so the first two
# components are passed transposed. Passing pca.components_ directly plotted,
# for each component, its loadings on the first two features instead.
myplot(X_pca_new[:, 0:2], np.transpose(pca.components_[0:2, :]))
plt.show()

# pca_model = sm.OLS(y, X_pca)
# pca_results = pca_model.fit()

# print(pca_results.summary())

# Sanity check: the explained-variance ratios of all components sum to ~1.
print(sum(pca.explained_variance_ratio_))


In [None]:
X.columns

In [None]:
# Get the most important feature on each PC, by name, and save them into a pandas df.
# NOTE(review): PCA is fit on X as-is here; if X is not standardised, the
# loadings are dominated by the largest-variance columns -- consider scaling.

model = PCA(n_components=3).fit(X)
X_pc = model.transform(X)

# Number of components.
n_pcs = model.components_.shape[0]

# Index of the feature with the largest absolute loading on EACH component.
most_important = [np.abs(model.components_[i]).argmax() for i in range(n_pcs)]

# Take the feature names from X itself rather than from a hand-maintained
# list, so names and loadings cannot get out of sync when columns are added,
# removed or reordered.
initial_feature_names = list(X.columns)

# Translate the indices into names.
most_important_names = [initial_feature_names[idx] for idx in most_important]

# Map each component label to its dominant feature.
dic = {'PC{}'.format(i): most_important_names[i] for i in range(n_pcs)}

# Build the dataframe.
df_pca = pd.DataFrame(dic.items())

print(df_pca)

In [None]:
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

# Fit a two-component partial least squares regression of y on X and print
# the fitted estimator object.
pls = PLSRegression(n_components=2)
pls_reg = pls.fit(X, y)
print(pls_reg)

In [None]:
# Linear regression using just the monthly average temperatures (April-October).
# Columns are selected by name: the positional slice `iloc[:, 2:9]` silently
# picks different covariates as soon as a feature column is added or reordered.

monthly_temp_columns = [f'average_temp_in_{month}' for month in range(4, 11)]

y = canola_lr['Canola_detrend']
X = canola_lr[monthly_temp_columns]

X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# Linear regression on log-transformed covariates; author's finding: no difference.
# NOTE(review): the original heading says "all covs", but the slice below
# selects only columns 2..8 of canola_lr (the same slice as the monthly
# average-temperature model) -- confirm which was intended.

y = canola_lr.iloc[:, 0] 
X = canola_lr.iloc[:, 2:9] 

# log1p = log(1 + x), applied element-wise before adding the intercept.
X =  np.log1p(X)
X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# Linear regression using just the monthly hot-day counts (May-August).
# Columns are selected by name: the positional slice `iloc[:, 10:14]` silently
# picks different covariates as soon as a feature column is added or reordered.

monthly_hot_day_columns = [f'hot_days_in_{month}' for month in range(5, 9)]

y = canola_lr['Canola_detrend']
X = canola_lr[monthly_hot_day_columns]

X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# Linear regression using just the monthly days-without-rain counts (April-October).
# Columns are selected by name: the positional slice `iloc[:, 15:22]` silently
# picks different covariates as soon as a feature column is added or reordered.

monthly_dry_day_columns = [f'days_without_rain_{month}' for month in range(4, 11)]

y = canola_lr['Canola_detrend']
X = canola_lr[monthly_dry_day_columns]

X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# Linear regression using just the monthly precipitation sums (April-October).
# Columns are selected by name: the open-ended positional slice `iloc[:, 23:]`
# would also swallow any feature column appended later.

monthly_precipitation_columns = [f'precipitation_in_{month}' for month in range(4, 11)]

y = canola_lr['Canola_detrend']
X = canola_lr[monthly_precipitation_columns]

X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# OLS of the target (first column of canola_lr) on the four yearly covariates.
yearly_covariates = [
    'average_temp_in_year',
    'precipitation_in_year',
    'days_without_rain_year',
    'hot_days_in_year',
]

y = canola_lr.iloc[:, 0]
X = sm.add_constant(canola_lr.loc[:, yearly_covariates])

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

In [None]:
# OLS of the target (first column of canola_lr) on the four covariates for May.
may_covariates = [
    'average_temp_in_5',
    'precipitation_in_5',
    'days_without_rain_5',
    'hot_days_in_5',
]

y = canola_lr.iloc[:, 0]
X = sm.add_constant(canola_lr.loc[:, may_covariates])

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())

### Simple linear Regression model 

In [None]:
# Hot days in July: simple regression on all years, then again without the
# two outlier years. Both fits are drawn into the same figure.

y = canola_lr.iloc[:, 0]
X_1 = sm.add_constant(canola_lr[["hot_days_in_7"]])

model = sm.OLS(y, X_1)
results_one = model.fit()

print(results_one.summary())

# First layer: every observation, labelled 'Outliers'. The second scatter
# further down redraws all remaining years on top of it.
plt.scatter(canola_lr[["hot_days_in_7"]], canola_lr[["Canola_detrend"]], label='Outliers')

# Regression line of the fit on all years.
plt.plot(canola_lr[["hot_days_in_7"]], results_one.predict(), color='blue', label='Regression line complete')

plt.xlabel('Hot days in july')
plt.ylabel('Canola yield')
plt.legend()

# No plt.show() here: the adjusted fit is added to the same figure.

# Target and July hot-day count only; the outliers are 2017 and 2021.
df_july = canola_lr[["Canola_detrend", "hot_days_in_7"]]

years_to_remove = ['2017', '2021']

# Remove the outlier years from the frame.
df_july = df_july.drop(years_to_remove, axis=0)

# Regression with the outliers removed.
y = df_july["Canola_detrend"]
X_mod = sm.add_constant(df_july["hot_days_in_7"])

model = sm.OLS(y, X_mod)
results_one = model.fit()

print(results_one.summary())

# Second layer: the remaining years and their regression line.
plt.scatter(df_july[["hot_days_in_7"]], df_july[["Canola_detrend"]], label='Data points')
plt.plot(df_july[["hot_days_in_7"]], results_one.predict(), color='red', label='Regression line adjusted')

plt.xlabel('Hot days in july')
plt.ylabel('Canola yield')
plt.legend()




In [None]:
# Simple regression of the target on the yearly hot-day count.
covariate = "hot_days_in_year"

y = canola_lr.iloc[:, 0]
X_1 = sm.add_constant(canola_lr[[covariate]])

model = sm.OLS(y, X_1)
results_one = model.fit()

print(results_one.summary())

# Observations and fitted line in one figure.
plt.scatter(canola_lr[[covariate]], canola_lr[["Canola_detrend"]], label='Data points')
plt.plot(canola_lr[[covariate]], results_one.predict(), color='red', label='Regression line')

plt.xlabel('Hot days per year')
plt.ylabel('Canola yield')
plt.legend()

plt.show()

In [None]:
# Average temperature in July.
# The original regressed on 'average_temp_in_8' (August) although both the
# heading and the axis label refer to July; month 7 is used here so that the
# model matches its description.

y = canola_lr.iloc[:, 0]
X_1 = canola_lr[["average_temp_in_7"]]

X_1 = sm.add_constant(X_1)

model = sm.OLS(y, X_1)
results_one = model.fit()

print(results_one.summary())

plt.scatter(canola_lr[["average_temp_in_7"]], canola_lr[["Canola_detrend"]], label='Data points')

# Plot the regression line
plt.plot(canola_lr[["average_temp_in_7"]], results_one.predict(), color='red', label='Regression line')

# Add labels and a legend
plt.xlabel('Average temp in july')
plt.ylabel('Canola yield')
plt.legend()

# Show the plot
plt.show()

# Ordinal scale? 

In [None]:
# Simple regression of the target on the yearly average temperature.
covariate = "average_temp_in_year"

y = canola_lr.iloc[:, 0]
X_1 = sm.add_constant(canola_lr[[covariate]])

model = sm.OLS(y, X_1)
results_one = model.fit()

# Kept for the residual diagnostics in the following cells.
fitted_values = results_one.fittedvalues
residuals = results_one.resid

print(results_one.summary())

# Observations and fitted line in one figure.
plt.scatter(canola_lr[[covariate]], canola_lr[["Canola_detrend"]], label='Data points')
plt.plot(canola_lr[[covariate]], results_one.predict(), color='red', label='Regression line')

plt.xlabel('Average temp in year')
plt.ylabel('Canola yield')
plt.legend()

plt.show()

In [None]:
# Q-Q plot of `residuals` with a 45-degree reference line (fit=True).
sm.qqplot(residuals, line='45', fit=True)
plt.show()

In [None]:
# Residuals against fitted values, with a dashed zero line for reference.
plt.scatter(fitted_values, residuals)
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.title("Residuals vs Fitted Values")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Simple regression of the target on the precipitation sum of July.
covariate = "precipitation_in_7"

y = canola_lr.iloc[:, 0]
X_1 = sm.add_constant(canola_lr[[covariate]])

model = sm.OLS(y, X_1)
results_one = model.fit()

print(results_one.summary())

# Observations and fitted line in one figure.
plt.scatter(canola_lr[[covariate]], canola_lr[["Canola_detrend"]], label='Data points')
plt.plot(canola_lr[[covariate]], results_one.predict(), color='red', label='Regression line')

plt.xlabel('Percipitation in july')
plt.ylabel('Canola yield')
plt.legend()

plt.show()

In [None]:
len(canola_lr)

In [None]:
# Simple regression of the target on the days without rain in May.
covariate = "days_without_rain_5"

y = canola_lr.iloc[:, 0]
X_1 = sm.add_constant(canola_lr[[covariate]])

model = sm.OLS(y, X_1)
results_one = model.fit()

print(results_one.summary())

# Observations and fitted line in one figure.
plt.scatter(canola_lr[[covariate]], canola_lr[["Canola_detrend"]], label='Data points')
plt.plot(canola_lr[[covariate]], results_one.predict(), color='red', label='Regression line')

plt.xlabel('days without rain in 5')
plt.ylabel('Canola yield')
plt.legend()

plt.show()

# Author's finding: none are significant.

### ARIMA 

In [None]:
# ACF and PACF plots of the training yields, side by side, up to lag 20.
# NOTE(review): the original inline notes read "p = 1" next to the ACF and
# "q = 1" next to the PACF; conventionally the ACF informs the MA order q and
# the PACF the AR order p -- confirm which reading was meant.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

plot_acf(training_set['Canola'], lags=20, ax=ax1) # ACF -> MA order q (author's reading: 1)
plot_pacf(training_set['Canola'], lags=20, ax=ax2) # PACF -> AR order p (author's reading: 1)

plt.show()


In [None]:
# Fit an ARIMA(p, d, q) model on the training yields.
# NOTE(review): the order used here is (0, 1, 1), while the differencing cell
# notes that d is likely 0 -- confirm the chosen order.

p, d, q = (0, 1, 1)
model = ARIMA(training_set['Canola'], order=(p, d, q), freq = "AS-JAN")
results = model.fit()

# Print model summary
print(results.summary())

# Plot the residual diagnostics of the fitted model
results.plot_diagnostics(figsize=(10, 8))
plt.show()

In [None]:
# Forecast as many steps as the held-out test set has observations and plot
# the result against the training and test data.

forecast_steps = len(test_set)
forecast = results.get_forecast(steps=forecast_steps)
forecast_ci = forecast.conf_int()

# Extend the forecast mean to an index that starts with the training data, so
# that it is NaN before the first forecast step. The padded series is kept in
# a local variable instead of being written back onto the forecast object.
forecast_end_date = forecast.predicted_mean.index[-1]
full_index = pd.date_range(start=training_set.index[0], end=forecast_end_date, freq='AS-JAN')
predicted_mean = forecast.predicted_mean.reindex(full_index)

plt.figure(figsize=(10, 6))
plt.plot(training_set['Canola'], label='Observed')
plt.plot(test_set.index, test_set['Canola'], label='Test Set (Observed)', color='green')
plt.plot(predicted_mean, color='red', label='Forecast')
# Shaded band between the lower and upper confidence bounds.
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], color='red', alpha=0.2)
plt.title('ARIMA Forecast')
plt.legend()
plt.show()

### Exponential Smoothing 

In [None]:
# Grid search for the simple-exponential-smoothing level alpha in {0.1, ..., 0.9},
# scored by the mean squared one-step-ahead error.
alpha_star = None
best_mse = None

# Select the yield series by name. The original took *all* columns of
# canola_dist1 and then read column 0, which is the first column of that frame
# rather than 'Canola' (it appears to be the constant 'RM' id), so the search
# was run on the wrong series.
dat = canola_dist1[['Canola']].values.astype('float32')

mean_results_for_all_possible_alpha_values = np.zeros(9)
for alpha in range(0, 9):
    smoothing_level = (alpha + 1) * 0.1

    # Initial level: mean of the first five observations.
    pt = np.mean(dat[:, 0][0:5])
    squared_errors = np.zeros(len(dat))
    squared_errors[0] = np.power(dat[0][0] - pt, 2)
    for i in range(1, len(dat)):
        # Update the level with the previous observation, then score the
        # prediction for the current one.
        pt = pt + smoothing_level * (dat[i - 1][0] - pt)
        squared_errors[i] = np.power(dat[i][0] - pt, 2)
    mean_results_for_all_possible_alpha_values[alpha] = np.mean(squared_errors)

# Grid position k corresponds to alpha = (k + 1) * 0.1.
alpha_star = (np.argmin(mean_results_for_all_possible_alpha_values) + 1) * 0.1
best_mse = np.min(mean_results_for_all_possible_alpha_values)
print("Best MSE = %s" % best_mse)
print("Optimal alpha = %s" % alpha_star)

In [None]:
# Simple exponential smoothing on the training yields with a fixed level of 0.1
# (no optimisation).
model3 = SimpleExpSmoothing(training_set['Canola'])
model3_fit = model3.fit(smoothing_level=0.1, optimized=False)

# Copy of the test set with the forecast attached as 'ses_forecast'.
ses_prediction = model3_fit.forecast(len(test_set))
y_hat_ses = test_set.assign(ses_forecast=ses_prediction)

print(y_hat_ses['ses_forecast'])

In [None]:
# Training data, test data and SES forecast in one wide figure.
plt.figure(figsize=(25,7))
plt.grid()
curves = (
    (training_set['Canola'], 'Train'),
    (test_set['Canola'], 'Test'),
    (y_hat_ses['ses_forecast'], 'Forecast'),
)
for values, caption in curves:
    plt.plot(values, label=caption)
plt.legend(loc='best')
plt.title('Simple Exponential Smoothing')
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Build the inputs for the LSTM from canola_lr: 'Canola_detrend' is the
# target, every other column is a feature.
features = canola_lr.drop(columns='Canola_detrend').to_numpy()
target = canola_lr[['Canola_detrend']].to_numpy()

# Standardise the features.
scaler = StandardScaler()
features = scaler.fit_transform(features)

# 80/20 split into training and test data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors. The inputs get a trailing axis of size 1, so
# each feature becomes one step of the input sequence.
X_train_tensor = torch.Tensor(X_train)[:, :, None]
y_train_tensor = torch.Tensor(y_train)

X_test_tensor = torch.Tensor(X_test)[:, :, None]
y_test_tensor = torch.Tensor(y_test)

# Define the LSTM model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """Create the batch-first LSTM stack and the linear output layer."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and map its last-step output through ``fc``."""
        sequence_output, _ = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Hyperparameters
# input_size is the size of the last axis of the training tensor.
input_size = X_train_tensor.shape[2]
hidden_size = 50
num_layers = 2
output_size = 1
learning_rate = 0.001
num_epochs = 30 # author's note: still way too few

# Initialize the model, the MSE loss and the Adam optimizer
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training: one full-batch gradient step per epoch
for epoch in range(num_epochs):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Clear the gradients of the previous step before back-propagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}') # print loss every 5th epoch

# Evaluation on the test set, without gradient tracking
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor)

print(f'Test Loss: {test_loss.item():.4f}')


In [None]:
import requests

# Layer endpoint of the rural-municipality feature service.
url = "https://services9.arcgis.com/WJsMXAAF3vSdDYis/arcgis/rest/services/SaskAdmin_2016_rural_municipality/FeatureServer/0"

# The template placeholders ("param1"/"param2" and the "Bearer YOUR_TOKEN"
# header) are removed: they carry no meaning for this endpoint.
# NOTE(review): `f=json` is presumably what makes the ArcGIS REST endpoint
# answer with JSON rather than an HTML page -- confirm against the service.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params={"f": "json"}, timeout=30)

# Check the status code
# if response.status_code == 200:
#     # The request was successful
#     data = response.json()
# else:
#     # Handle the error
#     print(f"Error: {response.status_code}")