# Canola Yield: Exploratory and Sequential Analysis on all Regions

## Libraries

In [1]:
import glob

import xarray as xr

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import netCDF4

pd.options.display.max_rows = 1000

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# from standard_precip.spi import SPI
# from standard_precip.utils import plot_index



In [2]:
!pip install standard-precip

Collecting standard-precip
  Obtaining dependency information for standard-precip from https://files.pythonhosted.org/packages/9d/ab/61c9a06ab0c23e985cb36e3196ac7d9fb736047a60e48218c5a2411654cd/standard_precip-1.0-py3-none-any.whl.metadata
  Downloading standard_precip-1.0-py3-none-any.whl.metadata (423 bytes)
Downloading standard_precip-1.0-py3-none-any.whl (21 kB)
Installing collected packages: standard-precip
Successfully installed standard-precip-1.0


## Data

In [3]:
df = pd.read_csv('/kaggle/input/rm-yields-data/rm-yields-data.csv')

canola = df[['Year', 'RM', 'Canola']].copy()
canola['Year'] = pd.to_datetime(canola['Year'], format='%Y')

canola.drop(canola.index[:33], inplace=True)

df_pivot = canola.pivot_table(index='Year', columns='RM', values='Canola', aggfunc='mean')

# Count NaN values in each column
na_counts = df_pivot.isna().sum()

# Filter columns with 17 or fewer NaN values
filtered_columns = na_counts[na_counts <= 17].index

# Create a new DataFrame with selected columns
df_filtered = df_pivot[filtered_columns]
df_filtered = df_filtered.drop(index=df_filtered.index[:18])

canola_dist1 = canola[canola['RM'] == 1]

In [4]:
#read dataframe 
canola_2 = pd.read_csv('/kaggle/input/rm-yields-data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)
canola_small = canola_2.iloc[:, [0, 2]].copy()

#cut of first 33 observations (NAs)
canola_small.drop(canola_small.index[:33], inplace=True)

#filter out every observation that contains NAs
canola_filtered = canola_small.groupby('RM').filter(lambda group: not group['Canola'].isnull().any())

print(canola_filtered)

# how may districts? 148
num_districts = canola_filtered.groupby('RM').ngroups

             RM  Canola
Year                   
1971-01-01    1    18.0
1972-01-01    1    18.0
1973-01-01    1    20.0
1974-01-01    1    16.0
1975-01-01    1    15.0
...         ...     ...
2018-01-01  622    39.3
2019-01-01  622    46.1
2020-01-01  622    40.9
2021-01-01  622    23.9
2022-01-01  622    44.4

[7658 rows x 2 columns]


  canola_2 = pd.read_csv('/kaggle/input/rm-yields-data/rm-yields-data.csv', header=0, index_col=0, parse_dates=True)


In [5]:
canola_dist1 = canola_filtered[canola_filtered['RM'] == 1]

#change frequency to yearly 
canola_dist1.index = canola_dist1.index.to_period('A')

#change frequency 
frequency = canola_dist1.index.freq

#print(frequency)

# from sktime.forecasting.trend import PolynomialTrendForecaster

# Quadratic detrending
# forecaster = PolynomialTrendForecaster(degree=1)
# transformer = Detrender(forecaster=forecaster)
# yt = transformer.fit_transform(canola_dist1['Canola'])



# forecaster = PolynomialTrendForecaster(degree=2)
# fh_ins = -np.arange(len(canola_dist1['Canola'])) 
# y_pred = forecaster.fit(canola_dist1['Canola']).predict(fh=fh_ins)

# plot_series(canola_dist1['Canola'], y_pred, yt, labels=["Canola", "fitted quadratic trend", "residuals"]);

In [6]:
%ls /kaggle/input/copernicus-data/*.nc

/kaggle/input/copernicus-data/data_1970.nc
/kaggle/input/copernicus-data/data_1971.nc
/kaggle/input/copernicus-data/data_1972.nc
/kaggle/input/copernicus-data/data_1973.nc
/kaggle/input/copernicus-data/data_1974.nc
/kaggle/input/copernicus-data/data_1975.nc
/kaggle/input/copernicus-data/data_1976.nc
/kaggle/input/copernicus-data/data_1977.nc
/kaggle/input/copernicus-data/data_1978.nc
/kaggle/input/copernicus-data/data_1979.nc
/kaggle/input/copernicus-data/data_1980.nc
/kaggle/input/copernicus-data/data_1981.nc
/kaggle/input/copernicus-data/data_1982.nc
/kaggle/input/copernicus-data/data_1983.nc
/kaggle/input/copernicus-data/data_1984.nc
/kaggle/input/copernicus-data/data_1985.nc
/kaggle/input/copernicus-data/data_1986.nc
/kaggle/input/copernicus-data/data_1987.nc
/kaggle/input/copernicus-data/data_1988.nc
/kaggle/input/copernicus-data/data_1989.nc
/kaggle/input/copernicus-data/data_1990.nc
/kaggle/input/copernicus-data/data_1991.nc
/kaggle/input/copernicus-data/data_1992.nc
/kaggle/inp

In [7]:
cop_all = xr.open_mfdataset(paths='/kaggle/input/copernicus-data/*.nc', combine='by_coords') # , concat_dim="time"?

In [8]:
# use mean of latitude, longitude dims as they only deviate marginally
cop_all.items

<bound method Mapping.items of <xarray.Dataset>
Dimensions:    (longitude: 88, latitude: 41, time: 277344)
Coordinates:
  * longitude  (longitude) float32 -110.0 -109.9 -109.8 ... -101.5 -101.4 -101.3
  * latitude   (latitude) float32 53.0 52.9 52.8 52.7 ... 49.3 49.2 49.1 49.0
  * time       (time) datetime64[ns] 1970-04-01 ... 2023-10-31T23:00:00
Data variables:
    t2m        (time, latitude, longitude) float32 dask.array<chunksize=(5136, 41, 88), meta=np.ndarray>
    tp         (time, latitude, longitude) float32 dask.array<chunksize=(5136, 41, 88), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2024-01-22 09:15:12 GMT by grib_to_netcdf-2.24.0: /opt/ecmw...>

In [9]:
import xarray as xr

# Assuming cop_all is your xarray dataset

# Round latitude and longitude to form clusters
cop_all['latitude_clustered'] = cop_all['latitude'].round(1)
cop_all['longitude_clustered'] = cop_all['longitude'].round(1)

# Define the coarsening factor for latitude and longitude
coarsen_factor = 2  # Adjust this based on your desired level of coarsening

# Use coarsen to perform spatial averaging with 'trim' boundary option
coarsened_dataset = cop_all.coarsen(latitude=coarsen_factor, longitude=coarsen_factor, boundary='trim').mean()

# Optional: Drop the latitude_clustered and longitude_clustered variables if not needed
coarsened_dataset = coarsened_dataset.drop(['latitude_clustered', 'longitude_clustered'])

# Print the resulting dataset
print(coarsened_dataset)

<xarray.Dataset>
Dimensions:    (time: 277344, latitude: 20, longitude: 44)
Coordinates:
  * longitude  (longitude) float32 -109.9 -109.8 -109.6 ... -101.8 -101.6 -101.4
  * latitude   (latitude) float32 52.95 52.75 52.55 52.35 ... 49.55 49.35 49.15
  * time       (time) datetime64[ns] 1970-04-01 ... 2023-10-31T23:00:00
Data variables:
    t2m        (time, latitude, longitude) float32 dask.array<chunksize=(5136, 20, 44), meta=np.ndarray>
    tp         (time, latitude, longitude) float32 dask.array<chunksize=(5136, 20, 44), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.6
    history:      2024-01-22 09:15:12 GMT by grib_to_netcdf-2.24.0: /opt/ecmw...


In [10]:
from standard_precip.spi import SPI
from standard_precip.utils import plot_index

In [11]:
# Convert xarray dataset to Pandas DataFrame
cop_all_df = coarsened_dataset.to_dataframe().reset_index()

# Select relevant columns (time, latitude, longitude, tp)
rainfall_data = cop_all_df[['time', 'latitude', 'longitude', 'tp']]

In [12]:
import xarray as xr
import pandas as pd
from scipy.stats import gamma
from standard_precip.spi import SPI

# Convert xarray dataset to Pandas DataFrame
new_tp_df = coarsened_dataset.to_dataframe().reset_index()

# Select relevant columns (time, latitude, longitude, tp)
rainfall_data = new_tp_df[['time', 'latitude', 'longitude', 'tp']]

# Initialize the SPI class
spi = SPI()

# Calculate the 1-Month SPI using Gamma function and L-moments
df_spi = spi.calculate(
    rainfall_data,
    'time',
    'tp',
    freq="M",
    scale=1,
    fit_type="lmom",
    dist_type="gam"
)

Found duplicate dates in dataframe. Removing duplicates and using first date found


In [13]:
print(df_spi)

                      time            tp  tp_calculated_index
0      1970-04-01 00:00:00  6.033108e-06             0.125820
1      1970-04-01 01:00:00  3.725290e-09            -0.389423
2      1970-04-01 02:00:00  3.725290e-09            -0.389423
3      1970-04-01 03:00:00  3.725290e-09            -0.389423
4      1970-04-01 04:00:00  2.775341e-07            -0.149919
...                    ...           ...                  ...
277339 2023-10-31 19:00:00  2.203928e-05             0.434702
277340 2023-10-31 20:00:00  2.240343e-05             0.437013
277341 2023-10-31 21:00:00  2.240343e-05             0.437013
277342 2023-10-31 22:00:00  2.240343e-05             0.437013
277343 2023-10-31 23:00:00  2.240343e-05             0.437013

[277344 rows x 3 columns]


In [19]:
# Add latitude and longitude columns back to the DataFrame
df_spi['latitude'] = coarsened_dataset['latitude'].values[0]
df_spi['longitude'] = coarsened_dataset['longitude'].values[0]

# Create a new xarray dataset from the Pandas DataFrame
df_spi_xr = xr.Dataset.from_dataframe(df_spi.set_index(['time', 'latitude', 'longitude']))

df_spi_xr1 = df_spi_xr[['tp_calculated_index']]

# Print the resulting xarray dataset
print(df_spi_xr1)

<xarray.Dataset>
Dimensions:              (time: 277344, latitude: 1, longitude: 1)
Coordinates:
  * time                 (time) datetime64[ns] 1970-04-01 ... 2023-10-31T23:0...
  * latitude             (latitude) float32 52.95
  * longitude            (longitude) float32 -109.9
Data variables:
    tp_calculated_index  (time, latitude, longitude) float64 0.1258 ... 0.437


In [20]:
coarsened_dataset.update(df_spi_xr1)

coarsened_dataset_monthly = coarsened_dataset.resample(time='1M').mean()

print(coarsened_dataset_monthly) # why only one pair of coords available for tp_calculated_index?

<xarray.Dataset>
Dimensions:              (longitude: 44, latitude: 20, time: 643)
Coordinates:
  * longitude            (longitude) float32 -109.9 -109.8 ... -101.6 -101.4
  * latitude             (latitude) float32 52.95 52.75 52.55 ... 49.35 49.15
  * time                 (time) datetime64[ns] 1970-04-30 ... 2023-10-31
Data variables:
    t2m                  (time, latitude, longitude) float32 dask.array<chunksize=(1, 20, 44), meta=np.ndarray>
    tp                   (time, latitude, longitude) float32 dask.array<chunksize=(1, 20, 44), meta=np.ndarray>
    tp_calculated_index  (time, latitude, longitude) float64 0.1413 nan ... nan
Attributes:
    Conventions:  CF-1.6
    history:      2024-01-22 09:15:12 GMT by grib_to_netcdf-2.24.0: /opt/ecmw...


In [27]:
# Calculate the number of steps to include 49.0
num_steps = int((52.95 - 49.15) / 0.2) + 1

x = xr.Dataset(
    {
        "soil_zone": (
            ("latitude", "longitude", "time"),
            np.random.randint(1, 4, size=(num_steps, 43, 643)),  # Adjust dimensions
        ),
    },
    coords={
        "latitude": np.linspace(52.95, 49.15, num=num_steps),  # Include 49.0
        "longitude": np.arange(-109.9, -101.4, 0.2),
        "time": pd.date_range("1970-04-30", periods=643, freq="M"),
    },
)

print(x)

<xarray.Dataset>
Dimensions:    (latitude: 20, longitude: 43, time: 643)
Coordinates:
  * latitude   (latitude) float64 52.95 52.75 52.55 52.35 ... 49.55 49.35 49.15
  * longitude  (longitude) float64 -109.9 -109.7 -109.5 ... -101.9 -101.7 -101.5
  * time       (time) datetime64[ns] 1970-04-30 1970-05-31 ... 2023-10-31
Data variables:
    soil_zone  (latitude, longitude, time) int64 3 1 3 2 3 2 1 ... 3 2 2 2 3 1 2


In [17]:
monthly_data.update(x)

print(monthly_data)

NameError: name 'monthly_data' is not defined

In [None]:
print(dim(monthly_data['soil_zone']))

In [None]:
print(monthly_data['mean_max_t2m'].values)

In [None]:
import matplotlib.pyplot as plt

# Access the values of mean_max_t2m
mean_max_t2m_values = monthly_data['mean_max_t2m'].values

# Plotting a specific time index (you can adjust the index as needed)
time_index = 600 # April 2020
plt.imshow(mean_max_t2m_values[time_index, :, :])

# Add labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title(f'Mean Max Temperature at {monthly_data["time"].values[time_index]}')

# Show the plot
plt.colorbar()
plt.show()

In [None]:
column_to_append = canola_dist1['Canola'].tolist()
print(column_to_append)

In [None]:
# dist1_df = cop_all.to_dataframe()

# Resample to weekly frequency and calculate the maximum for each week
weekly_max_t2m = cop_all['t2m'].resample(time='1W').max()

# Print the resulting dataset
print(weekly_max_t2m)

# years = dist1_df.index.year
# dist1_df['Canola'] = [column_to_append[year - 1971] for year in years]

## Exploratory Analysis

In [None]:
# Choose a specific time index to plot
time_index = 0

# Plotting
fig, ax = plt.subplots(subplot_kw={'projection': ccrs.PlateCarree()}, figsize=(10, 6))

# Plot the data
im = weekly_max_t2m.isel(time=time_index).plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), cmap='viridis', add_colorbar=False)

# Add coastlines and borders
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle=':')

# Add colorbar
cbar = plt.colorbar(im, ax=ax, label='Temperature (K)')

# Add title
plt.title(f'Weekly Maximum 2m Temperature on {weekly_max_t2m.time.isel(time=time_index).values}', fontsize=16)

plt.show()

In [None]:
# Resample to monthly frequency and calculate the mean for each month
monthly_mean_max_t2m = cop_all['t2m'].resample(time='1M').max()

print(monthly_mean_max_t2m)

In [None]:
# Choose a specific June for the plot
target_june = '2023-06-30'  # Adjust the date accordingly

# Plotting
fig, ax = plt.subplots(subplot_kw={'projection': ccrs.PlateCarree()}, figsize=(10, 6))

# Plot the data for the specific June
im = monthly_mean_max_t2m.sel(time=target_june).plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), cmap='viridis', add_colorbar=False)

# Add coastlines and borders
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle=':')

# Add colorbar
cbar = plt.colorbar(im, ax=ax, label='Temperature (K)')

# Add title
plt.title(f'Average Maximum 2m Temperature for June {target_june}', fontsize=16)

# Add latitude and longitude indices
ax.set_xticks(monthly_mean_max_t2m.longitude.values)
ax.set_yticks(monthly_mean_max_t2m.latitude.values)
ax.xaxis.set_major_formatter('{:.1f}°E'.format)
ax.yaxis.set_major_formatter('{:.1f}°N'.format)

plt.show()

In [None]:
# Specify the target Junes
target_june_1970 = '1970-06-30'
target_june_2023 = '2023-06-30'

# Calculate a common vmin and vmax for both plots
common_vmin = monthly_mean_max_t2m.min().values
common_vmax = monthly_mean_max_t2m.max().values

# Plotting for 1970 June
fig, ax1 = plt.subplots(subplot_kw={'projection': ccrs.PlateCarree()}, figsize=(10, 6))
im1 = monthly_mean_max_t2m.sel(time=target_june_1970).plot.pcolormesh(ax=ax1, transform=ccrs.PlateCarree(), cmap='viridis', add_colorbar=False, vmin=common_vmin, vmax=common_vmax)
ax1.coastlines()
ax1.add_feature(cfeature.BORDERS, linestyle=':')
ax1.set_xticks(monthly_mean_max_t2m.longitude.values)
ax1.set_yticks(monthly_mean_max_t2m.latitude.values)
ax1.xaxis.set_major_formatter('{:.1f}°E'.format)
ax1.yaxis.set_major_formatter('{:.1f}°N'.format)
cbar1 = plt.colorbar(im1, ax=ax1, label='Temperature (K)')
plt.title(f'Average Maximum 2m Temperature for June {target_june_1970}', fontsize=16)

# Plotting for 2023 June
fig, ax2 = plt.subplots(subplot_kw={'projection': ccrs.PlateCarree()}, figsize=(10, 6))
im2 = monthly_mean_max_t2m.sel(time=target_june_2023).plot.pcolormesh(ax=ax2, transform=ccrs.PlateCarree(), cmap='viridis', add_colorbar=False, vmin=common_vmin, vmax=common_vmax)
ax2.coastlines()
ax2.add_feature(cfeature.BORDERS, linestyle=':')
ax2.set_xticks(monthly_mean_max_t2m.longitude.values)
ax2.set_yticks(monthly_mean_max_t2m.latitude.values)
ax2.xaxis.set_major_formatter('{:.1f}°E'.format)
ax2.yaxis.set_major_formatter('{:.1f}°N'.format)
cbar2 = plt.colorbar(im2, ax=ax2, label='Temperature (K)')
plt.title(f'Average Maximum 2m Temperature for June {target_june_2023}', fontsize=16)

plt.show()

In [None]:
t2m_monthly_avg_max = monthly_mean_max_t2m.to_dataframe()

# Display the resulting DataFrame
print(t2m_monthly_avg_max)

In [None]:
monthly_total_precipitation = cop_all['tp'].resample(time='1M').sum()
monthly_total_precipitation_df = monthly_total_precipitation.to_dataframe()

In [None]:
print(monthly_total_precipitation_df)

In [None]:
# Combine the two DataFrames along the columns axis
combined_df = pd.concat([t2m_monthly_avg_max, monthly_total_precipitation_df], axis=1)

# Display the resulting DataFrame
print(combined_df)

In [None]:
combined_df.head

In [None]:
# Extract features (t2m and tp) for clustering
features = combined_df[['t2m', 'tp']]

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
features_scaled_imputed = imputer.fit_transform(features_scaled)

# Apply k-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
combined_df['cluster'] = kmeans.fit_predict(features_scaled_imputed)

# Display the resulting DataFrame with clusters
print(combined_df)

In [None]:
# Scatter plot for temperature vs precipitation, color-coded by clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='t2m', y='tp', hue='cluster', data=combined_df, palette='viridis', s=50)
plt.title('Clusters of Temperature vs Precipitation')
plt.xlabel('Temperature (t2m)')
plt.ylabel('Total Precipitation (tp)')
plt.show()

In [None]:
# Create a Cartopy map with a PlateCarree projection
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': ccrs.PlateCarree()})
ax.set_extent([-180, 180, -90, 90])  # Set the map extent

# Add countries as a background feature
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.COASTLINE)

# Scatter plot for each data point with color-coded clusters
sc = ax.scatter(combined_df.index.get_level_values('longitude'), combined_df.index.get_level_values('latitude'),
                c=combined_df['cluster'], cmap='viridis', s=50, alpha=0.7, transform=ccrs.PlateCarree())

# Add colorbar
cbar = plt.colorbar(sc, ax=ax, orientation='vertical', fraction=0.046, pad=0.04, label='Cluster')

# Display the map
plt.show()

In [None]:
# Extract latitude and longitude ranges from your data
min_lat, max_lat = combined_df.index.get_level_values('latitude').min(), combined_df.index.get_level_values('latitude').max()
min_lon, max_lon = combined_df.index.get_level_values('longitude').min(), combined_df.index.get_level_values('longitude').max()

# Create a Cartopy map with a PlateCarree projection and set the map extent
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': ccrs.PlateCarree()})
ax.set_extent([min_lon, max_lon, min_lat, max_lat])

# Add countries as a background feature
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.COASTLINE)

# Scatter plot for each data point with color-coded clusters
sc = ax.scatter(combined_df.index.get_level_values('longitude'), combined_df.index.get_level_values('latitude'),
                c=combined_df['cluster'], cmap='viridis', s=50, alpha=0.7, transform=ccrs.PlateCarree())

# Add colorbar
cbar = plt.colorbar(sc, ax=ax, orientation='vertical', fraction=0.046, pad=0.04, label='Cluster')

# Display the map
plt.show()

In [None]:
# NOT WORKING YET

def calculate_spi(precipitation_values):
    # Extract month from the multi-level index
    months = precipitation_values.index.get_level_values('time').month
    
    # Your SPI calculation logic here
    # For example, using gamma distribution fitting:
    params = gamma.fit(precipitation_values, loc=0)
    cdf = gamma.cdf(precipitation_values, *params)
    spi_values = gamma.ppf(cdf, *params)
    
    return spi_values

# Ensure the 'tp' column is numeric (it may be an object dtype after concatenation)
# combined_df['tp'] = pd.to_numeric(combined_df['tp'], errors='coerce')

# Apply the SPI calculation
# combined_df['SPI'] = combined_df['tp'].groupby(months).transform(calculate_spi)

In [None]:
selected_location = t2m_monthly_avg_max.xs((53.0, -110.0), level=('latitude', 'longitude'))
selected_location.plot(figsize=(12, 6), title='Temperature Time Series at Latitude 53.0, Longitude -110.0')
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.show()

In [None]:
# Select data for two locations
location1 = t2m_monthly_avg_max.xs((53.0, -110.0), level=('latitude', 'longitude'))
location2 = t2m_monthly_avg_max.xs((49.0, -102.0), level=('latitude', 'longitude'))

# Plot time series for both locations
plt.figure(figsize=(12, 6))
plt.plot(location1.index.get_level_values('time'), location1['t2m'], label='Northwest (53.0, -110.0)')
plt.plot(location2.index.get_level_values('time'), location2['t2m'], label='Southeast (52.0, -108.0)')

plt.title('Avg. Max. Temp. for Two Locations')
plt.xlabel('Time')
plt.ylabel('Temperature (K)')
plt.legend()
plt.show()

In [None]:
# Assuming 't2m_monthly_avg_max' is your DataFrame

# Reshape the DataFrame for heatmap
heatmap_data = t2m_monthly_avg_max.unstack(level='latitude')['t2m']

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='viridis', cbar_kws={'label': 'Temperature (K)'})
plt.title('Monthly Average Maximum Temperature Heatmap')
plt.xlabel('Latitude')
plt.ylabel('Time')
plt.show()

In [None]:
# Assuming 't2m_monthly_avg_max' is your DataFrame

# Extract decade from the 'time' index
t2m_monthly_avg_max['decade'] = (t2m_monthly_avg_max.index.get_level_values('time').year // 10) * 10

# Group by decade and latitude, then calculate the mean
heatmap_data_decade = t2m_monthly_avg_max.groupby(['decade', 'latitude']).mean()['t2m'].unstack()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data_decade, cmap='viridis', cbar_kws={'label': 'Temperature (K)'})
plt.title('Decadal Average Maximum Temperature Heatmap')
plt.xlabel('Latitude')
plt.ylabel('Decade')
plt.show()

In [None]:
# Scatter plot between latitude and temperature for a specific month (e.g., '2023-06-30')
selected_month = t2m_monthly_avg_max.xs('2023-06-30', level='time')
plt.figure(figsize=(10, 6))
sns.scatterplot(x=selected_month.index.get_level_values('latitude'), y=selected_month['t2m'])
plt.title('Scatter Plot of Temperature vs. Latitude for June 2023')
plt.xlabel('Latitude')
plt.ylabel('Temperature (K)')
plt.show()

In [None]:
# Assuming 'weekly_max_t2m' is your resulting dataset

# Choose a specific time index to plot
time_index = 0

# Plotting
fig, ax = plt.subplots(subplot_kw={'projection': ccrs.PlateCarree()}, figsize=(10, 6))

# Plot the data
im = weekly_max_t2m.isel(time=time_index).plot.pcolormesh(ax=ax, transform=ccrs.PlateCarree(), cmap='viridis', add_colorbar=False)

# Add coastlines and borders
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle=':')

# Add colorbar
cbar = plt.colorbar(im, ax=ax, label='Temperature (K)')

# Add title
plt.title(f'Weekly Maximum 2m Temperature on {weekly_max_t2m.time.isel(time=time_index).values}', fontsize=16)

# Add latitude and longitude indices
ax.set_xticks(weekly_max_t2m.longitude.values)
ax.set_yticks(weekly_max_t2m.latitude.values)
ax.xaxis.set_major_formatter('{:.1f}°E'.format)
ax.yaxis.set_major_formatter('{:.1f}°N'.format)

plt.show()

## Sequential Analysis: Heat and Drought Phases

In [None]:
# "Heat Phase": no. of occurrences of 5 (?) consecutive days above 300 degrees Kelvin

# CA drought monitor: https://agriculture.canada.ca/en/agricultural-production/weather/canadian-drought-monitor
# drought classes by SPI: https://droughtmonitor.unl.edu/About/AbouttheData/DroughtClassification.aspx

In [None]:
# Set the temperature threshold for defining heat phases
temperature_threshold = 310  # Adjust this threshold as needed

# Create a binary mask for values above the threshold
heat_mask = combined_df['t2m'] > temperature_threshold

# Identify consecutive True sequences (heat phases)
heat_phases = (heat_mask.astype(int).groupby((~heat_mask).cumsum()).cumsum() * heat_mask).groupby(level=0).max()

# Display the identified heat phases
print(heat_phases)

In [None]:
# Set the precipitation threshold for defining drought phases
precipitation_threshold = 1  # Adjust this threshold as needed

# Create a binary mask for values below the precipitation threshold
drought_mask = combined_df['tp'] < precipitation_threshold

# Identify consecutive True sequences (drought phases)
drought_phases = (drought_mask.astype(int).groupby((~drought_mask).cumsum()).cumsum() * drought_mask).groupby(level=0).max()

# Display the identified drought phases
print(drought_phases)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the time series data
plt.figure(figsize=(10, 6))
plt.plot(combined_df.index.get_level_values('time'), combined_df['t2m'], label='Temperature')
plt.plot(combined_df.index.get_level_values('time'), combined_df['tp'], label='Precipitation')

# Create a mask for heat phases with the same length as the time series data
heat_phases_mask = np.repeat(heat_phases.values, len(combined_df) // len(heat_phases))

# Highlight heat phases
plt.fill_between(combined_df.index.get_level_values('time'), 0, 40, where=heat_phases_mask == 1, color='red', alpha=0.3, label='Heat Phase')

# Highlight drought phases
plt.fill_between(combined_df.index.get_level_values('time'), 0, 40, where=drought_phases == 1, color='orange', alpha=0.3, label='Drought Phase')

plt.xlabel('Time')
plt.ylabel('Temperature / Precipitation')
plt.legend()
plt.show()