# Appendix 01 - Data preprocessing: *Literature dates*

## Literature Monsoon Onset and Withdrawal normal dates
Normal dates for monsoon onset and withdrawal are taken from the article by Pai et al. (2020). In this notebook the dates are interpolated onto a regular grid and an xarray Dataset is created with the onset and withdrawal dates.


In [1]:
# Required Imports
import numpy as np
import pandas as pd
import geopandas as gpd
from scipy.interpolate import griddata
import rioxarray
from shapely.geometry import mapping
import xarray as xr
import os

In [2]:
# -- IMPORTING DATA
# File paths: two raw inputs and the NetCDF file this notebook produces.
country_boundary_file   = '../data/raw/external/GADM41_WORLD.gpkg'
dates_file              = '../data/raw/external/PAI_dates.xlsx'
gridded_dates_file      = '../data/preprocessed/external/monsoon_dates.nc'

# Both raw inputs are mandatory: fail early with a clear message if one is missing.
for required_file in (country_boundary_file, dates_file):
    if not os.path.exists(required_file):
        raise FileNotFoundError(f"{required_file} not found.")

# An existing output is not an error: the run continues and replaces it,
# so only warn, naming the output file that is about to be overwritten.
if os.path.exists(gridded_dates_file):
    print(f"{gridded_dates_file} found. The file will be overwritten!")

# Load the ADM_1 layer of the boundary file and keep only the units whose
# GID_0 code is India, Nepal or Bangladesh (used later as the clipping mask).
admin_units = gpd.read_file(country_boundary_file, layer='ADM_1')
country_boundary = admin_units[admin_units["GID_0"].isin(['IND', 'NPL', 'BGD'])]

# Load the date table and keep only the coordinates and the numeric
# onset / withdrawal columns, in this fixed column order.
dates_tab = pd.read_excel(dates_file)[['lat', 'lon', 'ons_num', 'wit_num']]
dates_tab.head()

../data/raw/external/PAI_dates.xlsx found. The file will be overwritten!


Unnamed: 0,lat,lon,ons_num,wit_num
0,30.73,76.78,180,267
1,28.646447,77.215772,179,269
2,30.377145,76.777422,174,267
3,28.802033,76.129984,182,267
4,29.156718,75.721187,185,265


In [3]:
# -- GRIDDING DATASET
# Pull the scattered sample locations and their values out of the table
# as plain numpy vectors.
lon, lat = (dates_tab[col].to_numpy() for col in ('lon', 'lat'))
onset, withdrawal = (dates_tab[col].to_numpy() for col in ('ons_num', 'wit_num'))

# Sample locations as (lon, lat) pairs, the layout griddata expects.
points = [(x, y) for x, y in zip(lon, lat)]

# Target axes: 0.5-degree steps starting at 65 (lon) and 5 (lat).
lon_fin = np.arange(65, 100.25, 0.5)
lat_fin = np.arange(5, 40.25, 0.5)

# 2-D coordinate matrices of the target grid (rows follow lat, columns follow lon).
gridX, gridY = np.meshgrid(lon_fin, lat_fin)

# Cubic interpolation of each field from the scattered points onto the grid.
regrid_ons, regrid_wit = (
    griddata(points, values, (gridX, gridY), method='cubic')
    for values in (onset, withdrawal)
)

# Combine both fields along a new trailing axis: index 0 = onset, 1 = withdrawal.
dates = np.stack([regrid_ons, regrid_wit], axis=-1)

In [4]:
# -- CREATING XARRAY DATASET AND SAVING
# Grid spacing in degrees, read from the target longitude axis so that the
# provenance text below always matches the grid that was actually built.
grid_res = float(lon_fin[1] - lon_fin[0])

# Wrap the stacked array in a Dataset. Along "cluster", index 0 holds the
# onset field and index 1 the withdrawal field (the order used when stacking).
ds = xr.Dataset(
    data_vars=dict(
        date=(["lat", "lon", "cluster"], dates),
    ),
    coords=dict(
        lon=lon_fin,
        lat=lat_fin,
        cluster=[0, 1],
    ),
    attrs=dict(
        Description="Interpolated onset and withdrawal dates of the monsoon season across India.",
        # File names and resolution are taken from the variables in use
        # rather than hard-coded, so the stored history cannot go stale.
        History=(
            f"1. Country boundaries were loaded from {os.path.basename(country_boundary_file)}.\n"
            f"2. Monsoon onset and withdrawal dates were extracted from {os.path.basename(dates_file)}.\n"
            f"3. The data was gridded using a {grid_res:g}-degree resolution.\n"
            "4. Cubic interpolation was performed to estimate onset and withdrawal dates on the new grid.\n"
            "5. The interpolated data was stored in an xarray Dataset."
        ),
    ),
)

# Masking with country shapefile: declare which dims are spatial and the CRS,
# then clip to the selected boundaries (drop=True removes cells outside them).
ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
ds.rio.write_crs("epsg:4326", inplace=True)
ds_rast = ds.rio.clip(country_boundary.geometry.apply(mapping), country_boundary.crs, drop=True)

# Make sure the output directory exists, then save the clipped Dataset to NetCDF.
output_dir = os.path.dirname(gridded_dates_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
ds_rast.to_netcdf(gridded_dates_file)

ds_rast

### Let's have ChatGPT-4o do it directly from the images...

In [5]:
import pandas as pd
from datetime import datetime

# Coordinates and respective dates of monsoon onset/progress from the map
data = {
    "lat": [26, 27, 28, 28, 28, 27, 26, 25, 24, 24, 23, 22, 21, 21, 20, 19, 19, 18, 17, 16, 15, 15, 14, 13, 12, 11, 10, 9, 8, 9, 8, 11, 9, 8, 6, 7, 7],
    "lon": [73, 73, 73, 74, 75, 75, 75, 75, 75, 76, 76, 76, 76, 77, 77, 77, 78, 78, 78, 78, 78, 79, 79, 79, 79, 79, 79, 79, 79, 80, 80, 81, 81, 81, 82, 83, 85],
    "Onset_Progress_Date": ["8 July", "1 July", "25 June", "25 June", "25 June", "30 June", "5 July", "5 July", "30 June", "30 June", "25 June", "20 June", "15 June", "15 June", "10 June", "5 June", "5 June", "10 June", "10 June", "10 June", "10 June", "15 June", "15 June", "15 June", "15 June", "10 June", "5 June", "5 June", "5 June", "5 June", "5 June", "25 May", "30 May", "30 May", "5 June", "1 June", "1 June"]
}

# Year used to anchor the "day month" strings. It is a non-leap year, so the
# resulting day numbers equal those of strptime's implicit default year (1900).
REFERENCE_YEAR = 2001


def convert_to_day_of_year(date_str):
    """Convert a "<day> <full month name>" string to a day-of-year number.

    The date is interpreted in a non-leap reference year, so 1 January maps
    to 1 and 31 December to 365.

    Parameters
    ----------
    date_str : str
        Date such as "8 July" (day of month followed by the full month name).

    Returns
    -------
    int
        Day of the year, 1-based.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the "%d %B" layout.
    """
    # Parse with an explicit year: parsing a day of month without a year is
    # ambiguous and triggers a DeprecationWarning on recent Python versions.
    date_obj = datetime.strptime(f"{date_str} {REFERENCE_YEAR}", "%d %B %Y")
    return date_obj.timetuple().tm_yday


# Numeric onset day for every transcribed date, in the same order as the lists above
data["ons_num"] = [convert_to_day_of_year(date) for date in data["Onset_Progress_Date"]]

# Create DataFrame
df = pd.DataFrame(data)
df


Unnamed: 0,lat,lon,Onset_Progress_Date,ons_num
0,26,73,8 July,189
1,27,73,1 July,182
2,28,73,25 June,176
3,28,74,25 June,176
4,28,75,25 June,176
5,27,75,30 June,181
6,26,75,5 July,186
7,25,75,5 July,186
8,24,75,30 June,181
9,24,76,30 June,181
