<a href="https://colab.research.google.com/github/Kanahe1800/hanami-bloom-prediction/blob/main/Bloomwatch_Tempertaure_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



## Getting data from Nasa giovanni site
https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=1980-01-01T00:00:00Z&endtime=2025-10-03T23:59:59Z&bbox=127,30,148.5,45.75&dataKeyword=Temperature


In [1]:
%%capture
!pip install earthaccess
!pip install netcdf4

In [2]:
import netCDF4
import numpy as np

In [3]:
import earthaccess
auth = earthaccess.login(strategy="interactive")

Enter your Earthdata Login username: tristan2005
Enter your Earthdata password: ··········


In [4]:
# Mean Temperature (for each month)
collection_shortname = "M2SMNXSLV"
version = "003"

# 1980-01-01 to 2025-08-31
# Version may vary, check Earthdata Search for the latest
time_range = ("2020-01-01", "2025-08-31")

# Define your spatial area of interest as a bounding box: [min_lon, min_lat, max_lon, max_lat]
bounding_box = (127,30,148.5,45.75) # Bounding box for Japan, found using https://boundingbox.klokantech.com/

# Search for granules
print("Searching for granules...")
granules = earthaccess.search_data(
    short_name=collection_shortname,
    temporal=time_range,
    bounding_box=bounding_box
)

print(f"Found {len(granules)} granules.")

Searching for granules...
Found 68 granules.


In [5]:
if granules:
    print("Downloading granules...")
    earthaccess.download(granules, "./downloaded_data")
    print("Download complete.")
else:
    print("No granules found for the specified criteria.")

Downloading granules...


QUEUEING TASKS | :   0%|          | 0/68 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/68 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/68 [00:00<?, ?it/s]

Download complete.


Unpack granules

In [6]:
import os
PATH_NAME = "/content/downloaded_data"

In [7]:
nc4_files = [os.path.join(PATH_NAME, f) for f in os.listdir(PATH_NAME) if f.endswith('.nc4')]
print(nc4_files)

['/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202010.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202505.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202406.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202303.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202310.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202208.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202312.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202211.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202205.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202012.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202102.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202011.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202402.nc4', '/content/downloaded_data/MERRA2_400.statM_2d_slv_Nx.202207.nc4', '/content/downloaded_data/MERRA2_401.statM_2d_slv_Nx.202106.nc4', '/content

Convert granules to numpy arrays

In [8]:
import netCDF4
import numpy as np

if nc4_files:
    first_file = nc4_files[0]
    with netCDF4.Dataset(first_file, 'r') as nc_file:
        # Choose the variable you want to convert to a NumPy array
        variable_name = 'T2MMEAN'  # Example variable: Mean Temperature
        if variable_name in nc_file.variables:
            # Read the data for the chosen variable
            data_variable = nc_file.variables[variable_name]
            numpy_array = data_variable[:] # Read all data for the variable

            print(f"Successfully converted variable '{variable_name}' to a NumPy array.")
            print(f"Shape of the NumPy array: {numpy_array.shape}")
            # You can now use 'numpy_array' for your machine learning tasks
        else:
            print(f"Variable '{variable_name}' not found in the file.")
else:
    print("No .nc4 files found to read.")


Successfully converted variable 'T2MMEAN' to a NumPy array.
Shape of the NumPy array: (1, 361, 576)


In [9]:
print(numpy_array)

[[[219.31093 219.31093 219.31093 ... 219.31093 219.31093 219.31093]
  [218.51024 218.5043  218.49857 ... 218.52663 218.52106 218.5157 ]
  [219.04688 219.0514  219.05525 ... 219.03079 219.03667 219.04192]
  ...
  [261.9779  261.96783 261.95786 ... 262.00836 261.99817 261.988  ]
  [262.0416  262.0356  262.02954 ... 262.05978 262.05374 262.04767]
  [261.9626  261.9626  261.9626  ... 261.9626  261.9626  261.9626 ]]]


Use Bounding Box of Japan

In [10]:
if nc4_files:
    first_file = nc4_files[0]
    with netCDF4.Dataset(first_file, 'r') as nc_file:
        # Read latitude and longitude
        latitudes = nc_file.variables['lat'][:]
        longitudes = nc_file.variables['lon'][:]

        # Define the bounding box for Japan (min_lon, min_lat, max_lon, max_lat)
        # We already have this from a previous cell: bounding_box = (127, 30, 148.5, 45.75)
        min_lon, min_lat, max_lon, max_lat = bounding_box

        # Find the indices corresponding to the bounding box
        lat_indices = np.where((latitudes >= min_lat) & (latitudes <= max_lat))[0]
        lon_indices = np.where((longitudes >= min_lon) & (longitudes <= max_lon))[0]

        if lat_indices.size > 0 and lon_indices.size > 0:
            # Extract the data for the bounding box
            # Assuming numpy_array has shape (time, lat, lon)
            japan_data = numpy_array[:, lat_indices[:, None], lon_indices]

            print("Successfully extracted data for Japan.")
            print(f"Shape of the extracted data: {japan_data.shape}")
        else:
            print("No data found within the specified bounding box.")

else:
    print("No .nc4 files found to process.")

Successfully extracted data for Japan.
Shape of the extracted data: (1, 32, 34)


In [11]:
print(japan_data)

[[[296.818   297.2543  297.5142  ... 300.1629  300.366   300.54092]
  [296.37088 296.81308 297.0292  ... 299.9525  300.15622 300.31253]
  [295.91147 296.36343 296.5651  ... 299.77045 299.98325 300.15872]
  ...
  [277.96494 276.35504 275.94077 ... 285.58453 284.60916 284.41498]
  [278.04636 277.53503 276.47375 ... 285.34354 284.3535  283.95755]
  [277.46768 277.4142  277.15778 ... 285.16977 285.2063  284.69992]]]


# New Section

In [12]:
import os
import netCDF4
import numpy as np
import pandas as pd
from datetime import datetime

# Dictionary to store temperatures by year and month
yearly_monthly_temperatures = {}

tokyo_lat = 35.6895
tokyo_lon = 139.6917

for file_path in nc4_files:
    with netCDF4.Dataset(file_path, 'r') as nc_file:
        # Read the full latitude and longitude arrays
        full_latitudes = nc_file.variables['lat'][:]
        full_longitudes = nc_file.variables['lon'][:]

        # Find the index in the full latitude and longitude arrays closest to Tokyo's coordinates
        lat_idx_full = (np.abs(full_latitudes - tokyo_lat)).argmin()
        lon_idx_full = (np.abs(full_longitudes - tokyo_lon)).argmin()

        # Read the 'T2MMEAN' variable data
        temperature_data = nc_file.variables['T2MMEAN'][:]

        # Extract the temperature value for the Tokyo location
        # Assuming temperature_data has shape (time, lat, lon)
        tokyo_temperature_kelvin = temperature_data[0, lat_idx_full, lon_idx_full]

        # Convert the extracted temperature from Kelvin to Celsius
        tokyo_temperature_celsius = tokyo_temperature_kelvin - 273.15

        # Extract year and month from the filename
        # Filename format example: MERRA2_400.statM_2d_slv_Nx.YYYYMM.nc4
        filename = os.path.basename(file_path)
        year_month_str = filename.split('.')[-2] # Extracts YYYYMM
        year = int(year_month_str[:4])        # Extracts YYYY
        month = int(year_month_str[4:6])       # Extracts MM

        # Store the temperature by year and month
        if year not in yearly_monthly_temperatures:
            yearly_monthly_temperatures[year] = {}
        yearly_monthly_temperatures[year][month] = tokyo_temperature_celsius

# Create a pandas DataFrame from the yearly_monthly_temperatures dictionary
# Convert keys to integers for proper sorting
df_temperatures = pd.DataFrame.from_dict(yearly_monthly_temperatures, orient='index')

# Sort the DataFrame by year
df_temperatures = df_temperatures.sort_index()

# Select the months for winter
winter_months_df = df_temperatures[[11, 12, 1, 2, 3]].copy()

# Shift the year for November and December forward by one year
winter_months_df[11] = winter_months_df[11].set_axis(winter_months_df.index + 1)
winter_months_df[12] = winter_months_df[12].set_axis(winter_months_df.index + 1)

# Create a new DataFrame with the shifted November and December data
shifted_winter_months_df = pd.concat([winter_months_df[11], winter_months_df[12]], axis=1)

# Combine the shifted Nov/Dec data with the Jan/Feb/Mar data (which keeps the original year)
# This will align based on the new index (representing the year the winter season ends)
final_winter_df = pd.merge(shifted_winter_months_df, winter_months_df[[1, 2, 3]], left_index=True, right_index=True, how='left')


# Rename the columns to month names for better readability
month_names = {
    1: 'January', 2: 'February', 3: 'March',
    11: 'November', 12: 'December'
}
final_winter_df = final_winter_df.rename(columns=month_names)

# Sort columns by month number for correct winter order (Nov, Dec, Jan, Feb, Mar)
final_winter_df = final_winter_df[['November', 'December', 'January', 'February', 'March']]

# Print the resulting DataFrame
print("Tokyo Temperatures (Celsius) for Winter Seasons (November of year X-1 to March of year X):")
display(final_winter_df)

Tokyo Temperatures (Celsius) for Winter Seasons (November of year X-1 to March of year X):


Unnamed: 0,November,December,January,February,March
2020,,,6.747284,7.523773,9.974823
2021,13.548492,7.199646,4.982849,7.251862,11.646179
2022,13.121582,7.184021,3.734253,4.113037,9.996704
2023,14.07489,6.736786,4.881653,6.38092,11.932739
2024,14.111511,8.949127,6.385712,7.099243,8.817749
2025,13.428375,7.223083,5.554565,5.223755,9.930115
