# Data Loading and Preprocessing: Yum Yum

In [383]:
# Importing libraries and tools
import json
import time

import numpy as np
import pandas as pd
import requests

In [384]:
# Load the cherry blossom full bloom dates for Japan and preview the first rows
df_cherry = pd.read_csv(filepath_or_buffer = "/content/sakura_full_bloom_dates.csv")
df_cherry.head(5)

Unnamed: 0,Site Name,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,...,2018,2019,2020,2021,2022,2023,2024,2025,30 Year Average 1991-2020,Notes
0,Wakkanai,True,1953-05-30 00:00:00,1954-05-27 00:00:00,1955-05-23 00:00:00,1956-05-14 00:00:00,1957-05-22 00:00:00,1958-05-25 00:00:00,1959-05-12 00:00:00,1960-05-24 00:00:00,...,2018-05-15 00:00:00,2019-05-09 00:00:00,2020-05-12 00:00:00,2021-05-11 00:00:00,2022-05-09 00:00:00,2023-05-05 00:00:00,2024-05-04 00:00:00,2025-05-10 00:00:00,5 16,Sargent cherry (Prunus sargentii)
1,Rumoi,False,1953-05-13 00:00:00,1954-05-17 00:00:00,1955-05-16 00:00:00,1956-05-14 00:00:00,1957-05-14 00:00:00,1958-05-16 00:00:00,,,...,,,,,,,,,-,Sargent cherry (Prunus sargentii)
2,Asahikawa,True,1953-05-16 00:00:00,1954-05-18 00:00:00,1955-05-14 00:00:00,1956-05-13 00:00:00,1957-05-14 00:00:00,1958-05-15 00:00:00,1959-05-13 00:00:00,1960-05-16 00:00:00,...,2018-05-01 00:00:00,2019-05-04 00:00:00,2020-05-04 00:00:00,2021-05-03 00:00:00,2022-04-28 00:00:00,2023-04-28 00:00:00,2024-04-26 00:00:00,2025-05-02 00:00:00,5 7,Sargent cherry (Prunus sargentii)
3,Abashiri,True,1953-05-26 00:00:00,1954-05-19 00:00:00,1955-05-20 00:00:00,1956-05-13 00:00:00,,,1959-05-07 00:00:00,,...,2018-05-06 00:00:00,2019-05-07 00:00:00,2020-05-09 00:00:00,2021-05-07 00:00:00,2022-04-30 00:00:00,2023-05-01 00:00:00,2024-04-30 00:00:00,2025-05-10 00:00:00,5 13,Sargent cherry (Prunus sargentii)
4,Sapporo,True,1953-05-14 00:00:00,1954-05-08 00:00:00,1955-05-16 00:00:00,1956-05-09 00:00:00,1957-05-13 00:00:00,,1959-05-10 00:00:00,1960-05-09 00:00:00,...,2018-04-29 00:00:00,2019-04-29 00:00:00,2020-05-02 00:00:00,2021-04-27 00:00:00,2022-04-25 00:00:00,2023-04-21 00:00:00,2024-04-24 00:00:00,2025-04-27 00:00:00,5 6,


In [None]:
"""
This code cell drops the columns we do not need. Right now that is only the
free-text "Notes" column. The year columns from 1953 - 1980 are NOT dropped by
this cell yet (TODO: drop them here as well if they still do not sync well with
the other datasets).
errors = "ignore" keeps the cell re-runnable: without it, running the cell a
second time raises a KeyError because "Notes" has already been removed.
"""
df_cherry = df_cherry.drop(columns = ["Notes"], errors = "ignore")

In [385]:
# Checking the amount of cities in the dataset, and rows with currently unobserved sites.
# NOTE: total_observed_cities counts EVERY row (observed or not); the variable name is
# kept as-is, but the printed label below now says what the number really is.
total_observed_cities = len(df_cherry)
count_rows_notobs = int(df_cherry["Currently Being Observed"].eq(False).sum())

print(f"Total number of cities in this dataset: {total_observed_cities}")
print(f"Number of cities with unobserved cherry full bloom dates: {count_rows_notobs}")

Total number of observed cities in this dataset: 102
Number of cities with unobserved cherry full bloom dates: 44


In [386]:
# Keep only the sites that are still being observed so the data stays consistent
unobserved_cities_condition = df_cherry["Currently Being Observed"].eq(False) # True on the rows we want to remove
unobserved_row_labels = df_cherry.index[unobserved_cities_condition.to_numpy()] # index labels of those rows
df_cherry_filtered = df_cherry.drop(index = unobserved_row_labels) # drop() returns a new dataframe

print(len(df_cherry.index)) # the original dataframe must still have all of its rows
print(len(df_cherry_filtered.index)) # expected: 102 total - 44 unobserved == 58

# df_cherry_filtered will be the dataset we're working on from now
df_cherry_filtered.head()

102
58


Unnamed: 0,Site Name,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,...,2018,2019,2020,2021,2022,2023,2024,2025,30 Year Average 1991-2020,Notes
0,Wakkanai,True,1953-05-30 00:00:00,1954-05-27 00:00:00,1955-05-23 00:00:00,1956-05-14 00:00:00,1957-05-22 00:00:00,1958-05-25 00:00:00,1959-05-12 00:00:00,1960-05-24 00:00:00,...,2018-05-15 00:00:00,2019-05-09 00:00:00,2020-05-12 00:00:00,2021-05-11 00:00:00,2022-05-09 00:00:00,2023-05-05 00:00:00,2024-05-04 00:00:00,2025-05-10 00:00:00,5 16,Sargent cherry (Prunus sargentii)
2,Asahikawa,True,1953-05-16 00:00:00,1954-05-18 00:00:00,1955-05-14 00:00:00,1956-05-13 00:00:00,1957-05-14 00:00:00,1958-05-15 00:00:00,1959-05-13 00:00:00,1960-05-16 00:00:00,...,2018-05-01 00:00:00,2019-05-04 00:00:00,2020-05-04 00:00:00,2021-05-03 00:00:00,2022-04-28 00:00:00,2023-04-28 00:00:00,2024-04-26 00:00:00,2025-05-02 00:00:00,5 7,Sargent cherry (Prunus sargentii)
3,Abashiri,True,1953-05-26 00:00:00,1954-05-19 00:00:00,1955-05-20 00:00:00,1956-05-13 00:00:00,,,1959-05-07 00:00:00,,...,2018-05-06 00:00:00,2019-05-07 00:00:00,2020-05-09 00:00:00,2021-05-07 00:00:00,2022-04-30 00:00:00,2023-05-01 00:00:00,2024-04-30 00:00:00,2025-05-10 00:00:00,5 13,Sargent cherry (Prunus sargentii)
4,Sapporo,True,1953-05-14 00:00:00,1954-05-08 00:00:00,1955-05-16 00:00:00,1956-05-09 00:00:00,1957-05-13 00:00:00,,1959-05-10 00:00:00,1960-05-09 00:00:00,...,2018-04-29 00:00:00,2019-04-29 00:00:00,2020-05-02 00:00:00,2021-04-27 00:00:00,2022-04-25 00:00:00,2023-04-21 00:00:00,2024-04-24 00:00:00,2025-04-27 00:00:00,5 6,
6,Obihiro,True,1953-05-19 00:00:00,1954-05-12 00:00:00,1955-05-14 00:00:00,1956-05-09 00:00:00,1957-05-14 00:00:00,1958-05-16 00:00:00,1959-05-05 00:00:00,1960-05-15 00:00:00,...,2018-04-28 00:00:00,2019-04-30 00:00:00,2020-05-03 00:00:00,2021-04-27 00:00:00,2022-04-25 00:00:00,2023-04-22 00:00:00,2024-04-26 00:00:00,2025-05-03 00:00:00,5 5,Sargent cherry (Prunus sargentii)


In [387]:
# Sanity check: after filtering, no row of df_cherry_filtered should be flagged as unobserved
still_unobserved = df_cherry_filtered["Currently Being Observed"] == False
total_observed_cities_sanity = len(df_cherry_filtered.index)
count_rows_notobs_sanity = len(df_cherry_filtered.loc[still_unobserved])

print(f"Total number of observed cities in this dataset: {total_observed_cities_sanity}")
print(f"Number of cities with unobserved cherry full bloom dates: {count_rows_notobs_sanity}")

Total number of observed cities in this dataset: 58
Number of cities with unobserved cherry full bloom dates: 0


# Getting New Data for our filtered cherry blossom dataset: Latitude and Longitude

In [388]:
def long_lat_for_city(city_name: str, country_codes: str = "jp", timeout: float = 10.0):
  """
  Look up the coordinates of a city by calling the Nominatim search API.

  NOTE: despite the function name, the result is ordered (latitude, longitude).

  Parameters
  ----------
  city_name : str
      Free-text place name sent as the `q` search parameter.
  country_codes : str, optional
      Comma-separated ISO 3166-1 alpha-2 codes used to restrict the search.
      Defaults to "jp" because every site in the cherry blossom dataset is in
      Japan, and several site names are ambiguous worldwide. Pass "" or None
      to search without a country restriction.
  timeout : float, optional
      Seconds to wait for Nominatim before giving up, so that a stalled
      connection cannot hang the notebook forever.

  Returns
  -------
  tuple
      (latitude, longitude) as floats for the first search result, or
      (None, None) when the request fails, the response is not usable JSON,
      or Nominatim returns no match.
  """
  # search API format for Nominatim: https://nominatim.openstreetmap.org/search?<params>
  # example to search a particular city: https://nominatim.openstreetmap.org/search?q=Tokyo&format=json
  nominatim_url = "https://nominatim.openstreetmap.org/search"

  headers = { # identifies the project so that Nominatim can contact a member if something were to happen
      'User-Agent': 'CherryBlossomAnalysis/1.0(longtran@uvic.ca)'
  }

  params = { # only the first match is used, so we only ask for one
      'q': city_name,
      'format': 'json',
      'limit': 1
  }
  if country_codes:
    params['countrycodes'] = country_codes

  try:
    response = requests.get(nominatim_url, params = params, headers = headers, timeout = timeout)
    response.raise_for_status() # turn HTTP errors (e.g. 403 / 429) into exceptions instead of parsing an error page
    data = response.json()
  except (requests.RequestException, ValueError) as error:
    # best effort: report the failure and let the caller leave the cell unfilled
    print(f"Nominatim lookup failed for {city_name!r}: {error}")
    return None, None

  # An empty result list means Nominatim found no match for this name
  if not data:
    return None, None

  try:
    city_info = data[0] # the first result is the best match
    return float(city_info["lat"]), float(city_info["lon"])
  except (KeyError, IndexError, TypeError, ValueError):
    # unexpected payload shape or a missing / non-numeric coordinate
    return None, None

# long_lat_for_city("Sapporo") # example case

In [389]:
# Add empty (NaN) Latitude and Longitude columns to df_cherry_filtered; they get filled in later
for coordinate_column in ("Latitude", "Longitude"):
  df_cherry_filtered[coordinate_column] = np.nan
df_cherry_filtered.head() # checking if the columns were made

Unnamed: 0,Site Name,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,...,2020,2021,2022,2023,2024,2025,30 Year Average 1991-2020,Notes,Latitude,Longitude
0,Wakkanai,True,1953-05-30 00:00:00,1954-05-27 00:00:00,1955-05-23 00:00:00,1956-05-14 00:00:00,1957-05-22 00:00:00,1958-05-25 00:00:00,1959-05-12 00:00:00,1960-05-24 00:00:00,...,2020-05-12 00:00:00,2021-05-11 00:00:00,2022-05-09 00:00:00,2023-05-05 00:00:00,2024-05-04 00:00:00,2025-05-10 00:00:00,5 16,Sargent cherry (Prunus sargentii),,
2,Asahikawa,True,1953-05-16 00:00:00,1954-05-18 00:00:00,1955-05-14 00:00:00,1956-05-13 00:00:00,1957-05-14 00:00:00,1958-05-15 00:00:00,1959-05-13 00:00:00,1960-05-16 00:00:00,...,2020-05-04 00:00:00,2021-05-03 00:00:00,2022-04-28 00:00:00,2023-04-28 00:00:00,2024-04-26 00:00:00,2025-05-02 00:00:00,5 7,Sargent cherry (Prunus sargentii),,
3,Abashiri,True,1953-05-26 00:00:00,1954-05-19 00:00:00,1955-05-20 00:00:00,1956-05-13 00:00:00,,,1959-05-07 00:00:00,,...,2020-05-09 00:00:00,2021-05-07 00:00:00,2022-04-30 00:00:00,2023-05-01 00:00:00,2024-04-30 00:00:00,2025-05-10 00:00:00,5 13,Sargent cherry (Prunus sargentii),,
4,Sapporo,True,1953-05-14 00:00:00,1954-05-08 00:00:00,1955-05-16 00:00:00,1956-05-09 00:00:00,1957-05-13 00:00:00,,1959-05-10 00:00:00,1960-05-09 00:00:00,...,2020-05-02 00:00:00,2021-04-27 00:00:00,2022-04-25 00:00:00,2023-04-21 00:00:00,2024-04-24 00:00:00,2025-04-27 00:00:00,5 6,,,
6,Obihiro,True,1953-05-19 00:00:00,1954-05-12 00:00:00,1955-05-14 00:00:00,1956-05-09 00:00:00,1957-05-14 00:00:00,1958-05-16 00:00:00,1959-05-05 00:00:00,1960-05-15 00:00:00,...,2020-05-03 00:00:00,2021-04-27 00:00:00,2022-04-25 00:00:00,2023-04-22 00:00:00,2024-04-26 00:00:00,2025-05-03 00:00:00,5 5,Sargent cherry (Prunus sargentii),,


In [390]:
"""
This code cell goes through each site in the df_cherry_filtered dataset and
looks up its latitude and longitude with the long_lat_for_city() function.
- .items() on the "Site Name" column yields (index label, site name) pairs, where
  the index label is the row number shown on the far left of the dataframe
- .loc[row_label, column_name] addresses one cell, e.g. .loc[4, "Latitude"] for Sapporo
long_lat_for_city() returns (None, None) when no coordinates were found. That
tuple is still "truthy", so each value is checked against None explicitly; on a
failed lookup the cell simply keeps its NaN placeholder.
Nominatim's usage policy allows at most one request per second, so the loop
pauses between calls.
"""
NOMINATIM_DELAY_SECONDS = 1.0 # minimum pause between two Nominatim requests

for index, site_name in df_cherry_filtered["Site Name"].items():
  latitude, longitude = long_lat_for_city(site_name)
  if latitude is not None and longitude is not None:
    df_cherry_filtered.loc[index, "Latitude"] = latitude
    df_cherry_filtered.loc[index, "Longitude"] = longitude
  time.sleep(NOMINATIM_DELAY_SECONDS) # stay within the public API rate limit

df_cherry_filtered.head() # check if all cells are filled

Wakkanai
Asahikawa
Abashiri
Sapporo
Obihiro
Kushiro
Muroran
Hakodate
Aomori
Akita
Morioka
Yamagata
Sendai
Fukushima
Niigata
Kanazawa
Toyama
Nagano
Utsunomiya
Fukui
Maebashi
Kumagaya
Mito
Gifu
Nagoya
Kofu
Choshi
Tsu
Shizuoka
Tokyo
Yokohama
Matsue
Tottori
Kyoto
Hikone
Hiroshima
Okayama
Kobe
Osaka
Wakayama
Nara
Matsuyama
Takamatsu
Kochi
Tokushima
Shimonoseki
Fukuoka
Saga
Oita
Nagasaki
Kumamoto
Kagoshima
Miyazaki
Naze
Ishigaki Island
Miyakojima
Naha
Minami Daito Island


In [392]:
# Final step: write the processed dataframe to disk without the row index
df_cherry_filtered.to_csv(path_or_buf = "cherry_blossom_updated.csv", index = False)