## Cherry Blossom Peak Bloom Prediction - Feature Extraction for ML models
* Author: Julia Hsu
* Date: 2/28/2022

In [1]:
import pandas as pd
import numpy as np

### Load processed timeseries dataframe and bloom date dataframe

In [2]:
def get_time_series(city_timeseries):
  """Parse the 'Date' column and derive calendar 'year' and 'month' columns.

  'Date' is expected to hold month/day/year strings ('%m/%d/%Y'). The frame
  passed in is modified in place and the same object is returned.
  """
  parsed_dates = pd.to_datetime(city_timeseries['Date'], format='%m/%d/%Y')
  city_timeseries['Date'] = parsed_dates
  city_timeseries['year'] = parsed_dates.dt.year
  city_timeseries['month'] = parsed_dates.dt.month
  return city_timeseries

In [3]:
def get_bloom_df(city_bloom):
  """Parse the 'bloom_date' column and derive 'year' and 'month' from it.

  'bloom_date' is expected to hold month/day/year strings ('%m/%d/%Y'). The
  frame passed in is modified in place and the same object is returned.
  """
  parsed_bloom_dates = pd.to_datetime(city_bloom['bloom_date'], format='%m/%d/%Y')
  city_bloom['bloom_date'] = parsed_bloom_dates
  city_bloom['year'] = parsed_bloom_dates.dt.year
  city_bloom['month'] = parsed_bloom_dates.dt.month
  return city_bloom

In [4]:
# December temperatures affect the bloom date of the following calendar year,
# so December rows are counted towards the next year's bloom season.
def convert_year(row):
  """Return row['year'] + 1 when row['month'] is 12, otherwise row['year']."""
  calendar_year = row['year']
  return calendar_year + 1 if row['month'] == 12 else calendar_year

### Format seasonal dataframe, including season's label (winter or spring) and the month

In [5]:
def get_city_seasons_df(city_timeseries):
  """Select the December-March rows and label them for bloom-season features.

  Args:
    city_timeseries: Daily weather frame with 'year', 'month', 'Tmax' and
      'Tmin' columns. It is not modified.

  Returns:
    A new DataFrame containing only the rows whose month is 12, 1, 2 or 3,
    with three added columns:
      * 'season': 'winter' for December-February, 'spring' otherwise (March).
      * 'bloom_year': the calendar year, except that December rows count
        towards the following year.
      * 'daily_avg_temp': (Tmax + Tmin) / 2.
  """
  all_months = city_timeseries['month']
  # .copy() gives an independent frame, so the column assignments below do not
  # write into a slice of the caller's data (avoids SettingWithCopyWarning).
  city_season_timeseries = city_timeseries[(all_months <= 3) | (all_months == 12)].copy()

  month = city_season_timeseries['month']
  is_december = month == 12
  city_season_timeseries['season'] = np.where((month <= 2) | is_december, 'winter', 'spring')

  # December belongs to the bloom season of the next calendar year.
  calendar_year = city_season_timeseries['year']
  city_season_timeseries['bloom_year'] = calendar_year.where(~is_december, calendar_year + 1)

  # The average daily temperature = (Tmax + Tmin) / 2
  city_season_timeseries['daily_avg_temp'] = (city_season_timeseries['Tmax'] + city_season_timeseries['Tmin']) / 2
  return city_season_timeseries

### Feature Extraction

#### Calculate the average daily temperature, maximum temperature and minimum temperature per month

* The average daily temperature = (Tmax + Tmin) /2

In [6]:
def get_monthly_avg_temps(city_season_timeseries):
  """Average daily_avg_temp, Tmax and Tmin for each (bloom_year, month) pair.

  Returns a frame with one row per pair, ordered by bloom_year then month,
  and the columns 'bloom_year', 'month', 'avg_daily_temp_per_month',
  'avg_tmax_per_month' and 'avg_tmin_per_month'.
  """
  group_keys = ['bloom_year', 'month']
  # Source column -> name of its monthly mean in the result.
  monthly_names = {
      'daily_avg_temp': 'avg_daily_temp_per_month',
      'Tmax': 'avg_tmax_per_month',
      'Tmin': 'avg_tmin_per_month',
  }
  grouped = city_season_timeseries.groupby(group_keys)
  monthly_means = grouped[list(monthly_names)].mean().reset_index()
  return monthly_means.rename(columns=monthly_names)


#### Calculate the average daily temperature, maximum temperature and minimum temperature per season (winter and spring)


In [7]:
def get_seasonal_avg_temps(city_season_timeseries):
  """Average daily_avg_temp, Tmax and Tmin for each (bloom_year, season) pair.

  Returns a frame with one row per pair, ordered by bloom_year then season,
  and the columns 'bloom_year', 'season', 'avg_daily_temp_per_season',
  'avg_tmax_per_season' and 'avg_tmin_per_season'.
  """
  group_keys = ['bloom_year', 'season']
  # Source column -> name of its seasonal mean in the result.
  seasonal_names = {
      'daily_avg_temp': 'avg_daily_temp_per_season',
      'Tmax': 'avg_tmax_per_season',
      'Tmin': 'avg_tmin_per_season',
  }
  grouped = city_season_timeseries.groupby(group_keys)
  seasonal_means = grouped[list(seasonal_names)].mean().reset_index()
  return seasonal_means.rename(columns=seasonal_names)

In [8]:
### Function for formatting the features dataframe
def format_features_df(bloom_df, season_avg_temps, monthly_avg_temps, skip_years=(1979,)):
  """Attach seasonal and monthly temperature averages to each bloom-year row.

  Args:
    bloom_df: Frame with a 'year' column and one row per year. It is not
      modified.
    season_avg_temps: Frame with 'bloom_year', 'season' ('winter'/'spring')
      and the 'avg_daily_temp_per_season', 'avg_tmax_per_season' and
      'avg_tmin_per_season' columns (see get_seasonal_avg_temps).
    monthly_avg_temps: Frame with 'bloom_year', 'month' and the
      'avg_daily_temp_per_month', 'avg_tmax_per_month' and
      'avg_tmin_per_month' columns (see get_monthly_avg_temps).
    skip_years: Years to leave out of the result. Defaults to (1979,), the
      year that was previously always removed. Years listed here that do not
      occur in bloom_df are ignored instead of raising ValueError.

  Returns:
    The retained bloom_df rows (original index kept) followed by the columns
    '<winter|spring>_<stat>' and '<dec|jan|feb|march>_<stat>' for stat in
    avg_daily_temp, avg_tmax, avg_tmin. An empty DataFrame is returned when
    no year is retained.

  Raises:
    ValueError: Raised by pandas (length mismatch) when a retained year has
      no matching season/month aggregate, or more than one bloom_df row.
  """
  stats = ('avg_daily_temp', 'avg_tmax', 'avg_tmin')
  seasons = ('winter', 'spring')
  # Order matters: it fixes the column order of the result.
  months = ((12, 'dec'), (1, 'jan'), (2, 'feb'), (3, 'march'))
  skipped = set(skip_years or ())

  yearly_frames = []
  for year in bloom_df['year'].unique().tolist():
    if year in skipped:
      continue
    # Copy so the new columns are not written into a slice of bloom_df.
    df = bloom_df[bloom_df['year'] == year].copy()
    season_df = season_avg_temps[season_avg_temps['bloom_year'] == year]
    monthly_df = monthly_avg_temps[monthly_avg_temps['bloom_year'] == year]

    for season in seasons:
      season_rows = season_df[season_df['season'] == season]
      for stat in stats:
        df[f'{season}_{stat}'] = season_rows[f'{stat}_per_season'].values

    for month, prefix in months:
      month_rows = monthly_df[monthly_df['month'] == month]
      for stat in stats:
        df[f'{prefix}_{stat}'] = month_rows[f'{stat}_per_month'].values

    yearly_frames.append(df)

  # DataFrame.append was removed in pandas 2.0; concatenate once instead.
  # pd.concat raises on an empty list, so keep the old empty-frame result.
  if not yearly_frames:
    return pd.DataFrame()
  return pd.concat(yearly_frames)




#### Washington D.C.

In [None]:
# Load the Washington D.C. weather time series and parse its 'Date' column
# (get_time_series modifies the loaded frame in place and returns it).
dc_raw_timeseries = pd.read_csv( "../../data/processed_data/Timeseries_weather_washingtondc_Tbase.csv")
dc_timeseries_df = get_time_series(dc_raw_timeseries)

# Load the Washington D.C. bloom records and parse 'bloom_date'.
dc_bloom = pd.read_csv('../../data/processed_data/washingtondc.csv')
dc_bloom_df = get_bloom_df(dc_bloom)

# Keep only December-March rows, labelled with season and bloom_year.
dc_season_timeseries = get_city_seasons_df(dc_timeseries_df)

In [10]:
# Per-season and per-month temperature averages for each bloom year.
dc_seasons_avg_temps = get_seasonal_avg_temps(dc_season_timeseries)
dc_monthly_avg_temps = get_monthly_avg_temps(dc_season_timeseries)

In [None]:
# One row per bloom year with the seasonal and monthly averages attached.
dc_bloom_features = format_features_df(dc_bloom_df, dc_seasons_avg_temps, dc_monthly_avg_temps)

In [12]:
# Merge the features table with the processed time series (which carries the
# AGDD and GDD values over time), matching each bloom_date to its 'Date' row.
# NOTE(review): merge defaults to an inner join, so bloom rows without a
# matching 'Date' are dropped. 'Date', 'AGDD' and 'GDD' are not listed in
# kept_cols, so they are discarded again when the columns are selected —
# confirm that is intended.
dc_bloom_features = pd.merge(dc_bloom_features,dc_timeseries_df[['Date','AGDD','GDD']], left_on = 'bloom_date', right_on = 'Date')

In [13]:
# Columns written to the per-city feature files (shared by DC, Kyoto, Liestal).
# NOTE(review): 'bloom_doy', 'AGDD_Bloom', 'AGDD_Mar_13' and 'DOY_Mar_13' are
# not created in this notebook; presumably they come from the bloom CSVs —
# verify. The march_* columns built by format_features_df are not kept here,
# although get_future_weather_features does emit them — confirm intentional.
kept_cols = ['location', 'lat', 'long', 'alt', 'year', 'month','bloom_date', 'bloom_doy',
       'AGDD_Bloom', 'AGDD_Mar_13', 'DOY_Mar_13',
       'winter_avg_daily_temp', 'winter_avg_tmax', 'winter_avg_tmin',
       'spring_avg_daily_temp', 'spring_avg_tmax', 'spring_avg_tmin',
       'dec_avg_daily_temp', 'dec_avg_tmax', 'dec_avg_tmin',
       'jan_avg_daily_temp', 'jan_avg_tmax', 'jan_avg_tmin',
       'feb_avg_daily_temp', 'feb_avg_tmax', 'feb_avg_tmin']

In [14]:
# Restrict to the kept columns and save the Washington D.C. feature table.
dc_bloom_features = dc_bloom_features[kept_cols]
dc_bloom_features.to_csv('../../data/processed_data/features/dc_bloom_features.csv')

#### Japan Kyoto

In [None]:
# Load the Kyoto weather time series and parse its 'Date' column
# (get_time_series modifies the loaded frame in place and returns it).
kyoto_raw_timeseries = pd.read_csv( "../../data/processed_data/Timeseries_weather_Japan_Kyoto_Tbase.csv")
kyoto_timeseries_df = get_time_series(kyoto_raw_timeseries)

# Load the Kyoto bloom records and parse 'bloom_date'.
kyoto_bloom = pd.read_csv('../../data/processed_data/kyoto.csv')
kyoto_bloom_df = get_bloom_df(kyoto_bloom)

# Keep only December-March rows, labelled with season and bloom_year.
kyoto_season_timeseries = get_city_seasons_df(kyoto_timeseries_df)

In [16]:
# Per-season and per-month temperature averages for each bloom year.
kyoto_seasons_avg_temps = get_seasonal_avg_temps(kyoto_season_timeseries)
kyoto_monthly_avg_temps = get_monthly_avg_temps(kyoto_season_timeseries)

In [None]:
# One row per bloom year with the seasonal and monthly averages attached.
kyoto_bloom_features = format_features_df(kyoto_bloom_df, kyoto_seasons_avg_temps, kyoto_monthly_avg_temps)

In [18]:
# Merge the features table with the processed time series (which carries the
# AGDD and GDD values over time), matching each bloom_date to its 'Date' row.
# The default inner join drops bloom rows without a matching 'Date'.
kyoto_bloom_features = pd.merge(kyoto_bloom_features,kyoto_timeseries_df[['Date','AGDD','GDD']], left_on = 'bloom_date', right_on = 'Date')
# Restrict to the kept columns and save the Kyoto feature table.
kyoto_bloom_features = kyoto_bloom_features[kept_cols]
kyoto_bloom_features.to_csv('../../data/processed_data/features/kyoto_bloom_features.csv')

#### Switzerland Liestal

In [None]:
# Load the Liestal weather time series and parse its 'Date' column
# (get_time_series modifies the loaded frame in place and returns it).
liestal_raw_timeseries = pd.read_csv( "../../data/processed_data/Timeseries_weather_Switzerland_Liestal_Tbase.csv")
liestal_timeseries_df = get_time_series(liestal_raw_timeseries)
# Load the Liestal bloom records and parse 'bloom_date'.
liestal_bloom = pd.read_csv('../../data/processed_data/liestal.csv')
liestal_bloom_df = get_bloom_df(liestal_bloom)

# Keep only December-March rows, labelled with season and bloom_year.
liestal_season_timeseries = get_city_seasons_df(liestal_timeseries_df)

In [20]:
# Per-season and per-month temperature averages for each bloom year.
liestal_seasons_avg_temps = get_seasonal_avg_temps(liestal_season_timeseries)
liestal_monthly_avg_temps = get_monthly_avg_temps(liestal_season_timeseries)

In [None]:
# One row per bloom year with the seasonal and monthly averages attached.
liestal_bloom_features = format_features_df(liestal_bloom_df, liestal_seasons_avg_temps, liestal_monthly_avg_temps)

In [29]:
# Merge the features table with the processed time series (which carries the
# AGDD and GDD values over time), matching each bloom_date to its 'Date' row.
# The default inner join drops bloom rows without a matching 'Date'.
liestal_bloom_features = pd.merge(liestal_bloom_features,liestal_timeseries_df[['Date','AGDD','GDD']], left_on = 'bloom_date', right_on = 'Date')
# Restrict to the kept columns and save the Liestal feature table.
liestal_bloom_features = liestal_bloom_features[kept_cols]
liestal_bloom_features.to_csv('../../data/processed_data/features/liestal_bloom_features.csv')

#### Vancouver

In [None]:
# Load the Vancouver weather time series and parse its 'Date' column.
# No bloom CSV is read for Vancouver in this notebook.
vancouver_raw_timeseries = pd.read_csv( "../../data/processed_data/Timeseries_weather_Vancouver_Tbase.csv")
vancouver_timeseries_df = get_time_series(vancouver_raw_timeseries)

# Keep only December-March rows, labelled with season and bloom_year.
vancouver_season_timeseries = get_city_seasons_df(vancouver_timeseries_df)

In [23]:
# Per-season and per-month temperature averages for each bloom year.
vancouver_seasons_avg_temps = get_seasonal_avg_temps(vancouver_season_timeseries)
vancouver_monthly_avg_temps = get_monthly_avg_temps(vancouver_season_timeseries)

In [24]:
# Build a placeholder bloom table for Vancouver: one row per bloom year found
# in the seasonal averages, carrying only the site's location metadata.
vancouver_bloom_df = pd.DataFrame(
    {'year': vancouver_seasons_avg_temps['bloom_year'].unique().tolist()}
).assign(
    location='vancouver',
    lat=49.2236916,
    long=-123.1636251,
    alt=24,
)[['location','lat','long','alt','year']]

In [None]:
# Attach the seasonal and monthly averages to each Vancouver year row.
vancouver_features = format_features_df(vancouver_bloom_df, vancouver_seasons_avg_temps, vancouver_monthly_avg_temps)

In [26]:
# Save the Vancouver feature table (all columns; kept_cols is not applied).
vancouver_features.to_csv('../../data/processed_data/features/vancouver_features.csv')

### Format future weather features for prediction 

In [27]:
def get_future_weather_features(location, lat, lng, alt, city_timeseries_df, city_seasons_avg_temps, city_monthly_avg_temps, year=2022, agdd_date=None):
  """Build the one-row feature table used to predict a city's bloom.

  Args:
    location: Site name stored in the 'location' column.
    lat: Latitude stored in the 'lat' column.
    lng: Longitude stored in the 'long' column.
    alt: Altitude stored in the 'alt' column.
    city_timeseries_df: Daily frame with 'Date' and 'AGDD' columns.
    city_seasons_avg_temps: Output of get_seasonal_avg_temps for the city.
    city_monthly_avg_temps: Output of get_monthly_avg_temps for the city.
    year: Bloom year to extract the features for. Defaults to 2022, the value
      that was previously hard-coded.
    agdd_date: Date whose AGDD value fills 'AGDD_Mar_13'. Defaults to March 13
      of `year` (i.e. '2022-03-13' for the default year). The column keeps the
      name 'AGDD_Mar_13' whatever date is passed.

  Returns:
    A DataFrame with a single row (index 0) and the columns location, lat,
    long, alt, year, AGDD_Mar_13, then '<winter|spring>_<stat>' and
    '<dec|jan|feb|march>_<stat>' for stat in avg_daily_temp, avg_tmax,
    avg_tmin. Columns get the dtype pandas infers from their value.

  Raises:
    IndexError: If the AGDD value or any season/month average is missing for
      the requested year.
  """
  if agdd_date is None:
    agdd_date = f'{year}-03-13'

  def first_match(frame, mask, column, description):
    # Same lookup as `.values[0]`, but with an error that says what is missing.
    matches = frame.loc[mask, column].values
    if len(matches) == 0:
      raise IndexError(f'no {description} found for {location!r}')
    return matches[0]

  stats = ('avg_daily_temp', 'avg_tmax', 'avg_tmin')
  # Insertion order of this dict defines the column order of the result.
  features = {'location': location, 'lat': lat, 'long': lng, 'alt': alt, 'year': year}

  features['AGDD_Mar_13'] = first_match(
      city_timeseries_df, city_timeseries_df['Date'] == agdd_date, 'AGDD', f'AGDD value on {agdd_date}')

  in_season_year = city_seasons_avg_temps['bloom_year'] == year
  for season in ('winter', 'spring'):
    season_mask = in_season_year & (city_seasons_avg_temps['season'] == season)
    for stat in stats:
      features[f'{season}_{stat}'] = first_match(
          city_seasons_avg_temps, season_mask, f'{stat}_per_season', f'{season} {stat} for bloom year {year}')

  in_month_year = city_monthly_avg_temps['bloom_year'] == year
  for month, prefix in ((12, 'dec'), (1, 'jan'), (2, 'feb'), (3, 'march')):
    month_mask = in_month_year & (city_monthly_avg_temps['month'] == month)
    for stat in stats:
      features[f'{prefix}_{stat}'] = first_match(
          city_monthly_avg_temps, month_mask, f'{stat}_per_month', f'{prefix} {stat} for bloom year {year}')

  return pd.DataFrame([features], columns=list(features))
    

In [28]:
# Build the one-row prediction feature table for each of the four sites
# (arguments: location name, latitude, longitude, altitude, then the site's
# time series and its seasonal and monthly averages).
dc_future_weather_features = get_future_weather_features('washingtondc', 38.88535, -77.038628, 0, dc_timeseries_df, dc_seasons_avg_temps, dc_monthly_avg_temps)
kyoto_future_weather_features = get_future_weather_features('kyoto', 35.011983, 135.676114, 44, kyoto_timeseries_df, kyoto_seasons_avg_temps, kyoto_monthly_avg_temps)
liestal_future_weather_features = get_future_weather_features('liestal', 47.4814, 7.730519, 350, liestal_timeseries_df, liestal_seasons_avg_temps, liestal_monthly_avg_temps)
vancouver_future_weather_features = get_future_weather_features('vancouver', 49.223692, -123.163625, 24, vancouver_timeseries_df, vancouver_seasons_avg_temps, vancouver_monthly_avg_temps)

In [30]:
# Stack the four per-city rows into one table with a fresh 0..3 index and save
# it. pd.concat replaces the chained DataFrame.append calls, which no longer
# exist in pandas 2.0+.
future_weather_features_all_loc = pd.concat(
    [
        dc_future_weather_features,
        kyoto_future_weather_features,
        liestal_future_weather_features,
        vancouver_future_weather_features,
    ],
    ignore_index=True,
)
future_weather_features_all_loc.to_csv('../../data/processed_data/features/future_weather_features_all_loc.csv')