In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta
from suntimes import SunTimes  

In [2]:
# Columns used as the join keys when the per-variable tables are merged.
identifying_columns = [
    'Stations_id',
    'lat',
    'lon',
    'Management',
    'vargroup',
    'Referenzjahr',
]

In [3]:
# Read one CSV per variable, convert the flowering interval to whole days and
# keep only the rows whose interval is longer than 30 days.
ds_list = []
for variable_name in ['t2m', 't2min', 't2max', 'vpd', 'tp', 'ssrd']:
    csv_path = f'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\Maize_ML_data_SSA_{variable_name}_varieties2.csv'
    ds_SSA = pd.read_csv(csv_path)
    # Drop only the rows in which every field is missing.
    ds_SSA = ds_SSA.dropna(how='all')
    # The column is parsed as timedeltas and reduced to an integer day count.
    flowering_days = pd.to_timedelta(ds_SSA['observed time to beginning of flowering']).dt.days
    ds_SSA['observed time to beginning of flowering'] = flowering_days
    longer_than_30_days = ds_SSA['observed time to beginning of flowering'] > 30
    ds_SSA = ds_SSA.loc[longer_than_30_days]
    ds_list.append(ds_SSA)

In [5]:
# Start from the first table in ds_list and inner-join the daily columns of
# every remaining variable onto it, matching rows on identifying_columns.
numdays = 200
ds_full = ds_list[0]
remaining_variables = ['t2min', 't2max', 'vpd', 'tp', 'ssrd']
for list_position, variable_name in enumerate(remaining_variables, start=1):
    print(variable_name)
    daily_columns = [f'{variable_name} at day {n}' for n in range(numdays)]
    variable_table = ds_list[list_position][identifying_columns + daily_columns]
    ds_full = ds_full.merge(variable_table, on=identifying_columns)

t2min
t2max
vpd
tp
ssrd


In [8]:
def get_categories(anthesis_date):
    """Return a 200-element int64 vector of 0/1 flags, one per day index.

    Entry ``n`` is 1 when ``anthesis_date < n`` and 0 otherwise. The argument
    may be a scalar or a one-element row; singleton axes are squeezed away
    before the result is returned.
    """
    reached = []
    for day in range(200):
        reached.append(anthesis_date < day)
    stacked = np.array(reached)
    return np.squeeze(stacked).astype(np.int64)
# Apply get_categories to every row's flowering day count, then spread the
# per-row flag vectors into one column per day.
cats = ds_full[['observed time to beginning of flowering']].apply(get_categories, args = (), axis = 1)
cats_unlisted = cats.apply(lambda x: pd.Series(x)) # one integer-labelled column per day
# Label every generated column from its own day index. get_categories returns
# 200 flags per row; a fixed range(183) mapping left columns 183..199 with bare
# integer labels in the merged table.
cats_unlisted = cats_unlisted.rename(columns = lambda n: f'dev stage at day {n}')
# Attach the flag columns to the rows they were computed from (index join).
ds_full = ds_full.merge(cats_unlisted, left_index=True, right_index = True)

In [9]:
# "DTF at day n" = observed days to beginning of flowering minus n, for n = 0..192.
dtf_columns = [f'DTF at day {n}' for n in range(193)]
flowering_days = ds_full['observed time to beginning of flowering'].values
# Broadcast a (rows, 1) column against a (1, 193) row of day offsets.
days_remaining = flowering_days[:, np.newaxis] - np.arange(0, 193)[np.newaxis, :]
ds_full.loc[:, dtf_columns] = days_remaining

In [10]:
def get_photoperiods(inputs):
    """Return 170 daily photoperiod lengths in hours as a float64 array.

    ``inputs`` is read positionally: [lon, lat, day of start]. For each of the
    170 days beginning at the start date, the length is the difference between
    ``sun.setutc(day)`` and ``sun.riseutc(day)`` (presumably sunset and sunrise
    in UTC - confirm against the suntimes package), divided by 3600.
    """
    lon = inputs.iloc[0]
    lat = inputs.iloc[1]
    start = inputs.iloc[2]
    # Rebuild the start as a plain datetime at midnight.
    day_of_start = datetime(start.year, start.month, start.day)
    sun = SunTimes(lon, lat) # altitude could be passed as well
    lengths = []
    for n in range(170):
        current_day = day_of_start + relativedelta(days=n)
        sunset = pd.to_datetime(sun.setutc(current_day))
        sunrise = pd.to_datetime(sun.riseutc(current_day))
        # .seconds is the within-day component of the difference only.
        lengths.append((sunset - sunrise).seconds/3600)
    return np.array(lengths).squeeze().astype(np.float64)

In [11]:
def get_station_locations(dataset, ds_stations):
    """Add 'lat' and 'lon' columns to ``dataset`` from a station table.

    Each value of ``dataset['sitecode']`` is looked up in
    ``ds_stations['LocationID']`` and the matching 'Latitude' / 'Longitude'
    values are written to ``dataset['lat']`` / ``dataset['lon']``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'sitecode' column. It is modified in place.
    ds_stations : pandas.DataFrame
        Must contain 'LocationID', 'Latitude' and 'Longitude' columns. It is
        left untouched (its index is no longer overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with 'lat' and 'lon' filled in.

    Raises
    ------
    KeyError
        If a sitecode has no matching LocationID.
    """
    # Build a LocationID-indexed lookup without mutating the caller's frame.
    # When a LocationID occurs more than once, the first occurrence is used.
    lookup = ds_stations.drop_duplicates(subset='LocationID').set_index('LocationID')

    # Fail loudly on unknown site codes instead of silently producing NaN.
    unknown = dataset.loc[~dataset['sitecode'].isin(lookup.index), 'sitecode'].unique()
    if len(unknown):
        raise KeyError(f'sitecode(s) not found in ds_stations: {list(unknown)}')

    # Assign positionally so the result does not depend on dataset's index.
    dataset['lat'] = dataset['sitecode'].map(lookup['Latitude']).to_numpy()
    dataset['lon'] = dataset['sitecode'].map(lookup['Longitude']).to_numpy()
    return dataset

In [12]:
# Read the site coordinate table and attach lat/lon to ds_full.
stations_csv = 'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\African_data\\Lobell2011\\EIL_site_latlon.csv'
stations_data_lobell = pd.read_csv(stations_csv)
# get_station_locations reads the station id from a column named 'sitecode'.
ds_full['sitecode'] = ds_full['Stations_id']
ds_full = get_station_locations(dataset=ds_full, ds_stations=stations_data_lobell)

  ds_full['sitecode'] = ds_full['Stations_id']


In [15]:
# Use the 'WC SOS date' column, parsed to datetimes, as the planting date.
ds_full['Planting date'] = pd.to_datetime(ds_full['WC SOS date'])
# Column order matters: get_photoperiods reads lon, lat and start date by position.
location_and_start = ds_full[['lon', 'lat', 'Planting date']]
photoperiods = location_and_start.apply(get_photoperiods, args=(), axis=1)
# Spread each row's array into one integer-labelled column per day, then name them.
photoperiods_unlisted = photoperiods.apply(lambda x: pd.Series(x))
photoperiod_names = {n: f'photoperiod at day {n}' for n in range(170)}
photoperiods_unlisted = photoperiods_unlisted.rename(columns=photoperiod_names)
ds_full = ds_full.merge(photoperiods_unlisted, left_index=True, right_index=True)

  ds_full['Planting date'] = pd.to_datetime(ds_full['WC SOS date'])


In [16]:
# Load the site table that carries an 'AEZ' column, rename its id column to
# match ds_full and drop the 'Unnamed: 0' column.
aez_csv = 'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\African_data\\Lobell2011\\EIL_site_latlon_with_AEZ.csv'
AEZ_data = pd.read_csv(aez_csv)
AEZ_data = AEZ_data.rename(columns={'LocationID': 'Stations_id'})
AEZ_data = AEZ_data.drop(columns='Unnamed: 0')

In [17]:
# Left join: every ds_full row is kept and receives the AEZ value of its station.
aez_by_station = AEZ_data[['Stations_id', 'AEZ']]
ds_full = ds_full.merge(aez_by_station, how='left', on='Stations_id')

In [24]:
# Write the assembled table to disk (the DataFrame index is written as well).
ssa_output_csv = f'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\saved_ML_datasets\\SSA_ML_ERA5_1999_2008_with_varieties2.csv'
ds_full.to_csv(ssa_output_csv)

In [2]:
# Load the t2m table, dropping every row that has any missing value, then
# convert the flowering interval to whole days and keep intervals above 30 days.
t2m_csv = 'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\Maize_ML_data_ERA5_t2m_pd_SOS.csv'
ds_t2m_DE = pd.read_csv(t2m_csv)
ds_t2m_DE = ds_t2m_DE.dropna()
flowering_days = pd.to_timedelta(ds_t2m_DE['observed time to beginning of flowering']).dt.days
ds_t2m_DE['observed time to beginning of flowering'] = flowering_days
longer_than_30_days = ds_t2m_DE['observed time to beginning of flowering'] > 30
ds_t2m_DE = ds_t2m_DE.loc[longer_than_30_days]

In [2]:
# Read one CSV per variable (path_home + variable name + path_suffix), convert
# the flowering interval to whole days and keep intervals above 30 days.
ds_list = []
path_home = 'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\Maize_ML_data_ERA5_'
# Effective suffix. Other suffixes this cell has been run with:
# '_pd_SOS.csv' and '_pd_SOS_90s.csv'.
path_suffix = '_pd_SOS_2001_2024.csv'
for variable_name in ['t2m', 't2min', 't2max', 'vpd', 'tp', 'ssrd']:
    csv_path = path_home + variable_name + path_suffix
    ds_DE = pd.read_csv(csv_path)
    # Drop only the rows in which every field is missing.
    ds_DE = ds_DE.dropna(how='all')
    flowering_days = pd.to_timedelta(ds_DE['observed time to beginning of flowering']).dt.days
    ds_DE['observed time to beginning of flowering'] = flowering_days
    longer_than_30_days = ds_DE['observed time to beginning of flowering'] > 30
    ds_DE = ds_DE.loc[longer_than_30_days]
    ds_list.append(ds_DE)

In [3]:
# Join keys for this dataset, then inner-join the first 180 daily columns of
# every remaining variable onto the first table in ds_list.
identifying_columns = ['Stations_id', 'lat', 'lon', 'Referenzjahr']
ds_full = ds_list[0]
remaining_variables = ['t2min', 't2max', 'vpd', 'tp', 'ssrd']
for list_position, variable_name in enumerate(remaining_variables, start=1):
    daily_columns = [f'{variable_name} at day {n}' for n in range(180)]
    variable_table = ds_list[list_position][identifying_columns + daily_columns]
    ds_full = ds_full.merge(variable_table, on=identifying_columns)

In [8]:
def get_categories(anthesis_date):
    """Return a 183-element int64 vector of 0/1 flags, one per day index.

    Entry ``n`` is 1 when ``anthesis_date < n`` and 0 otherwise. The argument
    may be a scalar or a one-element row; singleton axes are squeezed away
    before the result is returned.
    """
    flags_per_day = [anthesis_date < day for day in range(183)]
    stacked = np.array(flags_per_day)
    squeezed = stacked.squeeze()
    return squeezed.astype(np.int64)
# Apply get_categories to every row's flowering day count, spread the per-row
# flag vectors into one column per day, name the columns and join them back.
flowering_column = ds_full[['observed time to beginning of flowering']]
cats = flowering_column.apply(get_categories, args=(), axis=1)
cats_unlisted = cats.apply(lambda x: pd.Series(x))
stage_names = {n: f'dev stage at day {n}' for n in range(183)}
cats_unlisted = cats_unlisted.rename(columns=stage_names)
ds_full = ds_full.merge(cats_unlisted, left_index=True, right_index=True)

: 

In [29]:
# "DTF at day n" = observed days to beginning of flowering minus n, for n = 0..192.
dtf_columns = [f'DTF at day {n}' for n in range(193)]
stage_columns = [f'dev stage at day {n}' for n in range(193)]
flowering_days = ds_full['observed time to beginning of flowering'].values
# Broadcast a (rows, 1) column against a (1, 193) row of day offsets.
ds_full.loc[:, dtf_columns] = flowering_days[:, np.newaxis] - np.arange(0, 193)[np.newaxis, :]
# "dev stage at day n" is 1 once DTF has dropped to zero or below, else 0.
flowered = ds_full.loc[:, dtf_columns] <= 0
ds_full.loc[:, stage_columns] = flowered.astype(int).values
# Disabled consistency check kept from the original cell:
#np.any((ds_full.loc[:, [f'dev stage at day {n}' for n in range(193)]].values*ds_full.loc[:, [f'DTF at day {n}' for n in range(193)]].values) >0)

In [30]:
def get_photoperiods(inputs):
    """Return 170 daily photoperiod lengths in hours as a float64 array.

    ``inputs`` is read positionally: [lon, lat, day of start]. For each of the
    170 days beginning at the start date, the length is the difference between
    ``sun.setutc(day)`` and ``sun.riseutc(day)`` (presumably sunset and sunrise
    in UTC - confirm against the suntimes package), divided by 3600.
    """
    planting = inputs.iloc[2]
    # Rebuild the start as a plain datetime at midnight.
    day_of_start = datetime(planting.year, planting.month, planting.day)
    lon = inputs.iloc[0]
    lat = inputs.iloc[1]
    sun = SunTimes(lon, lat) # altitude could be passed as well

    def hours_of_daylight(offset):
        # Day length for the date `offset` days after the start date.
        current_day = day_of_start + relativedelta(days=offset)
        sunset = pd.to_datetime(sun.setutc(current_day))
        sunrise = pd.to_datetime(sun.riseutc(current_day))
        # .seconds is the within-day component of the difference only.
        return (sunset - sunrise).seconds/3600

    lengths = [hours_of_daylight(n) for n in range(170)]
    return np.array(lengths).squeeze().astype(np.float64)

In [31]:
# Make sure the planting date column holds datetimes before computing day lengths.
ds_full['Planting date'] = pd.to_datetime(ds_full['Planting date'])
# Column order matters: get_photoperiods reads lon, lat and start date by position.
location_and_start = ds_full[['lon', 'lat', 'Planting date']]
photoperiods = location_and_start.apply(get_photoperiods, args=(), axis=1)
# Spread each row's array into one integer-labelled column per day, then name them.
photoperiods_unlisted = photoperiods.apply(lambda x: pd.Series(x))
photoperiod_names = {n: f'photoperiod at day {n}' for n in range(170)}
photoperiods_unlisted = photoperiods_unlisted.rename(columns=photoperiod_names)
ds_full = ds_full.merge(photoperiods_unlisted, left_index=True, right_index=True)

In [32]:
# Write the assembled table to disk (the DataFrame index is written as well).
de_output_csv = f'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\saved_ML_datasets\\DE_ML_ERA5_2001_2024_full.csv'
ds_full.to_csv(de_output_csv)

In [None]:
# Reload two previously saved datasets and stack them into one table with a
# fresh 0..n-1 index.
csv_90s = f'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\saved_ML_datasets\\DE_ML_ERA5_90s.csv'
csv_2001_2024 = 'C:\\Users\\wlwc1989\\Documents\\Phenology_Test_Notebooks\\phenology_dwd\\results_for_comparing\\saved_ML_datasets\\DE_ML_ERA5_2001_2024.csv'
ds_full = pd.read_csv(csv_90s)
ds2 = pd.read_csv(csv_2001_2024)
ds_mega = pd.concat([ds_full, ds2], ignore_index=True, sort=False)
# Keep rows whose 'SORTE' is neither the "unknown variety" label nor missing.
not_unknown = ds_mega['SORTE'] != ' Mais, Sorte unbekannt'
ds_varieties = ds_mega.loc[not_unknown].dropna(subset=['SORTE'])

In [17]:
# 'SORTE' labels that carry an FAO number; rows matching any of them are displayed.
fao_labelled_varieties = [
    ' Mais, frühe Reife, FAO 220',
    ' Mais, frühe Reife, FAO 210',
    ' Mais, mittelfrühe Reife, FAO 240',
    ' Mais, mittelfrühe Reife, FAO 230',
    ' Mais, mittelspäte Reife, FAO 280',
    ' Mais, frühe Reife, FAO 200',
    ' Mais, frühe Reife, FAO 190',
    ' Mais, mittelfrühe Reife, FAO 250',
    ' Mais, frühe Reife, FAO 180',
    ' Mais, mittelspäte Reife, FAO 260',
    ' Mais, späte Reife, FAO 350',
    ' Mais, mittelspäte Reife, FAO 270',
    ' Mais, mittelspäte Reife, FAO 290',
    ' Mais, späte Reife, FAO 300',
    ' Mais, späte Reife, FAO 310',
    ' Mais, frühe Reife, FAO 170',
    ' Mais, späte Reife, FAO 320',
    ' Mais, späte Reife, FAO 330',
]
has_fao_label = ds_mega['SORTE'].isin(fao_labelled_varieties)
ds_mega.loc[has_fao_label]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Planting date,Referenzjahr,Stations_id,lat,lon,Objekt_id,Reporting method,Historic or recent,...,photoperiod at day 160,photoperiod at day 161,photoperiod at day 162,photoperiod at day 163,photoperiod at day 164,photoperiod at day 165,photoperiod at day 166,photoperiod at day 167,photoperiod at day 168,photoperiod at day 169
1,1,10,1991-04-25,1991,7516,54.4000,9.9667,215,annual,historic,...,11.533333,11.450000,11.400000,11.316667,11.250000,11.166667,11.100000,11.016667,10.950000,10.866667
2,2,11,1992-04-24,1992,7516,54.4000,9.9667,215,annual,historic,...,11.550000,11.483333,11.400000,11.333333,11.250000,11.183333,11.116667,11.050000,10.966667,10.900000
3,3,12,1993-04-24,1993,7516,54.4000,9.9667,215,annual,historic,...,11.566667,11.483333,11.433333,11.350000,11.283333,11.200000,11.133333,11.066667,10.983333,10.916667
4,4,13,1994-04-28,1994,7516,54.4000,9.9667,215,annual,historic,...,11.300000,11.216667,11.166667,11.083333,11.016667,10.933333,10.866667,10.783333,10.716667,10.650000
5,5,14,1995-05-02,1995,7516,54.4000,9.9667,215,annual,historic,...,11.016667,10.950000,10.900000,10.816667,10.750000,10.666667,10.600000,10.533333,10.450000,10.383333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26127,15222,84498,2001-05-06,2001,19734,51.5448,10.8748,215,annual,historic,...,10.850000,10.783333,10.716667,10.650000,10.583333,10.516667,10.450000,10.400000,10.333333,10.266667
26128,15223,84499,2002-04-26,2002,19734,51.5448,10.8748,215,annual,historic,...,11.516667,11.450000,11.366667,11.316667,11.250000,11.183333,11.116667,11.050000,10.983333,10.933333
26155,15250,84600,2001-04-20,2001,19740,52.8719,13.3927,215,annual,historic,...,11.883333,11.800000,11.733333,11.683333,11.600000,11.533333,11.450000,11.400000,11.333333,11.250000
26164,15259,84686,2001-05-03,2001,19741,52.1458,11.9563,215,annual,historic,...,11.016667,10.950000,10.883333,10.816667,10.750000,10.700000,10.616667,10.550000,10.483333,10.433333
