## Add growing degree days to raw tall data from seasons with days to flowering and/or days to flag leaf emergence

In [1]:
import datetime as datetime
import numpy as np
import pandas as pd

### Functions

In [3]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each dataframe to the csv path at the same position.

    Dataframes and paths are paired by position, and every file is written
    with ``index=False`` so the row index is not saved as a column.

    Parameters
    ----------
    list_of_dfs : iterable of pandas.DataFrame
        Dataframes to write.
    list_of_output_filenames : iterable of str or path-like
        Destination csv paths, one per dataframe, in the same order.

    Raises
    ------
    ValueError
        If the number of dataframes differs from the number of paths.
        Nothing is written in that case.
    """
    dataframes = list(list_of_dfs)
    output_paths = list(list_of_output_filenames)

    # zip() stops at the shorter input, which would silently skip writing
    # some dataframes; fail loudly before touching the filesystem instead.
    if len(dataframes) != len(output_paths):
        raise ValueError(
            f'Got {len(dataframes)} dataframes but {len(output_paths)} output filenames'
        )

    for dataframe, output_path in zip(dataframes, output_paths):
        dataframe.to_csv(output_path, index=False)

### MAC Season 4

In [4]:
# Raw MAC season 4 records. low_memory=False parses the whole file at once
# rather than in chunks.
s4_0 = pd.read_csv(
    'data/raw/mac_season_four_2020-04-22.csv',
    low_memory=False,
)
print(s4_0.shape)

(375150, 39)


### Season 4 Days to Flowering

In [5]:
# Processed season 4 table holding the gdd_to_flowering values used below.
fl_0 = pd.read_csv(
    'data/processed/mac_season_4_days_gdd_to_flowering.csv',
)
print(fl_0.shape)

(156, 12)


In [6]:
# s4_0.columns

In [7]:
# Site, location, cultivar and treatment columns from the raw season 4 data.
s4_sites = s4_0[[
    'site_id',
    'sitename',
    'lat',
    'lon',
    'cultivar',
    'cultivar_id',
    'treatment',
    'treatment_id',
]]

Get information from original raw data that was not included in the `days_to_flowering` dataset

In [8]:
# Keep the first occurrence of each distinct row and renumber the index from 0.
s4_sites_unique = s4_sites.drop_duplicates(keep='first', ignore_index=True)
print(s4_sites_unique.shape)

(2218, 8)


In [9]:
# Left join: every flowering row is kept and matched on plot == sitename.
# Columns present in both frames get the default _x (fl_0) / _y (raw) suffixes.
fl_sites = pd.merge(
    fl_0,
    s4_sites_unique,
    how='left',
    left_on='plot',
    right_on='sitename',
)
print(fl_sites.shape)

(156, 20)


In [10]:
# Drop the 'Unnamed: 0' column from the raw csv.
s4_1 = s4_0.drop(columns='Unnamed: 0')

In [11]:
# One record per row of fl_sites. Scalar values are broadcast to every row
# when the dict is turned into a dataframe.
# NOTE(review): 'treatment' uses treatment_x (the fl_0 side of the merge) and
# 'sitename' uses the 'plot' column, while the flag leaf emergence cell uses
# treatment_y / sitename (the raw-data side) - confirm which is intended.
data = {
    'checked': 0,
    'result_type': 'traits',
    'id': np.nan,
    'citation_id': np.nan,
    'site_id': fl_sites['site_id'].values,
    'treatment_id': fl_sites['treatment_id'].values,
    'sitename': fl_sites['plot'].values,
    'city': 'Maricopa',
    'lat': fl_sites['lat'].values,
    'lon': fl_sites['lon'].values,
    'scientificname': 'Sorghum bicolor',
    'commonname': 'sorghum',
    'genus': 'Sorghum',
    'species_id': 2588,
    'cultivar_id': fl_sites['cultivar_id'].values,
    'author': np.nan,
    'citation_year': np.nan,
    'treatment': fl_sites['treatment_x'].values,
    'date': np.nan,
    'time': np.nan,
    'raw_date': np.nan,
    'month': np.nan,
    'year': np.nan,
    'dateloc': np.nan,
    'trait': 'gdd_to_flowering',
    'trait_description': 'accumulated growing degree days to flowering',
    'mean': fl_sites['gdd_to_flowering'].values,
    'units': 'C',
    'n': np.nan,
    'statname': np.nan,
    'stat': np.nan,
    'notes': np.nan,
    'access_level': np.nan,
    'cultivar': fl_sites['cultivar'].values,
    'entity': np.nan,
    'method_name': np.nan,
    'view_url': np.nan,
    'edit_url': np.nan,
}

The `np.nan` values do not need to be specified explicitly, since `pd.concat()` will fill in missing columns as needed.

In [12]:
# Turn the flowering records dict into a dataframe (one column per key).
gdd_flowering_s4 = pd.DataFrame(data)
print(gdd_flowering_s4.shape)

(156, 38)


In [13]:
# gdd_flowering_s4.isnull().sum()

Add `gdd_to_flowering` dataframe to raw season 4 data

In [14]:
# Append the flowering rows below the raw season 4 rows with a fresh index.
tall_s4 = pd.concat(
    objs=[s4_1, gdd_flowering_s4],
    ignore_index=True,
)
print(tall_s4.shape)
tall_s4.tail(3)

(375306, 38)


Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,lon,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
375303,0,traits,,,6000006019,6000000024,MAC Field Scanner Season 4 Range 46 Column 5,Maricopa,33.076165,-111.974983,...,,,,,,PI535796,,,,
375304,0,traits,,,6000006013,6000000024,MAC Field Scanner Season 4 Range 46 Column 7,Maricopa,33.076165,-111.97495,...,,,,,,PI534120,,,,
375305,0,traits,,,6000006027,6000000024,MAC Field Scanner Season 4 Range 51 Column 7,Maricopa,33.076345,-111.97495,...,,,,,,PI641830,,,,


### Season 4 Days to Flag Leaf Emergence

In [15]:
# Processed season 4 table holding the gdd_to_flag_leaf_emergence values used below.
fle_0 = pd.read_csv(
    'data/processed/mac_season_4_days_gdd_to_flag_leaf_emergence.csv',
)
print(fle_0.shape)

(176, 12)


In [16]:
# Re-check the size of the de-duplicated site table before merging it again.
print(s4_sites_unique.shape)

(2218, 8)


In [17]:
# Left join: every flag leaf row is kept and matched on plot == sitename.
# Columns present in both frames get the default _x (fle_0) / _y (raw) suffixes.
fle_sites = pd.merge(
    fle_0,
    s4_sites_unique,
    how='left',
    left_on='plot',
    right_on='sitename',
)
print(fle_sites.shape)

(176, 20)


In [18]:
# fle_sites.columns

In [19]:
# Keep only the columns needed to build the flag leaf emergence records.
# treatment_y is the 'treatment' column from the raw-data side of the merge.
fle_sites_1 = fle_sites[[
    'gdd_to_flag_leaf_emergence',
    'site_id',
    'sitename',
    'lat',
    'lon',
    'cultivar',
    'cultivar_id',
    'treatment_y',
    'treatment_id',
]]
print(fle_sites_1.shape)

(176, 9)


In [20]:
# One record per row of fle_sites_1. Scalar values are broadcast to every row
# when the dict is turned into a dataframe.
# NOTE(review): 'treatment' and 'sitename' come from the raw-data side of the
# merge here, while the flowering cell uses treatment_x / plot - confirm which
# is intended.
fle_data = {
    'checked': 0,
    'result_type': 'traits',
    'id': np.nan,
    'citation_id': np.nan,
    'site_id': fle_sites_1['site_id'].values,
    'treatment_id': fle_sites_1['treatment_id'].values,
    'sitename': fle_sites_1['sitename'].values,
    'city': 'Maricopa',
    'lat': fle_sites_1['lat'].values,
    'lon': fle_sites_1['lon'].values,
    'scientificname': 'Sorghum bicolor',
    'commonname': 'sorghum',
    'genus': 'Sorghum',
    'species_id': 2588,
    'cultivar_id': fle_sites_1['cultivar_id'].values,
    'author': np.nan,
    'citation_year': np.nan,
    'treatment': fle_sites_1['treatment_y'].values,
    'date': np.nan,
    'time': np.nan,
    'raw_date': np.nan,
    'month': np.nan,
    'year': np.nan,
    'dateloc': np.nan,
    'trait': 'gdd_to_flag_leaf_emergence',
    'trait_description': 'accumulated growing degree days to flag leaf emergence',
    'mean': fle_sites_1['gdd_to_flag_leaf_emergence'].values,
    'units': 'C',
    'n': np.nan,
    'statname': np.nan,
    'stat': np.nan,
    'notes': np.nan,
    'access_level': np.nan,
    'cultivar': fle_sites_1['cultivar'].values,
    'entity': np.nan,
    'method_name': np.nan,
    'view_url': np.nan,
    'edit_url': np.nan,
}

In [21]:
# Turn the flag leaf emergence records dict into a dataframe (one column per key).
fle_s4 = pd.DataFrame(fle_data)
print(fle_s4.shape)

(176, 38)


In [22]:
# Append the flag leaf emergence rows below the raw + flowering rows with a fresh index.
fl_fle_tall_s4 = pd.concat(
    objs=[tall_s4, fle_s4],
    ignore_index=True,
)
print(fl_fle_tall_s4.shape)
fl_fle_tall_s4.tail(3)

(375482, 38)


Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,lon,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
375479,0,traits,,,6000006019,6000000024,MAC Field Scanner Season 4 Range 46 Column 5,Maricopa,33.076165,-111.974983,...,,,,,,PI535796,,,,
375480,0,traits,,,6000006013,6000000024,MAC Field Scanner Season 4 Range 46 Column 7,Maricopa,33.076165,-111.97495,...,,,,,,PI534120,,,,
375481,0,traits,,,6000006027,6000000024,MAC Field Scanner Season 4 Range 51 Column 7,Maricopa,33.076345,-111.97495,...,,,,,,PI641830,,,,


### MAC Season 6
Does not contain any added growing degree day traits

### KSU

In [23]:
# Raw KSU records.
ksu = pd.read_csv(
    'data/raw/ksu_data_2020-06-11.csv',
)
print(ksu.shape)

(29079, 39)


In [24]:
# Drop the 'Unnamed: 0' column from the raw csv.
ksu_1 = ksu.drop(columns='Unnamed: 0')

In [25]:
# Processed KSU flowering table holding the gdd values used below.
ksu_fl_0 = pd.read_csv(
    'data/processed/ksu_flowering_2020-06-15T164738.csv',
)
print(ksu_fl_0.shape)

(164, 10)


In [26]:
# Site and cultivar identifier columns from the raw KSU data.
ksu_sites = ksu_1[[
    'site_id',
    'sitename',
    'cultivar',
    'cultivar_id',
]]
print(ksu_sites.shape)

(29079, 4)


In [27]:
# Keep the first occurrence of each distinct row; the original index labels are retained.
ksu_sites_unique = ksu_sites.drop_duplicates(keep='first')
print(ksu_sites_unique.shape)

(919, 4)


In [28]:
# Left join on the shared sitename + cultivar keys: every flowering row is kept.
ksu_fl_sites = ksu_fl_0.merge(
    ksu_sites_unique,
    how='left',
    on=['sitename', 'cultivar'],
)
print(ksu_fl_sites.shape)

(164, 12)


In [29]:
# One record per row of ksu_fl_sites. Scalar values are broadcast to every row
# when the dict is turned into a dataframe. Only the known columns are listed.
ksu_fl_data = {
    'checked': 0,
    'result_type': 'traits',
    'site_id': ksu_fl_sites['site_id'].values,
    'sitename': ksu_fl_sites['sitename'].values,
    'city': 'Ashland',
    'lat': ksu_fl_sites['lat'].values,
    'lon': ksu_fl_sites['lon'].values,
    'scientificname': 'Sorghum bicolor',
    'commonname': 'sorghum',
    'genus': 'Sorghum',
    'species_id': 2588,
    'cultivar_id': ksu_fl_sites['cultivar_id'].values,
    'trait': 'gdd_to_flowering',
    'trait_description': 'accumulated growing degree days to flowering',
    'mean': ksu_fl_sites['gdd'].values,
    'units': 'C',
    'cultivar': ksu_fl_sites['cultivar'].values,
}

In [30]:
# Turn the KSU flowering records dict into a dataframe (one column per key).
ksu_fl_df = pd.DataFrame(ksu_fl_data)
print(ksu_fl_df.shape)

(164, 17)


In [31]:
# Append the flowering rows below the raw KSU rows with a fresh index.
# Columns that exist only in ksu_1 are left missing for the appended rows.
tall_ksu_df = pd.concat(
    objs=[ksu_1, ksu_fl_df],
    ignore_index=True,
)
print(tall_ksu_df.shape)
tall_ksu_df.tail(3)

(29243, 38)


Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,lon,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
29240,0,traits,,,6000009084,,Ashland Bottoms KSU 2016 Season Range 35 Pass 7,Ashland,39.140104,-96.631815,...,,,,,,PI329403,,,,
29241,0,traits,,,6000004390,,Ashland Bottoms KSU 2016 Season Range 22 Pass 12,Ashland,39.139639,-96.631632,...,,,,,,PI329256,,,,
29242,0,traits,,,6000009164,,Ashland Bottoms KSU 2016 Season Range 32 Pass 9,Ashland,39.139996,-96.631742,...,,,,,,PI329665,,,,


### Clemson

### Convert dataframes to `.csv` files

In [32]:
# Dataframes and destination paths are paired by position.
list_of_dfs = [
    fl_fle_tall_s4,
    tall_ksu_df,
]
list_of_df_filepaths = [
    'data/interim/tall_season_four.csv',
    'data/interim/tall_ksu_data.csv',
]

save_to_csv_without_timestamp(list_of_dfs, list_of_df_filepaths)