In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


In [None]:
# Switch the working directory to the data folder; the relative file names
# passed to to_csv() later in this notebook are resolved against it.
os.chdir('/Users/danielmederer/Data Storage Folder')
# Echo the directory now in effect.
os.getcwd()

In [None]:
# Read the TRY database export (tab-separated text, Latin-1 encoded),
# restricted to a fixed subset of its columns.
# Not loaded (kept as a reminder): 'OriglName', 'OrigValueStr'
TRYdata = pd.read_csv(
    '/Users/danielmederer/Data Storage Folder/Try_full.txt',
    sep='\t',
    encoding='latin_1',
    usecols=[
        'AccSpeciesName', 'ObservationID', 'DatasetID', 'DataID', 'ObsDataID',
        'TraitID', 'TraitName', 'StdValue', 'OrigUnitStr', 'Reference',
        'ValueKindName', 'Replicates', 'UnitName', 'ErrorRisk',
    ],
)

In [None]:
# Spot-check: 30 randomly drawn records of area-based leaf nitrogen content.
is_leaf_n_per_area = TRYdata['TraitName'] == "Leaf nitrogen (N) content per leaf area"
TRYdata[is_leaf_n_per_area].sample(30)

In [None]:
## Correct seed mass conversion error by TRY

# Records with DatasetID 541 and DataID 30 get their StdValue scaled by 1000.
# NOTE: the scaling happens in place, so running this cell a second time on
# the same frame applies the factor again.
seed_mass_rows = TRYdata['DatasetID'].eq(541) & TRYdata['DataID'].eq(30)
TRYdata.loc[seed_mass_rows, 'StdValue'] = TRYdata.loc[seed_mass_rows, 'StdValue'] * 1000

# Keep the corrected records at hand for visual inspection.
test = TRYdata.loc[seed_mass_rows]

test

In [None]:
## Drop chl rows with other units than mass/area

# Original unit labels to discard.
# NOTE(review): the filter looks at OrigUnitStr only, so it removes records of
# any trait carrying one of these labels, not just chlorophyll - confirm.
excluded_units = [
    "SPAD",
    "spad",
    "CCM Chlorophyll Content Index",
    "Falker Chlorofilog Chlorophyll Content Index FCI",
    "CCI ",  # the trailing blank is part of the label
    "nan",   # literal text "nan"; real missing values are handled below
]
has_excluded_unit = TRYdata['OrigUnitStr'].isin(excluded_units)
TRYdata.drop(index=TRYdata[has_excluded_unit].index, inplace=True)

# Records without any original unit string are removed as well.
TRYdata.dropna(subset=["OrigUnitStr"], inplace=True)


TRYdata

In [None]:

# (rows, columns) of the TRY frame at this point of the notebook.
TRYdata.shape

In [None]:
# Drop tuples that lack a trait ID, a standardised value or a species name
required_columns = ["TraitID", "StdValue", "AccSpeciesName"]
TRYdata.dropna(subset=required_columns, inplace=True)

# Count of remaining missing entries, one line per column.
# (The check reads TraitName, whereas the filter above used TraitID.)
for checked_column in ('TraitName', 'StdValue', 'AccSpeciesName'):
    print(TRYdata[checked_column].isna().sum())
TRYdata

In [None]:
## Drop tuples with |ErrorRisk|>4 (distance from the mean/std)

# Only rows for which the comparison is True are removed; a missing
# ErrorRisk compares as False and therefore stays in the frame.
exceeds_error_risk = TRYdata["ErrorRisk"].abs() > 4
TRYdata.drop(index=TRYdata[exceeds_error_risk].index, inplace=True)
TRYdata

In [None]:
## Data testing for errors

# Which original unit strings are still present for area-based chlorophyll?
chl_area_trait = "Leaf chlorophyll content per leaf area"
test = TRYdata[TRYdata['TraitName'] == chl_area_trait]

test["OrigUnitStr"].unique()

In [None]:
# Records of trait ID 413, largest standardised value first.
TRYdata.loc[TRYdata['TraitID'] == 413.0].sort_values(by='StdValue', ascending=False)


In [None]:
# Number of distinct ObservationID values in the TRY frame.
TRYdata['ObservationID'].nunique()

In [None]:
# Number of distinct ObsDataID values in the TRY frame.
TRYdata['ObsDataID'].nunique()

In [None]:
# Number of distinct species names in the TRY frame.
TRYdata['AccSpeciesName'].nunique()

In [None]:
# Number of distinct trait names in the TRY frame.
TRYdata['TraitName'].nunique()

In [None]:
# Re-structure dataframe to get columns for each trait
# One output row per combination of the key columns below, one output column
# per TraitName, cells filled from StdValue (pivot_table's default aggregation
# applies if a key combination occurs more than once for a trait).
record_keys = ['AccSpeciesName', 'ObservationID', 'ObsDataID', 'UnitName']
trait_table = TRYdata.pivot_table(values='StdValue', index=record_keys, columns='TraitName')

# Turn the key levels back into ordinary columns and clear the leftover
# 'TraitName' label from the column axis.
exp_df = trait_table.reset_index().rename_axis(None, axis=1)
exp_df

In [None]:

# Column labels of the pivoted table (key columns followed by trait names).
exp_df.columns

In [None]:
# Same table, with the species column under the shorter label 'Species'.
new_df = exp_df.rename({'AccSpeciesName': 'Species'}, axis='columns')

new_df


In [None]:
# Inspect the rows with the largest vessel / conduit element lengths.
vessel_length_column = "Wood vessel element length; stem conduit (vessel and tracheids) element length"
test = new_df.sort_values(by=vessel_length_column, ascending=False)
test.head()



In [None]:
# Disabled clean-up / inspection steps, kept for reference:
#TRYdata.drop(['Unnamed: 0', 'Unnamed: 5', 'ObservationID'],axis=1,inplace=True)
# TRYdata.sample(30)

# Write the wide trait table (without its row index) to a CSV file in the
# current working directory.
new_df.to_csv('TRY_processed_unfinished.csv', index = False)


In [None]:
######
######
######

In [None]:

# Re-load _processed_ TRY (from Fuzzy matching script) for merging with eyadata
TRYdata_pr = pd.read_csv(
    '/Users/danielmederer/Data Storage Folder/TRY_values_processed_finished.csv',
    sep=',',
)

# Column labels once considered for this frame (not applied):
#   "SLA_mm2mg", "chlc_mygcm2", "LDMC_gg", "nitrogenc_gm2"

TRYdata_pr.sample(10)

In [None]:
# Calculate LMA and get the correct units! (g/m2)
# LMA is the reciprocal of SLA; going by the column names, the factor 1000
# takes mg/mm2 to g/m2.
sla_mm2mg = TRYdata_pr['SLA_mm2mg']
TRYdata_pr['LMA_gm2'] = 1 / sla_mm2mg * 1000

# Calculate EWT => NAs lead to NAs
# Water per area = LMA * (1 - LDMC) / LDMC; the final division by 10 takes
# g/m2 to mg/cm2 (going by the column names).
lma_gm2 = TRYdata_pr['LMA_gm2']
ldmc_gg = TRYdata_pr['LDMC_gg']
water_per_area = (lma_gm2 - lma_gm2 * ldmc_gg) / ldmc_gg
TRYdata_pr['EWT_mgcm2'] = water_per_area / 10

# Change unit from gm2 to mygcm2 for chlorophyll
# NOTE: the column is overwritten with its rescaled self, so running this cell
# again on the same frame multiplies by 100 once more.
TRYdata_pr['chlc_mygcm2'] = TRYdata_pr['chlc_mygcm2'].mul(100)



In [None]:
# EWT values, largest first, to eyeball implausible extremes.
TRYdata_pr['EWT_mgcm2'].sort_values(ascending=False)

In [None]:
# The 300 records with the highest chlorophyll values.
test = TRYdata_pr.sort_values(by='chlc_mygcm2', ascending=False)
test.head(300)

In [None]:
# next: concatenate TRY data with Eyas data

# load Eya's data
eyadata = pd.read_csv(
    '/Users/danielmederer/Data Storage Folder/eyadata_processed_finished1.csv',
    sep=',',
)

# Show the available column labels; the next cell selects from them by name.
eyadata.columns

In [None]:
# Columns taken over from Eya's data, mapped to the short labels used for
# merging. The source labels are spelled exactly as they appear in the loaded
# frame (including the mis-encoded unit characters).
# NOTE(review): the source labels for anthocyanin and carotenoid carry the
# same unit text as chlorophyll, yet their short names end in "_gcm2" rather
# than "_mygcm2" - confirm the naming.
eya_column_names = {
    'Anthocyanin content (Î¼g/cmÂ²)': "antc_gcm2",
    'Carotenoid content (Î¼g/cmÂ²)': "caroc_gcm2",
    'Chl content (Î¼g/cmÂ²)': "chlc_mygcm2",
    'LAI (mÂ²/mÂ²)': "LAI_m2m2",
    'LDMC (g/g)': "LDMC_gg",
    'LMA (g/mÂ²)': "LMA_gm2",
    'EWT (mg/cmÂ²)': "EWT_mgcm2",
}
eya_combine = eyadata[list(eya_column_names)].rename(columns=eya_column_names)

eya_combine.sample(20)

In [None]:
# another conversion error test: Eya's records ordered by EWT, largest first
test = eya_combine.sort_values(by='EWT_mgcm2', ascending=False)
test

In [None]:
# Concatenate dataframes
# The rows of both frames are stacked and renumbered from 0; a column present
# in only one of the frames is filled with NaN for the rows of the other.
frames_to_stack = [TRYdata_pr, eya_combine]
df_merged = pd.concat(frames_to_stack, ignore_index=True)

# Alternative that was tried: pd.merge(TRYdata_pr, eya_combine, how = "outer")

df_merged

In [None]:
# Concatenate hierarchy data for species names
# Both hierarchy tables are comma-separated and read as Latin-1.
hier_TRY = pd.read_csv(
    '/Users/danielmederer/Data Storage Folder/TRY_hierarchy_processed_finished.csv',
    sep=',',
    encoding='latin_1',
)
hier_eya = pd.read_csv(
    '/Users/danielmederer/Data Storage Folder/hierarchy_processed.csv',
    sep=',',
    encoding='latin_1',
)

# TRY rows first, then Eya's rows, renumbered from 0; only the three
# taxonomic columns are kept.
stacked_hierarchy = pd.concat([hier_TRY, hier_eya], ignore_index=True)
hier_combined = stacked_hierarchy[["Species", "Genus", "Family"]]

hier_combined

In [None]:
# Save both
# Trait values: written without the row index.
df_merged.to_csv('gapfilling_combined_data.csv', index = False)

# Hierarchy: written WITH the row index as an extra leading column.
# NOTE(review): this differs from the trait file written just before -
# confirm that the consumer of this file expects the index column.
hier_combined.to_csv('gapfilling_combined_hierarchy.csv', index = True)


In [None]:
#TRYdata.to_csv(r'TRY_processed.csv', index = False)