# Convert a collection of LiPD files to a Pandas dataframe and save as a pickle file

The LiPD format stores the metadata of a record in a clear structure, but LiPD files are slow to load.
Therefore, we'd like to convert it to another data structure that is fast to load.
In this notebook, we demonstrate how to convert a collection of PAGES2k LiPD files to a Pandas dataframe and save it to a pickle file for later use.

In [1]:
import numpy as np
import pandas as pd
import lipd
import os

In [2]:
def _timeseries_to_frame(ts_list, columns):
    ''' Collect the timeseries flagged for global temperature analysis into a Pandas DataFrame

    Args:
        ts_list (list of dict): the timeseries extracted from the LiPD objects
        columns (list of str): the name string of the variables to keep as columns

    Returns:
        df (Pandas DataFrame): one row per selected timeseries, with NaN for any missing variable
    '''
    flag = 'paleoData_useInGlobalTemperatureAnalysis'

    # build all the rows first and create the dataframe once; this avoids cell-by-cell assignment,
    # which is slow and can fail when the value is a list (e.g., 'year' or 'paleoData_values')
    records = [
        {name: ts.get(name, np.nan) for name in columns}
        for ts in ts_list
        if ts.get(flag) == 'TRUE'
    ]

    # keep the object dtype so that the values are stored exactly as they appear in the LiPD files
    return pd.DataFrame(records, columns=columns, dtype=object)


def lipd2df(lipd_dirpath, pkl_filepath, col_str=[
            'paleoData_pages2kID',
            'dataSetName', 'archiveType',
            'geo_meanElev', 'geo_meanLat', 'geo_meanLon',
            'year', 'yearUnits',
            'paleoData_variableName',
            'paleoData_units',
            'paleoData_values',
            'paleoData_proxy']):

    ''' Convert a bunch of PAGES2k LiPD files to a pickle file of Pandas DataFrame to boost data loading

    Only the timeseries whose 'paleoData_useInGlobalTemperatureAnalysis' field equals 'TRUE' are kept.
    A variable listed in `col_str` that is missing from a timeseries is stored as NaN.

    Args:
        lipd_dirpath (str): the path of the PAGES2k LiPD files
        pkl_filepath (str): the path of the converted pickle file
        col_str (list of str): the name string of the variables to extract from the LiPD files

    Returns:
        df (Pandas DataFrame): the converted Pandas DataFrame
    '''

    # work on a copy so that the shared default list can never be modified by accident
    columns = list(col_str)

    # save the current working directory for later use, as the LiPD utility will change it in the background
    work_dir = os.getcwd()

    # LiPD utility requires the absolute path, so let's get it;
    # the output path is resolved here as well, while the working directory is still the original one
    lipd_dirpath = os.path.abspath(lipd_dirpath)
    save_path = os.path.abspath(pkl_filepath)

    try:
        # load LiPD files from the given directory
        lipds = lipd.readLipd(lipd_dirpath)

        # extract timeseries from the list of LiPD objects
        ts_list = lipd.extractTs(lipds)
    finally:
        # recover the working directory, even if loading the LiPD files failed
        os.chdir(work_dir)

    # pick the timeseries for global temperature analysis and drop the rows with all NaNs
    df = _timeseries_to_frame(ts_list, columns).dropna(how='all')

    # save the dataframe to a pickle file for later use
    print(f'Saving pickle file at: {save_path}')
    df.to_pickle(save_path)

    return df

In [3]:
# convert the PAGES2k LiPD collection and save the resulting dataframe as a pickle file
df = lipd2df(
    lipd_dirpath='./data/pages2k_lipd',
    pkl_filepath='./data/pages2k_dataset.pkl',
)

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 648 LiPD file(s)
reading: NAm-Bennington.Luckman.2001.lpd
reading: Asi-SouthChina.Wang.1998.lpd
reading: NAm-RedMountainPassSilverton.Graybill.1994.lpd
reading: Asi-BURGPW.PAGES2k.2013.lpd
reading: Asi-NEPA029.Krusic.2013.lpd
reading: Eur-Seebergsee.Larocque-Tobler.2012.lpd
reading: Asi-PAKI027.Cook.2013.lpd
reading: NAm-BlanchardRiver.Luckman.2013.lpd
reading: Arc-LakeE.DAndrea.2011.lpd
reading: Arc-Laanila.Lindholm.2013.lpd
reading: NAm-WrangellsComposite.DArrigo.2006.lpd
reading: NAm-Pintlers.Littell.2011.lpd
reading: Ocn-GingerbreadsBahamas.Saenger.2009.lpd
reading: Asi-MiddleYangtzeRiver.Zhang.1980.lpd
reading: Asi-NEPA015.Krusic.2013.lpd
reading: Ant-TaylorDome.Steig.2000.lpd
reading: Asi-HEBQIN.PAGES2k.2013.lpd
reading: NAm-Sugarloaf.Kenigsberg.2013.lpd
reading: NAm-ShermanCreekPass.Briffa.1996.lpd
reading: NAm-CanyonCreek.Lloyd.2002.lpd
reading: Arc-ScreamingLynxLake.Clegg.2011.lpd
reading: NAm-Vi

In [4]:
# now we can take a look at the first rows of the dataframe
# (a preview is enough; displaying the whole dataframe only bloats the notebook)
df.head(10)

Unnamed: 0,paleoData_pages2kID,dataSetName,archiveType,geo_meanElev,geo_meanLat,geo_meanLon,year,yearUnits,paleoData_variableName,paleoData_units,paleoData_values,paleoData_proxy
0,NAm_153,NAm-Bennington.Luckman.2001,tree,1850,52.7,-118.3,"[1563.0, 1564.0, 1565.0, 1566.0, 1567.0, 1568....",AD,trsgi,,"[0.883, 0.942, 1.043, 0.945, 1.141, 0.715, 0.6...",TRW
1,Asi_245,Asi-SouthChina.Wang.1998,documents,2200,23,114,"[1500.0, 1510.0, 1520.0, 1530.0, 1540.0, 1550....",AD,temperature,index,"[0.157429639092533, -0.118646665464906, 0.3035...",historic
2,NAm_165,NAm-RedMountainPassSilverton.Graybill.1994,tree,3400,37.9,-107.7,"[1626.0, 1627.0, 1628.0, 1629.0, 1630.0, 1631....",AD,MXD,,"[1.071, 1.014, 1.014, 0.966, 1.005, 1.072, 1.0...",MXD
3,Asi_178,Asi-BURGPW.PAGES2k.2013,tree,2500,28.77,83.73,"[1303.0, 1304.0, 1305.0, 1306.0, 1307.0, 1308....",AD,trsgi,,"[0.889, 0.788, 0.836, 0.741, 0.613, 0.732, 0.7...",TRW
4,Asi_174,Asi-NEPA029.Krusic.2013,tree,4000,28.18,85.43,"[1559.0, 1560.0, 1561.0, 1562.0, 1563.0, 1564....",AD,trsgi,,"[1.172, 1.21, 1.145, 1.307, 1.368, 1.37, 1.152...",TRW
5,Eur_016,Eur-Seebergsee.Larocque-Tobler.2012,lake sediment,1830,46.15,7.5,"[2001.0, 2000.0, 1999.0, 1998.0, 1997.0, 1996....",AD,temperature,degC,"[0.87088, 0.90377, 0.94562, 0.93462, 0.91326, ...",midge
6,Asi_198,Asi-PAKI027.Cook.2013,tree,2670,35.35,71.93,"[1511.0, 1512.0, 1513.0, 1514.0, 1515.0, 1516....",AD,trsgi,,"[1.076, 1.556, 1.117, 1.522, 1.355, 1.344, 1.3...",TRW
7,NAm_145,NAm-BlanchardRiver.Luckman.2013,tree,931,59.9,-136.8,"[1670.0, 1671.0, 1672.0, 1673.0, 1674.0, 1675....",AD,trsgi,,"[1.117, 1.229, 1.221, 1.202, 0.801, 1.061, 1.0...",TRW
8,Arc_070,Arc-LakeE.D'Andrea.2011,lake sediment,170,67,-50.7,"[1876.20350642055, 1865.43185219727, 1854.9566...",AD,temperature,degC,"[-0.624, -0.644, -0.625, -0.625, -0.581, -0.58...",alkenone
9,Arc_071,Arc-Laanila.Lindholm.2013,tree,265,68.4917,27.3333,"[800.0, 801.0, 802.0, 803.0, 804.0, 805.0, 806...",AD,MXD,,"[-2.08, -0.99, -1.3, -2.03, -0.99, -1.39, -0.8...",MXD
