This example uses the user_assessments HDF5 file from RandomDataset. The user assessments file contains a users table and an assessments table that imitate the data structure of the CSS (Covid Symptom Study) project.

In [None]:
!ls *hdf5

In [None]:
from exetera.core.session import Session
# A bare Session() (instead of the recommended `with` block) is used here so
# that the source dataset remains usable by the other cells of this example.
s = Session()
src = s.open_dataset('user_assessments.hdf5', 'r', 'src')
print(src.keys())

users = src['users']
print('Columns in users table:', users.keys())

# describe() reports on the values of each requested column
user_columns = ['bmi', 'has_diabetes', 'height_cm', 'year_of_birth']
users.describe(include=user_columns)

In [None]:
# Fetch the assessments table from the source dataset and list its columns.
asmts = src['assessments']
# The label names the table actually being printed (assessments, not users).
print('Columns in assessments table:', asmts.keys())
# describe() reports on the values of each requested assessment column
asmts.describe(include=['abdominal_pain', 'brain_fog', 'date', 'loss_of_smell', 'temperature_f'])

<h3>Filtering</h3>
Filtering is performed through the use of the apply_filter function. This can be performed on <b>individual fields</b> or at a <b>dataframe level</b>. apply_filter applies the filter on data rows.



In [None]:
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')

    # Build a boolean row mask from the year of birth, then apply it to the
    # whole dataframe. Passing ddf writes the filtered rows to df, so the
    # users dataframe itself is left untouched.
    ages = 2022 - users['year_of_birth'].data[:]
    filt = ages > 18
    users.apply_filter(filt, ddf=df)

    adult_count = len(df['id'])
    subject_count = len(users['id'])
    print(adult_count, ' adults out of ', subject_count, ' total subjects found.')

In [None]:
# Combining filters
# Two boolean masks are combined with `&` into a single row filter.
# NOTE(review): an earlier comment here recommended operating on the fields
# directly, but this cell reads the underlying numpy arrays via `.data[:]` -
# confirm which form is intended.

filt = ((2022 - users['year_of_birth'].data[:]) > 18) & (users['has_diabetes'].data[:] == False)
print(filt)

# use the combined mask to select entries from the 'id' field's data
print(users['id'].data[filt])

<h3>Performance boost using numba</h3>
As the underlying data is fetched as a numpy array, you can utilize numba @njit functions to accelerate data processing. For example, when summing up symptoms, using a separate function with the @njit decorator can speed up performance.

In [None]:
import numpy as np
import time

# Sum up the symptoms without njit and print the elapsed wall-clock seconds.
# NOTE(review): an int32 array of test_length elements is about 4 GB, and the
# loop allocates another one on every iteration - confirm the machine running
# this cell has enough memory.
test_length = 1000000000  # here we use a test length rather than the 50 rows in the dataset,
                            # as the difference comes with more rows
symptoms = ['abdominal_pain', 'brain_fog',  'loss_of_smell']
t0 = time.time()
sum_symp = np.zeros(test_length, 'int32')
for i in symptoms:
    # a zero-filled array of test_length stands in for each symptom column
    sum_symp += np.zeros(test_length, 'int32')
#print(sum_symp)
print(time.time()-t0)

In [None]:
#sum up the symptoms with njit
from numba import njit

@njit
def sum_symptom(symp_data, sum_data):
    """Accumulate ``symp_data`` into ``sum_data`` with ``+=`` and return ``sum_data``."""
    sum_data += symp_data
    return sum_data

# Warm-up call: numba compiles sum_symptom the first time it is called, and
# that one-off compilation cost should not be counted in the timing below.
sum_symptom(np.zeros(1, 'int32'), np.zeros(1, 'int32'))

t0 = time.time()
sum_symp = np.zeros(test_length, 'int32')
for i in symptoms:
    # Accumulate through the njit-compiled function so that this timing
    # really measures the numba path (a zero-filled array of test_length
    # stands in for each symptom column, as in the non-njit cell).
    sum_symp = sum_symptom(np.zeros(test_length, 'int32'), sum_symp)
#print(sum_symp)
print(time.time()-t0)  # elapsed seconds for the njit version

<h3>Groupby</h3>

In [None]:
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    # number of rows in the source assessments table, printed next to each result
    asmt_rows = len(asmts['user_id'])

    # drop duplicates on user_id
    df = dst.create_dataframe('df')
    asmts.drop_duplicates(by='user_id', ddf=df)
    print(len(df['user_id']), asmt_rows)

    # count per user_id
    df2 = dst.create_dataframe('df2')
    asmts.groupby(by='user_id').count(ddf=df2)
    print(len(df2['user_id']), asmt_rows)

    # max of 'date' per user_id
    df3 = dst.create_dataframe('df3')
    asmts.groupby(by='user_id').max(target='date', ddf=df3)
    print(len(df3['user_id']), asmt_rows)

    # min of 'date' per user_id
    df4 = dst.create_dataframe('df4')
    asmts.groupby(by='user_id').min(target='date', ddf=df4)
    print(len(df4['user_id']), asmt_rows)

    # first 'date' per user_id
    df5 = dst.create_dataframe('df5')
    asmts.groupby(by='user_id').first(target='date', ddf=df5)

    # last 'date' per user_id
    df6 = dst.create_dataframe('df6')
    asmts.groupby(by='user_id').last(target='date', ddf=df6)

In [None]:
# Transform rather than group by: compute, for every assessment row, the
# maximum symptom total seen for that row's user.
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')

    symptoms = ['abdominal_pain', 'brain_fog', 'loss_of_smell']
    row_count = len(asmts['user_id'])

    # Per-row symptom total. Accumulate the actual symptom columns; adding
    # zero-filled arrays would leave every total (and every maximum) at 0.
    # Assumes the symptom fields hold integer/boolean flags - TODO confirm.
    sum_symp = np.zeros(row_count, 'int32')
    for name in symptoms:
        sum_symp += asmts[name].data[:]

    spans = asmts['user_id'].get_spans()  # make sure asmts dataframe is sorted based on user_id
    max_symp = np.zeros(row_count, 'int32')
    # Consecutive span boundaries delimit the rows of one user_id.
    for start, end in zip(spans[:-1], spans[1:]):
        # sum_symp is a numpy array, so slice it directly (not its .data buffer)
        max_symp[start:end] = np.max(sum_symp[start:end])

    # write data back to df
    df.create_numeric('max_symp', 'int32').data.write(max_symp)
    print(len(df['max_symp'].data))  # note the field length is the same with transform
    

<h3>Join</h3>
ExeTera provides functions that provide pandas-like merge functionality on DataFrame instances. We have made this operation as familiar as possible to Pandas users, but there are a couple of differences that we should highlight:
<br>

&bull; merge is provided as a function in the dataframe module, rather than as a member function on DataFrame instances
<br>
&bull; merge takes three dataframe arguments: left, right and dest. This is because DataFrames are always backed by a datastore, so rather than creating an in-memory destination dataframe, the resulting merged fields must be written to a dataframe of your choosing.
<br>
&bull; Note, this can either be a separate dataframe or it can be the dataframe that you are merging to (typically left in the case of a "left" merge and right in the case of a "right" merge)
<br>
&bull; merge takes a number of optional hint fields that can save time when working with large datasets. These specify whether the keys are unique or ordered and allow the merge to occur without first checking this
<br>
&bull; merge has a number of highly scalable algorithms that can be used when the key data is sorted and / or unique.

In [None]:
from exetera.core.dataframe import merge
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')

    # left-merge users with assessments, writing the merged fields into df
    merge(users, asmts, df, left_on='id', right_on='user_id', how='left')

    # Both source dataframes have an 'id' field, so the merged dataframe
    # carries them with the suffixes '_l' and '_r'.
    merged_ids = df['id_l']
    print(len(merged_ids.data))
    print(df.keys())

<h3>Sort</h3>

In [None]:
from exetera.core.dataframe import merge
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')
    merge(users, asmts, df, left_on='id', right_on='user_id', how='left')

    # sort df on 'id_l', using df itself as the destination
    sort_keys = ('id_l',)
    s.sort_on(df, df, sort_keys)

In [None]:
from exetera.core.dataframe import merge
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')
    merge(users, asmts, df, left_on='id', right_on='user_id', how='left')

    # sort df on 'id_l', writing the result to a separate dataframe df2
    df2 = dst.create_dataframe('df2')
    sort_keys = ('id_l',)
    s.sort_on(df, df2, sort_keys)

In [None]:
# sorting with an index
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')
    merge(users, asmts, df, left_on='id', right_on='user_id', how='left')

    # build the sort index once from the key field(s)
    key_fields = (df['id_l'],)
    index = s.dataset_sort_index(key_fields)

    # apply the index, writing the result to a destination dataframe
    df2 = dst.create_dataframe('df2')
    df.apply_index(index, df2)
    reordered_ids = df2['id_l'].data[:]
    print(reordered_ids)

    # apply the same index to df itself (no destination given)
    df.apply_index(index)
    print(df['id_l'].data[:])

<h3>I/O</h3>

In [None]:
with Session() as s:
    dst = s.open_dataset('temp2.hdf5', 'w', 'dst')
    df = dst.create_dataframe('df')
    merge(users, asmts, df, left_on='id', right_on='user_id', how='left')

    # write the whole merged dataframe to csv
    df.to_csv('merged.csv')

    # write only the rows picked by a row filter; the data held in df is not changed
    ages = 2022 - df['year_of_birth'].data[:]
    row_filter = ages > 18
    df.to_csv('adults.csv', row_filter)

    # write only the listed columns
    kept_columns = ['id_l', 'year_of_birth', 'date', 'tested_covid_positive']
    df.to_csv('column_filtered.csv', column_filter=kept_columns)

In [None]:
!ls *csv

In [None]:
# Close the src dataset explicitly: it was opened through a bare Session()
# rather than a `with Session() as s:` block, so nothing closes it automatically.
# The dataset is closed directly because `s` has since been rebound by the
# `with Session() as s:` cells and no longer refers to the session that opened
# src (Session.close_dataset is also documented as taking the dataset's name,
# not the dataset object).
src.close()