In [1]:
from pathlib import Path

import pandas as pd

from pdr_tests.utilz.new_coverage_utilz import (
    MetricLoader, pathtable_to_treeframe, tf_pathcounts
)

#### Load summary products for one or more manifests

**Note**: These summary products are precomputed snapshots built from older data; they don't update automatically when pdr-tests changes. You'll want to recreate them occasionally (TODO: add/point to instructions on how to do that) so that your metrics are up to date.

In [2]:
# Root directory for the coverage files. Edit COVERAGE_ROOT_PATH if you've
# put them somewhere other than ~/datascratch/pdr_coverage.
COVERAGE_ROOT_PATH = Path.home().joinpath("datascratch", "pdr_coverage")
# Point the loader at the "coverage_metrics" subdirectory of that root.
loader = MetricLoader(COVERAGE_ROOT_PATH.joinpath("coverage_metrics"))

In [None]:
# now you can load products of whatever type, pivot, 
# and filter you like, for one or more manifests at once.

# TODO: explain explain etc.

# options are:
# pivot = one of ("dataset_pds", "dataset_ix", "ptype", "volume")
# filt = one of ("cov", "ucov", "inc", "all")
# mtype = one of ("paths", "stats")

# don't specify 'filt' when loading stats products (mtype="stats").

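# run this if you'd like to see the names of all manifests
# with available metrics (`metrics_path` here stands for the metrics
# directory that was passed to MetricLoader above):
# ```
# measured_manifest_names(metrics_path)
# ```
# NOTE: `measured_manifest_names` is not imported in this notebook's
# import cell; import it first (TODO: confirm which module provides it).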

#### The cell below shows PDS datasets with no coverage.

The default settings for 'loader' are `pivot="dataset_pds"` and `mtype="stats"`.
This is the most useful product type for this kind of analysis.

**Note**: You can change the `== 0` to `<= 0.3` to get, for example, every dataset with 30% coverage or less. You'll see that for this particular example there is no change in the number of rows. This means that any PDS datasets we have coverage for are covered at a rate higher than 30%. Which is good! We should have very high coverage in any volume we've already added definitions for!

In [3]:
# Load the stats products for the PDS datasets hosted at IMG-USGS.
# No manifest is literally named 'img_usgs' -- the actual names are
# 'img_usgs_clementine' and so on -- but the loader accepts a partial
# name or a list of (partial) names just as well as a complete name.
statstable = loader.load("img_usgs")

# Keep only the rows whose coverage is exactly zero, then display them.
nocoverage = statstable[statstable['coverage'].eq(0)]
nocoverage

Unnamed: 0,dataset_pds,n_inc,n_cov,n_ucov,coverage,n,volume,ptype,dataset_ix,manifest
0,dawn-a-fc2-2-edr-ceres-images,251074,0,251074,0.0,423170,"[Missions/Dawn/Ceres/DWNCAFC2_1A, Missions/Daw...",,,img_usgs_dawn
1,dawn-a-fc2-2-edr-vesta-images;dawn-a-fc2-3-rdr...,61330,0,61330,0.0,61340,"[Missions/Dawn/DWNVFC2_1B, Missions/Dawn/Vesta...",,,img_usgs_dawn
0,vg1/vg2-s-iss-2/3/4/6-processed,65396,0,65396,0.0,66789,"[Missions/Voyager/VGISS_0004, Missions/Voyager...",,,img_usgs_voyager
0,mess-h-mdis-5-dem-elevation,1531,0,1531,0.0,1643,[Missions/MESSENGER/MESSDEM_1001],,,img_usgs_messenger
1,mess-e/v/h-mdis-2-edr-rawdata,291008,0,291008,0.0,291331,[Missions/MESSENGER/MSGRMDS_1001],,,img_usgs_messenger
...,...,...,...,...,...,...,...,...,...,...
3,rs_moon_vis_15;a15lcl-l-vis-6-rock-sample-images,17884,0,17884,0.0,44738,[Missions/Apollo/Lunar_Sample_Photographs/A15V...,,,img_usgs_apollo
4,a17lcl-l-vis-6-rock-sample-images,11426,0,11426,0.0,28924,[Missions/Apollo/Lunar_Sample_Photographs/A17V...,,,img_usgs_apollo
5,a15c-l-mc-2-scanned-images,9768,0,9768,0.0,13422,"[Missions/Apollo/Metric_Camera/A15MC_0001, Mis...",,,img_usgs_apollo
6,a16c-l-mc-2-scanned-images,4994,0,4994,0.0,6815,[Missions/Apollo/Metric_Camera/A16MC_0001],,,img_usgs_apollo


In [4]:
# In dataset_pds stats products each row's 'volume' entry is a list of
# volumes (partial URLs at the respective node), which is hard to read.
# Exploding the column gives one volume per row; taking the unique values
# then yields a flat array of every volume touched by an uncovered dataset.
exploded_volumes = nocoverage['volume'].explode()
vols_for_uncov_datasets = exploded_volumes.unique()

# Show just the first ten.
vols_for_uncov_datasets[:10]

array(['Missions/Dawn/Ceres/DWNCAFC2_1A',
       'Missions/Dawn/Ceres/DWNCHFC2_1A',
       'Missions/Dawn/Ceres/DWNCLFC2_1A',
       'Missions/Dawn/Ceres/DWNCSFC2_1A', 'Missions/Dawn/DWNCAFC2_1A',
       'Missions/Dawn/DWNCHFC2_1A', 'Missions/Dawn/DWNCLFC2_1A',
       'Missions/Dawn/DWNCSFC2_1A', 'Missions/Dawn/DWNVFC2_1B',
       'Missions/Dawn/Vesta/DWNVFC2_1B'], dtype=object)

In [5]:
# Some PDS volumes are really, really large and hold many different
# datasets and product types, so the volume list alone is not enough.
# The paths tables let us drill down further.

# Load the 'ucov' (uncovered) paths products for IMG-USGS.
# Expect a big table!
img_ucov_paths = loader.load(
    "img_usgs",
    mtype="paths",
    filt="ucov",
)
img_ucov_paths

Unnamed: 0,field,name,path,extension,inc,n,manifest
0,dataset_pds,vg1/vg2-s-iss-2/3/4/6-processed,Missions/Voyager/VGISS_0004/DIONE,IMG,56,56,img_usgs_voyager
1,dataset_pds,vg1/vg2-s-iss-2/3/4/6-processed,Missions/Voyager/VGISS_0004/DIONE,LBL,56,56,img_usgs_voyager
2,dataset_pds,vg1/vg2-s-iss-2/3/4/6-processed,Missions/Voyager/VGISS_0004/ENCELADU,IMG,31,31,img_usgs_voyager
3,dataset_pds,vg1/vg2-s-iss-2/3/4/6-processed,Missions/Voyager/VGISS_0004/ENCELADU,LBL,31,31,img_usgs_voyager
4,dataset_pds,vg1/vg2-s-iss-2/3/4/6-processed,Missions/Voyager/VGISS_0004/EPIMETHE,IMG,10,10,img_usgs_voyager
...,...,...,...,...,...,...,...
4817,dataset_pds,a17c-l-mc-2-scanned-images,Missions/Apollo/Metric_Camera/A17MC_0001/DATA/...,tif,145,145,img_usgs_apollo
4818,dataset_pds,a17c-l-mc-2-scanned-images,Missions/Apollo/Metric_Camera/A17MC_0001/DATA/...,lbl,148,148,img_usgs_apollo
4819,dataset_pds,a17c-l-mc-2-scanned-images,Missions/Apollo/Metric_Camera/A17MC_0001/DATA/...,tif,148,148,img_usgs_apollo
4820,dataset_pds,a17c-l-mc-2-scanned-images,Missions/Apollo/Metric_Camera/A17MC_0001/DATA/...,lbl,107,107,img_usgs_apollo


In [45]:
# then, if you'd like to drill down to a specific dataset...

# Pick the dataset to inspect. We take the first uncovered dataset rather
# than an unseeded random sample so that Restart & Run All always produces
# the same selection (and the same downstream tables). To explore a
# different one, assign any other value from nocoverage['dataset_pds'],
# or use .sample(random_state=<seed>).iloc[0] for a reproducible random pick.
uncovered_set = nocoverage['dataset_pds'].iloc[0]
print(uncovered_set)

# the 'name' field of a paths table always contains the pivot value,
# so filtering on it keeps only the paths belonging to the chosen dataset
set_paths = img_ucov_paths.loc[img_ucov_paths['name'] == uncovered_set]
set_paths

dawn-a-fc2-2-edr-ceres-images


Unnamed: 0,field,name,path,extension,inc,n,manifest
0,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/Ceres/DWNCAFC2_1A/DATA/FITS/2014...,FIT,40,40,img_usgs_dawn
1,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/Ceres/DWNCAFC2_1A/DATA/FITS/2014...,LBL,40,40,img_usgs_dawn
2,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/Ceres/DWNCAFC2_1A/DATA/FITS/2014...,FIT,40,40,img_usgs_dawn
3,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/Ceres/DWNCAFC2_1A/DATA/FITS/2014...,LBL,40,40,img_usgs_dawn
4,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/Ceres/DWNCAFC2_1A/DATA/FITS/2014...,FIT,220,220,img_usgs_dawn
...,...,...,...,...,...,...,...
851,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/DWNCSFC2_1A/DATA/IMG/20150604_SU...,IMG,436,436,img_usgs_dawn
852,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/DWNCSFC2_1A/DATA/IMG/20150604_SU...,IMG,442,442,img_usgs_dawn
853,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/DWNCSFC2_1A/DATA/IMG/20150604_SU...,IMG,482,482,img_usgs_dawn
854,dataset_pds,dawn-a-fc2-2-edr-ceres-images,Missions/Dawn/DWNCSFC2_1A/GEOMETRY,lbl,1,1,img_usgs_dawn


In [46]:
# This table is a starting point for finding paths to explore -- e.g. to
# spot things we should be excluding but aren't (or should be including
# but aren't).

# For example, here is a summary of the file extensions:
set_paths.loc[:, 'extension'].value_counts()

extension
FIT    280
LBL    280
IMG    280
lbl      8
mk       8
Name: count, dtype: int64

In [47]:
# pdr-tests also includes convenience functions to help you slice up a
# paths table and get a look at the directory structure.
# NOTE(review): judging by the displayed output, the treeframe appears to
# hold one integer-labeled column per path component (0, 1, 2, ...) --
# confirm against pathtable_to_treeframe.
tf = pathtable_to_treeframe(set_paths)
# Displayed as the cell result: what looks like a count for each directory
# path (presumably rows per directory -- TODO confirm).
tf_pathcounts(tf)

0         1     2      3            4     5     6              7              
Missions  Dawn  Ceres  DWNCHFC2_1A  DATA  FITS  20150816_HAMO  20150818_CYCLE1    24
                                                               20150920_CYCLE4    24
                                                               20151001_CYCLE5    24
                                                               20151012_CYCLE6    24
                                                               20150829_CYCLE2    24
                                                               20150909_CYCLE3    14
                                          IMG   20150816_HAMO  20150818_CYCLE1    12
                                                               20150829_CYCLE2    12
                                                               20150920_CYCLE4    12
                                                               20151001_CYCLE5    12
                                                               20151012

In [48]:
# Count treeframe rows for each combination of column 2 (the third path
# component in the output above) and file extension.
tf.loc[:, [2, 'extension']].value_counts()

2            extension
Ceres        FIT          140
             LBL          140
             IMG          140
DWNCHFC2_1A  FIT           67
             IMG           67
             LBL           67
DWNCLFC2_1A  FIT           41
             LBL           41
             IMG           41
DWNCAFC2_1A  LBL           23
             IMG           23
             FIT           23
DWNCSFC2_1A  FIT            9
             IMG            9
             LBL            9
Ceres        mk             4
             lbl            4
DWNCHFC2_1A  lbl            1
             mk             1
DWNCAFC2_1A  mk             1
             lbl            1
DWNCLFC2_1A  lbl            1
             mk             1
DWNCSFC2_1A  lbl            1
             mk             1
Name: count, dtype: int64