In [1]:
from pdr_tests.utilz.coverage_utilz import load_and_count

#### Let's load in a coverage manifest.

**Note**: Coverage manifests are snapshots built from the data that existed when they were created; they do not update automatically when pdr-tests changes. Recreate them occasionally so your metrics stay current (TODO: add/point to instructions on how to do that).

In [2]:
# Relative path to the coverage manifest to analyze (a parquet file).
manifest = "node_manifests/coverage_manifests/atm_coverage.parquet"

# load_and_count prints progress while it runs and returns a dict of tables
# keyed by metric name (e.g. metrics["volume"]).
metrics = load_and_count(manifest)

making part table for directory exclusion...1239141 files excluded
finding coverage
making countframes
counting dataset_ix
counting ptype
counting volume
counting dataset_pds


#### The cell below lists the volumes with no coverage; the `volume` column holds each volume's partial URL.

**Note**: You can change the `==0.0` to `<=0.3` to get, for example, every volume with at most 30% coverage. You'll see that for this particular example the number of rows does not change, which means every volume we have any coverage for is covered at a rate higher than 30%. That's good! We should have very high coverage in any volume we've already added definitions for.

In [3]:
# Keep only the volumes whose coverage value is exactly zero.
volume_metrics = metrics["volume"]
has_no_coverage = volume_metrics["coverage"] == 0.0
no_coverage_df = volume_metrics[has_no_coverage]
no_coverage_df

Unnamed: 0,volume,count,labels,datasets_ix,ptypes,n_datasets_ix,n_ptypes,n_covered,n_uncovered,coverage
0,PDS/data/MROA_0001,2591,[],[],[],0,0,0,2567,0.0
1,PDS/data/MROM_0001,109,[],[],[],0,0,0,45,0.0
2,PDS/data/MROM_0002,399,[],[],[],0,0,0,190,0.0
3,PDS/data/MROM_0003,387,[],[],[],0,0,0,184,0.0
4,PDS/data/MROM_0004,399,[],[],[],0,0,0,190,0.0
...,...,...,...,...,...,...,...,...,...,...
1757,PDS/data/vg_2499,115,[],[],[],0,0,0,87,0.0
1758,PDS/data/vl_1001,44,"[,pds4_label]",[],[],0,0,0,9,0.0
1759,PDS/data/vl_1002,31,"[,pds4_label]",[],[],0,0,0,6,0.0
1760,PDS/data/vo_3001,15,[],[],[],0,0,0,2,0.0


#### Let's prioritize datasets without any indication they have pds4 coverage. 

**Note**: We shouldn't completely ignore the volumes we are filtering out. They might not have full pds4 coverage, or, as with some of the `juno` data, the pds4 version might have issues that are not present in the pds3 labels. Still, pds3-only data should take priority.

In [4]:
# Stringify each row's labels so one substring test covers every entry,
# then drop the rows whose labels mention pds4.
mentions_pds4 = no_coverage_df["labels"].astype(str).str.contains("pds4")
no_coverage_no_pds4_df = no_coverage_df[~mentions_pds4]
no_coverage_no_pds4_df

Unnamed: 0,volume,count,labels,datasets_ix,ptypes,n_datasets_ix,n_ptypes,n_covered,n_uncovered,coverage
0,PDS/data/MROA_0001,2591,[],[],[],0,0,0,2567,0.0
1,PDS/data/MROM_0001,109,[],[],[],0,0,0,45,0.0
2,PDS/data/MROM_0002,399,[],[],[],0,0,0,190,0.0
3,PDS/data/MROM_0003,387,[],[],[],0,0,0,184,0.0
4,PDS/data/MROM_0004,399,[],[],[],0,0,0,190,0.0
...,...,...,...,...,...,...,...,...,...,...
1755,PDS/data/vg_2215,68,[],[],[],0,0,0,28,0.0
1756,PDS/data/vg_2216,84,[],[],[],0,0,0,36,0.0
1757,PDS/data/vg_2499,115,[],[],[],0,0,0,87,0.0
1760,PDS/data/vo_3001,15,[],[],[],0,0,0,2,0.0


#### Let's sort this by the count of data files in the volume so we can target higher data volumes first.

Now you can get the full URL: find the pdrdev spider that corresponds to this coverage manifest (the spiders are here: https://github.com/MillionConcepts/pdrdev/tree/main/pdrtestsuite/spiders), then combine the value in the volume column with the spider's `start_url`. That takes you right to the volume that will make the most impact on coverage for this manifest!

In [15]:
# Largest volumes first; ties on count are broken by n_uncovered, also descending.
sort_columns = ["count", "n_uncovered"]
sorted_df = no_coverage_no_pds4_df.sort_values(by=sort_columns, ascending=False)
sorted_df

Unnamed: 0,volume,count,labels,datasets_ix,ptypes,n_datasets_ix,n_ptypes,n_covered,n_uncovered,coverage
1718,PDS/data/vcolir_1001,82363,[],[],[],0,0,0,41166,0.0
1715,PDS/data/vcolir_0001,44923,[],[],[],0,0,0,22446,0.0
1677,PDS/data/mors_1101,42508,[],[],[],0,0,0,21246,0.0
1540,PDS/data/jnojir_1003,31523,[],[],[],0,0,0,15754,0.0
1571,PDS/data/jnojir_2003,31231,[],[],[],0,0,0,15608,0.0
...,...,...,...,...,...,...,...,...,...,...
1607,PDS/data/merimu_1001,22,[],[],[],0,0,0,5,0.0
1739,PDS/data/vg_2101,20,[],[],[],0,0,0,4,0.0
1630,PDS/data/mg_2401,19,[],[],[],0,0,0,2,0.0
1761,PDS/data/vo_3002,16,[],[],[],0,0,0,2,0.0


If you want to be able to explore different rows of this table (so you can look at more than the top and bottom 5 rows), you can use the following syntax:

In [16]:
sorted_df.iloc[0:10]  # Shows the first ten rows by position; change the bounds (e.g. 10:20) to page through the table

Unnamed: 0,volume,count,labels,datasets_ix,ptypes,n_datasets_ix,n_ptypes,n_covered,n_uncovered,coverage
1718,PDS/data/vcolir_1001,82363,[],[],[],0,0,0,41166,0.0
1715,PDS/data/vcolir_0001,44923,[],[],[],0,0,0,22446,0.0
1677,PDS/data/mors_1101,42508,[],[],[],0,0,0,21246,0.0
1540,PDS/data/jnojir_1003,31523,[],[],[],0,0,0,15754,0.0
1571,PDS/data/jnojir_2003,31231,[],[],[],0,0,0,15608,0.0
1539,PDS/data/jnojir_1002,30673,[],[],[],0,0,0,15328,0.0
1570,PDS/data/jnojir_2002,29577,[],[],[],0,0,0,14780,0.0
1720,PDS/data/vcolir_1003,29099,[],[],[],0,0,0,14534,0.0
1683,PDS/data/mslrem_1001,24739,[],[],[],0,0,0,12345,0.0
1728,PDS/data/vcouvi_0001,22371,[],[],[],0,0,0,11170,0.0


#### Now that we've gone over how to use the coverage manifests to prioritize datasets, let's look at the other information in the `metrics` that `load_and_count` outputs.

In [7]:
# metrics is a dict; its keys name the tables explored in the rest of this notebook.
metrics.keys()

dict_keys(['dataset_ix', 'ptype', 'volume', 'dataset_pds', 'cov', 'ucov', 'dir_ignore'])

`metrics['dataset_ix']` will output a dataframe of the ix datasets that have been identified as relevant to this manifest. This is a great way to know if your coverage manifest is out of date. Are there datasets that you've added that _should_ be relevant to this manifest but aren't represented here?

In [8]:
metrics["dataset_ix"]  # one row per ix dataset matched to this manifest

Unnamed: 0,dataset_ix,count,ptypes,volumes,datasets_pds,n_dataset,n_volume,total_mb,mean_mb,labels
0,juno_jiram,273501,"[IMG_RDR, LOG_IMG_RDR, LOG_SPE_RDR, SPE_RDR, I...",[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,31562.42,0.115401,"[,pds4_name,pds4_label]"
1,cassini_uvis,201161,"[euv, fuv, hdac]","[PDS/data/couvis_0001, PDS/data/couvis_0002, P...","[CO-J-UVIS-2-CUBE, CO-X-UVIS-2-CUBE, CO-S-UVIS...",4,60,434178.5,2.158363,[]
2,juno_gs,288,"[EDR_ODF, EDR_TNF, EDR_RSR]","[PDS/data/jnogrv_0001, PDS/data/jnogrv_1001]",[JNO-RSS-1],1,2,50326.17,174.743645,"[,pds4_label]"
3,juno_mwr,418920,"[EDR, GRDR, IRDR]","[PDS/data/jnomwr_0000, PDS/data/jnomwr_0000V1,...","[JNO-MWR-2, , JNO-J-MWR-2-EDR, JNO-X-MWR-3-RDR]",4,6,1446771.0,3.453573,"[, ,excluded, ,pds4_label]"


`metrics['ptype']` outputs a dataframe that matches the product types as defined in ix with the various pds datasets and volumes.

In [9]:
metrics["ptype"]  # one row per product type, labelled "<dataset_ix>;<ptype>"

Unnamed: 0,ptype,count,volumes,datasets_pds,n_dataset,n_volume,total_mb,mean_mb,labels
0,juno_jiram;IMG_RDR,34395,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,9818.296875,0.285457,"[,pds4_name,pds4_label]"
1,juno_jiram;LOG_IMG_RDR,34398,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,2.361923,6.9e-05,"[,pds4_name,pds4_label]"
2,juno_jiram;LOG_SPE_RDR,33977,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,2.333015,6.9e-05,"[,pds4_name,pds4_label]"
3,juno_jiram;SPE_RDR,33977,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,11148.703125,0.328125,"[,pds4_name,pds4_label]"
4,juno_jiram;IMG_EDR,34839,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,5083.699219,0.14592,"[,pds4_name,pds4_label]"
5,juno_jiram;LOG_IMG_EDR,34839,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,2.392204,6.9e-05,"[,pds4_name,pds4_label]"
6,juno_jiram;LOG_SPE_EDR,33538,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,2.302872,6.9e-05,"[,pds4_name,pds4_label]"
7,juno_jiram;SPE_EDR,33538,[PDS/data/PDS4/juno_jiram_bundle],[JNO-JIRAM-3],1,1,5502.328125,0.164062,"[,pds4_name,pds4_label]"
8,cassini_uvis;euv,79504,"[PDS/data/couvis_0001, PDS/data/couvis_0002, P...","[CO-J-UVIS-2-CUBE, CO-X-UVIS-2-CUBE, CO-S-UVIS...",4,60,220836.019531,2.777672,[]
9,cassini_uvis;fuv,83348,"[PDS/data/couvis_0001, PDS/data/couvis_0002, P...","[CO-J-UVIS-2-CUBE, CO-X-UVIS-2-CUBE, CO-S-UVIS...",4,60,212824.701172,2.553447,[]


`metrics['dataset_pds']` breaks down the data by dataset (as defined in the pds) rather than by volume url. If you prefer this view, feel free to reuse the code above, substituting 'dataset_pds' for 'volume'.

In [10]:
metrics["dataset_pds"]  # same columns as metrics["volume"], but keyed by PDS dataset

Unnamed: 0,dataset_pds,count,labels,datasets_ix,ptypes,n_datasets_ix,n_ptypes,n_covered,n_uncovered,coverage
0,MRO-M-ACCEL-2-ACCELDATA,7767,"[, ,pds4_name,pds4_label]",[],[],0,0,0,5140,0.000000
1,MRO-M-MCS-2-EDR,63890,[],[],[],0,0,0,30330,0.000000
2,MRO-M-MCS-4-RDR,66962,[],[],[],0,0,0,31696,0.000000
3,MRO-M-MCS-5-DDR,63482,[],[],[],0,0,0,29956,0.000000
4,JNO-JIRAM-3,1096105,"[,pds4_name,pds4_label, ]",[juno_jiram],"[IMG_RDR, LOG_IMG_RDR, LOG_SPE_RDR, SPE_RDR, I...",1,8,273501,137473,0.665495
...,...,...,...,...,...,...,...,...,...,...
103,VG2-NSA-RSS-5-ROCC,115,[],[],[],0,0,0,87,0.000000
104,"('VL1/VL2-M-MET-4-BINNED-P-T-V', 'VL1/VL2-M-ME...",44,"[,pds4_label]",[],[],0,0,0,9,0.000000
105,VL1/VL2 MARS METEOROLOGY CALIBRATEDFOOTPAD TEMP,31,"[,pds4_label]",[],[],0,0,0,6,0.000000
106,VO1/VO2 MARS ATMOSPHERIC WATERDETECTOR 4,15,[],[],[],0,0,0,2,0.000000


The metrics below are less reliable, less defined (notice the lack of column names), and based on splitting urls, etc. You might find them useful as a way to explore the data using dataframe indexing. But in general, I'd recommend sticking to the keys above for the majority of your work.

In [11]:
metrics["ucov"]  # presumably the uncovered files: URL path parts in columns 0-11, plus `ext` and `count`

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,ext,count
0,PDS,data,PDS4,LADEE,uvs_bundle,data_calibrated,,,,,,,TAB,1038103
1,PDS,data,PDS4,LADEE,uvs_bundle,data_raw,,,,,,,TAB,1038103
2,PDS,data,PDS4,MAVEN,iuvs_raw_bundle,l1a,disk,2020,,,,,fits,83763
3,PDS,data,PDS4,MAVEN,iuvs_raw_bundle,l1a,limb,2020,,,,,fits,80718
4,PDS,data,PDS4,MAVEN,iuvs_calibrated_bundle,l1b,disk,2020,,,,,fits,75952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29702,PDS,data,cors_0360,SCC9_ANCILLARY,,,,,,,,,TRO,1
29703,PDS,data,cors_0360,SCC9_281,,,,,,,,,158,1
29704,PDS,data,cors_0360,SCC9_281,,,,,,,,,2A1,1
29705,PDS,data,cors_0360,SCC9_281,,,,,,,,,2B1,1


In [12]:
metrics["cov"]  # presumably the covered files: URL path parts in columns 0-11, plus `ext` and `count`

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,ext,count
0,PDS,data,PDS4,juno_jiram_bundle,data_raw,,,,,,,,TAB,68377
1,PDS,data,PDS4,juno_jiram_bundle,data_calibrated,,,,,,,,TAB,68375
2,PDS,data,PDS4,juno_jiram_bundle,data_raw,,,,,,,,IMG,34839
3,PDS,data,PDS4,juno_jiram_bundle,data_calibrated,,,,,,,,IMG,34395
4,PDS,data,PDS4,juno_jiram_bundle,data_calibrated,,,,,,,,DAT,33977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,PDS,data,jnogrv_0001,DATA,,,,,,,,,ODF,21
137,PDS,data,jnogrv_0001D2,DATA,,,,,,,,,ODF,21
138,PDS,data,jnogrv_0001,DATA,,,,,,,,,TNF,16
139,PDS,data,jnogrv_0001D2,DATA,,,,,,,,,TNF,16


These are files/directories that are ignored based on prior rules we've set up. Directories that contain things like documentation and browse products are not something we want to target for `pdr` support.

In [13]:
metrics["dir_ignore"]  # ignored files/directories, laid out as path parts plus `ext` and `count`

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,ext,count
0,PDS,data,vcolir_0001,browse,,,,,,,,,lbl,22763
1,PDS,data,vcolir_0001,browse,,,,,,,,,jpg,22443
2,PDS,data,vcolir_1001,browse,l2b,,,,,,,,lbl,20652
3,PDS,data,vcolir_1001,browse,l2c,,,,,,,,lbl,20652
4,PDS,data,vcolir_1001,browse,l2b,,,,,,,,jpg,20572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4672,PDS,data,cocirs_0401,DOCUMENT,,,,,,,,,LBL,1
4673,PDS,data,cocirs_0401,DOCUMENT,,,,,,,,,PDF,1
4674,PDS,data,cocirs_0401,DOCUMENT,,,,,,,,,TEX,1
4675,PDS,data,vcouvi_1003,browse,,,,,,,,,html,1
