# Inventory Summary

This notebook gives an overview of the station inventory, created by running the latest version of the analyze_inventory.py and make_station_configuration.py scripts.

The correct identification of stations, i.e. the mapping of the stations contained in the database directories to an existing entry in the OSCAR, IGRA2, WBAN, CHUAN inventories
is the first step for the harvesting and merging procedure.

## Counting all the original files 
Here we filter and count all the files in the database directory, that will be used for the harvesting and merging procedure.

In [58]:
import os,sys
import pandas as pd 
import glob
import numpy as np
import h5py as h5

In [35]:
# location of datasets
datasets = {'era5_1': '/mnt/users/scratch/leo/scratch/era5/odbs/1' ,
            'era5_2': '/mnt/users/scratch/leo/scratch/era5/odbs/2',
            'era5_3188': '/mnt/users/scratch/leo/scratch/era5/odbs/3188',
            'era5_1759': '/mnt/users/scratch/leo/scratch/era5/odbs/1759',
            'era5_1761': '/mnt/users/scratch/leo/scratch/era5/odbs/1761',
            'bufr': '/mnt/users/scratch/leo/scratch/era5/odbs/ai_bfr/',
            'ncar': '/scratch/das/federico/databases_service2/UADB_25012022/',
            'igra2': '/scratch/das/federico/databases_service2/IGRA2_20211231/', # dummy, use the igra2 station list file 
                               }
# inventory path (TO DO fix path )

inv_path = "/users/staff/federico/GitHub/CEUAS_master_SEPTEMBER2021/CEUAS/CEUAS/meta/inventory_comparison/code/inventories/"


databases = ['era5_2', 'era5_1759', 'era5_1761', 'era5_3188', 'bufr', 'igra2', 'era5_1']

databases = ['era5_2', 'era5_1759', 'era5_1761', 'era5_3188', 'bufr', 'era5_1', 'ncar' , 'igra2']

counts = { "database":[], 
           "all_files":[], 
           "processed":[],
           "stat_conf":[],
           "harvested":[],
           "identified":[],
           "inconsistent_coord":[], 
           "unidentified":[],
           "unidentified_Id":[],
           "failed":[]}



for db in databases:

    counts['database'].append(db)
    
    if not os.path.isdir( 'inventories/' + db ):
        os.makedirs( 'inventories/' + db )

    if db == 'era5_1':
        #flist=glob.glob(datasets[db] + "/era5.conv._*")
        flist = [ str(i) for i in range(4711) ]
    elif db == 'era5_2':
        flist = glob.glob("/mnt/users/scratch/leo/scratch/era5/odbs/2/era5.conv._*")
        flist = [f for f in flist if '.gz' not in f and '.nc' not in f ]
    elif db == 'era5_1759':
        flist=glob.glob("/mnt/users/scratch/leo/scratch/era5/odbs/1759/era5.1759.conv.*")
    elif db == 'era5_1761':
        flist = glob.glob("/mnt/users/scratch/leo/scratch/era5/odbs/1761/era5.1761.conv.*")
    elif db == 'era5_3188':
        flist = glob.glob("/mnt/users/scratch/leo/scratch/era5/odbs/3188/era5.3188.conv.*")
    elif db == 'ncar':
        flist = glob.glob( datasets[db] + '/uadb*')
    elif 'bufr' in db:
        flist = glob.glob(datasets[db] + '/' + 'era5.*.bfr')
    elif db == 'igra2':
        flist = glob.glob(datasets[db] + '/' + '.txt')
        

    comb = glob.glob(datasets[db] + '/'+'*.conv.19*') # removing wrong combined files per year
    flist = [f for f in flist if f not in comb]
    flist = [f for f in flist if '.gz' not in f and '.nc' not in f and '00000' not in f and '99999' not in f and 'undef' not in f]
    counts['all_files'].append(len(flist))

    tags = ['processed']
    
    #inconsistent_coord

    # checking station_configuration stations 
    stat_conf = '/users/staff/federico/GitHub/CEUAS_master_SEPTEMBER2021/CEUAS/'
    stat_conf = stat_conf + 'CEUAS/public/harvest/data/station_configurations/' 
    stat_conf = stat_conf + db + '_station_configuration_extended.csv'
    
    df = pd.read_csv(stat_conf, sep = '\t')
    counts['stat_conf'].append(len(df))

    # checking successfully harvested stations 
    harv_dir = '/scratch/das/federico/COP2_HARVEST_FEB2022/' + db 
    files = [f for f in os.listdir(harv_dir) if '.nc' in f and '-1_' not in f ]
    counts['harvested'].append(len(df))

    
        
        
    for t in tags:
        if db == 'igra2':
            file =  [inv_path + '/logs/' + f for f in os.listdir(inv_path + '/logs/') if db+'_'+t in f ]
        elif db == 'era5_1':
            file =  [inv_path + '/logs/' + f for f in os.listdir(inv_path + '/logs/') if '1_'+t in f and '1761' not in f ]

        else:
            db = db.replace('era5_','').replace('bufr','ai_bfr')
            file = [inv_path + '/logs/' + f for f in os.listdir(inv_path + '/logs/') if db+'_'+t in f and 'igra2' not in f ]
        #print(db, file)
            
        file = file[0]
        
        lines = open(file, 'r').readlines()
        counts['processed'].append(len(lines))
        counts['identified'].append(len([l for l in lines if 'identified' in l and 'unidentified' not in l ]))
        
        counts['unidentified'].append(len([l for l in lines if 'unidentified' in l and "Id" not in l ]))
        counts['unidentified_Id'].append(len([l for l in lines if 'unidentified' in l and "Id" in l ]))
        
        counts['inconsistent_coord'].append(len([l for l in lines if 'inconsistent_coord' in l ]))
        counts['failed'].append(len([l for l in lines if 'failed' in l ]))

        
        
    

In [36]:
counts


{'database': ['era5_2',
  'era5_1759',
  'era5_1761',
  'era5_3188',
  'bufr',
  'era5_1',
  'ncar',
  'igra2'],
 'all_files': [7944, 4741, 2412, 1353, 3037, 4711, 4599, 0],
 'processed': [7944, 4741, 2412, 1353, 3037, 4710, 4599, 2689],
 'stat_conf': [7080, 4338, 2240, 1344, 2669, 4029, 4289, 2689],
 'harvested': [7080, 4338, 2240, 1344, 2669, 4029, 4289, 2689],
 'identified': [7054, 4338, 2240, 1344, 2669, 4029, 4289, 2689],
 'inconsistent_coord': [200, 63, 85, 0, 91, 398, 133, 0],
 'unidentified': [591, 294, 60, 2, 243, 251, 133, 0],
 'unidentified_Id': [60, 43, 26, 0, 34, 31, 44, 0],
 'failed': [39, 3, 1, 7, 0, 1, 0, 0]}

In [37]:
#df = pd.DataFrame(counts, columns = ["Dataset" , "All" , "Tot Files" , "Identified" , "Coord Problem" , "Unidentified",  "Failed"])


In [38]:
df = pd.DataFrame(counts)

In [39]:
df

Unnamed: 0,database,all_files,processed,stat_conf,harvested,identified,inconsistent_coord,unidentified,unidentified_Id,failed
0,era5_2,7944,7944,7080,7080,7054,200,591,60,39
1,era5_1759,4741,4741,4338,4338,4338,63,294,43,3
2,era5_1761,2412,2412,2240,2240,2240,85,60,26,1
3,era5_3188,1353,1353,1344,1344,1344,0,2,0,7
4,bufr,3037,3037,2669,2669,2669,91,243,34,0
5,era5_1,4711,4710,4029,4029,4029,398,251,31,1
6,ncar,4599,4599,4289,4289,4289,133,133,44,0
7,igra2,0,2689,2689,2689,2689,0,0,0,0


## Check CUON station configuration 

In [44]:
merged = '/scratch/das/federico/MERGED_25FEB2022'
files = [f for f in os.listdir(merged) if '.nc' in f and 'Sensor' not in f ]
stat = list ( np.unique( [ s.split('_')[0] for s in files  ] ) ) 

print('Total merged stations: ' , len(stat) )

Total merged stations:  5187


In [45]:
cuon_stat_conf = '/users/staff/federico/GitHub/CEUAS_master_SEPTEMBER2021/CEUAS/CEUAS/public/merge/CUON_station_configuration.csv'
df = pd.read_csv(cuon_stat_conf, sep = '\t')
primary = list ( np.unique( df.primary_id ) ) 

print('Total CUON stat_conf stations: ' , len(primary) )

Total CUON stat_conf stations:  5218


In [48]:
missing_merged = []
for s in primary:
    if s not in stat:
        missing_merged.append(s)
    

In [49]:
missing_merged

['0-20000-0-04030',
 '0-20000-0-07156',
 '0-20000-0-10169',
 '0-20000-0-12812',
 '0-20000-0-12822',
 '0-20000-0-23022',
 '0-20000-0-34009',
 '0-20000-0-41509',
 '0-20000-0-47104',
 '0-20000-0-61701',
 '0-20000-0-63985',
 '0-20000-0-72202',
 '0-20000-0-72225',
 '0-20000-0-80425',
 '0-20000-0-81408',
 '0-20000-0-82578',
 '0-20000-0-91490',
 '0-20000-0-94110',
 '0-20000-0-94293',
 '0-20001-0-22522',
 '0-20300-0-04856',
 '0-20300-0-13928',
 '0-20300-0-23030',
 '0-20300-0-99014',
 '0-20300-0-99018',
 '0-20500-0-12812',
 '0-20500-0-33015',
 '0-20500-0-3869',
 '0-20500-0-53856',
 '0-20500-0-93119',
 '41247']

In [89]:
def check_coord(limit, file):
    
    ot = h5.File(file, 'r')['observations_table']
    lat = ot['latitude'][:]
    lon = ot['longitude'][:]
    
    cons = True
    
    if ( abs( min(lat) -max(lat) ) > limit ):
        cons = False
        
    if ( abs( min(lon) -max(lon) ) > limit ):
        cons = False    
        
    if cons:
        print('Cons LAT ' + file ,  min(lat) , ' ' , max(lat) )
        print('Cons LON ' + file ,  min(lon) , ' ' , max(lon) )

    
    else:
        print('INCONSISTENT LAT ' + file ,  min(lat) , ' ' , max(lat) )
        print('INCONSISTENT LON ' + file ,  min(lon) , ' ' , max(lon) )


        
        
        

In [83]:
f1 = '/scratch/das/federico/MERGED_25FEB2022/0-20500-0-10702_CEUAS_merged_v1.nc'
f2 = '/scratch/das/federico/MERGED_25FEB2022/0-20000-0-34139_CEUAS_merged_v1.nc'
f3 = '/scratch/das/federico/MERGED_25FEB2022/0-20300-0-99008_CEUAS_merged_v1.nc'
f4 = '/scratch/das/federico/MERGED_25FEB2022/0-20300-0-99021_CEUAS_merged_v1.nc'

f5 = '/scratch/das/federico/MERGED_25FEB2022/0-20000-0-58362_CEUAS_merged_v1.nc'

In [90]:
check_coord(1,f5)
source = h5.File(f5, 'r')['source_configuration']['source_file'][:]
source = list( np.unique ( [ b''.join(s).decode('utf-8') for s in source ] ) ) 
source

Cons LAT /scratch/das/federico/MERGED_25FEB2022/0-20000-0-58362_CEUAS_merged_v1.nc 31.17   31.42
Cons LON /scratch/das/federico/MERGED_25FEB2022/0-20000-0-58362_CEUAS_merged_v1.nc 121.42999   121.5


['0-20000-0-58362_bufr_harvested_era5.58367.bfr.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._1:58362.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._1:58367.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._2:43203.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._2:43204.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._2:43203.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._2:43204.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._9:2063.gz.nc',
 '0-20000-0-58362_era5_1_harvested_era5.conv.??????.58362.txt.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._1:58367.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._58362.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._58367.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._9:2063.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._9:4055.gz.nc',
 '0-20000-0-58362_igra2_harvested_CHM00058362-data.txt.nc',
 '0-20000-0-58362_ncar_h

In [92]:
f6 = "/mnt/users/scratch/leo/scratch/converted_v7/0-20000-0-58362_CEUAS_merged_v1.nc"
check_coord(1,f6)
source = h5.File(f6, 'r')['source_configuration']['source_file'][:]
source = list( np.unique ( [ b''.join(s).decode('utf-8') for s in source ] ) ) 
source

Cons LAT /mnt/users/scratch/leo/scratch/converted_v7/0-20000-0-58362_CEUAS_merged_v1.nc 31.17   31.42
Cons LON /mnt/users/scratch/leo/scratch/converted_v7/0-20000-0-58362_CEUAS_merged_v1.nc 121.42999   121.5


['0-20000-0-58362_bufr_harvested_era5.58367.bfr.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._1:58362.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._1:58367.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._2:43203.gz.nc',
 '0-20000-0-58362_era5_1759_harvested_era5.1759.conv._2:43204.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._2:43203.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._2:43204.gz.nc',
 '0-20000-0-58362_era5_1761_harvested_era5.1761.conv._9:2063.gz.nc',
 '0-20000-0-58362_era5_1_harvested_era5.conv.??????.58362.txt.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._1:58367.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._58362.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._58367.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._9:2063.gz.nc',
 '0-20000-0-58362_era5_2_harvested_era5.conv._9:4055.gz.nc',
 '0-20000-0-58362_igra2_harvested_CHM00058362-data.txt.nc',
 '0-20000-0-58362_ncar_h

In [86]:
source = list( np.unique(source) )


In [62]:
check_coord(1,f2)

INCONSISTENT LAT /scratch/das/federico/MERGED_25FEB2022/0-20000-0-34139_CEUAS_merged_v1.nc 43.77   51.1833
INCONSISTENT LON /scratch/das/federico/MERGED_25FEB2022/0-20000-0-34139_CEUAS_merged_v1.nc 11.25   40.7


In [61]:
check_coord(1,f3)

INCONSISTENT LAT /scratch/das/federico/MERGED_25FEB2022/0-20300-0-99008_CEUAS_merged_v1.nc 38.0   52.8
INCONSISTENT LON /scratch/das/federico/MERGED_25FEB2022/0-20300-0-99008_CEUAS_merged_v1.nc -71.0   -35.5


In [60]:
check_coord(1,f4)

INCONSISTENT LAT /scratch/das/federico/MERGED_25FEB2022/0-20300-0-99021_CEUAS_merged_v1.nc 27.67   50.0
INCONSISTENT LON /scratch/das/federico/MERGED_25FEB2022/0-20300-0-99021_CEUAS_merged_v1.nc -145.0   -145.0
