In [1]:
import os,sys
import numpy as np
import pandas as pd

from tqdm import tqdm


# datasets locations
base = '/scratch/das/federico/databases_service2/'
npsound = 'NPSOUND-NSIDC0060'
hara = 'HARA-NSIDC-0008'
ship = 'SHIPSOUND-NSIDC-0054'





# NPSOUND

# HARA
https://nsidc.org/sites/default/files/nsidc-0008-v001-userguide.pdf

... As a compromise, the following file structure is used: One file contains a one-year time series of
soundings for one station. 


*1.2.1 Header Record*
1. STATION
World Meteorological Organization (WMO) station identification number. All stations are ascribed a
WMO number, with the exception of "SHIP M" (Figure 1; Appendix 1). This was arbitrarily
assigned a station code of 80000.
2. LATITUDE
Latitude of the station in degrees and hundredths of a degree (N).
3. LONGITUDE
Longitude from 0 to 360 degrees, in degrees and hundredths of a degree, measured
counterclockwise from the Greenwich Meridian as viewed from the pole.
4. YEAR
Year of sounding.
5. MONTH
Month of sounding.
6. DAY
Day of sounding.
7. HOUR
Hour of sounding. This is usually 0000 GMT, 0600 GMT, 1200 GMT, or 1800 GMT. Prior to 1952,
however, soundings reported at 0800 GMT and 1400 GMT.
8.-10. PROC1, PROC2, PROC3
Special processing codes. These are only available for sounding type ID=1. They provide
information on special processing, and whether soundings were manually or automatically
processed (see Appendix 4). For all other sounding types (ID=2, 3, 4 or 5), PROC1, PROC2, and
PROC3 are assigned blanks. Pages 6 and 7 of Office Note 29 (Mulder (1977)), which describe
these codes for the ID=1 data, are given in Appendix 5.
11. REP
Report type. This will always be assigned the value of 011 or 0. It denotes that the sounding was
taken at a fixed land station.
12. ELEVATION
Station elevation in meters above mean sea level. Missing values are 99999.



1.2.2 Data Records
The variables are as follows:
1. PRESSURE
Pressure in tenths of millibars. Data from most stations are rounded to the nearest 1 mb. Missing
values are 99999.
2. GEOPOTENTIAL HEIGHT
Geopotential height of the pressure level in whole meters. Missing values are 99999.
3. TEMPERATURE
Temperature in tenths of a degree (degrees C). Missing values are 9999.
4. DEWPOINT DEPRESSION
Dewpoint depression in tenths of a degree (degrees C). Missing values are 999.
5. WIND DIRECTION
Wind direction, 0 to 360 degrees, measured clockwise from north (e.g., 90 degrees is east).
Missing values are 999.
6. WIND SPEED
Wind speed in whole meters per second. Missing values are 999.
7.-15. QG, QG1, QT, QT1, QD, QD1, QW, QW1, QP
Quality control flags for geopotential height (QG, QG1), temperature (QT, QT1), dewpoint
depression (QD, QD1), winds (QW, QW1), and pressure (QP). The values of QG, QT, QD, QW,
and QP depend on the sources of the sounding as identified by the value of ID (see Appendix 4).
These vary considerably among the sources of the original soundings, and on whether the original 
USER GUIDE: Historical Arctic Rawinsonde Archive, Version 1
Page 5 of 37 National Snow and Ice Data Center
nsidc.org
processing was accomplished automatically (via computer) or manually (designated by "auto" or
"man" Appendix 4). QG1, QT1, QD1 and QW1 are additional quality code flags for geopotential
height, temperature, dewpoint depression, and winds, based on our error-checking procedure,
described earlier. Values are either 'P' (passed limits check) or 'F' (failed limits check). No limits
check quality flag is given for pressure, as we used pressure as the test variable in the limits check.
*(These additional codes are not available on Volume 5. Please see Revision below.)
16. LEVCK
A quality control flag set to 'P' if no errors were detected in the level for any variable, based on the
limits check described above. If any errors were detected in the limits check, it is set to 'F'. *(This
variable is not available on Volume 5. Please see Revision below.)
17. LTYPE
Code for level type (surface, significant, or mandatory). Only relevant for soundings of type ID=4
(see Appendix 4). For sounding types other than ID=4, the value is assigned a blank. This variable
is not available on Volume 5. Please see Revision below.
18. LQUAL
Flag for quality of level. Only relevant for soundings of type ID=4 (see Appendix 4). This variable is
not available on Volume 5. Please see Revision below.

In [16]:
# find all stations_id

stations = []
years = os.listdir( base + '/' + hara )
for year in tqdm(years):
    ydir = base + '/' + hara  + '/' + year
    files = os.listdir( ydir )
    for f in files:
        stat = f.split('.')[0]
        if stat not in stations:
            stations.append(stat)
            
    

100%|██████████| 49/49 [00:00<00:00, 1187.36it/s]


In [3]:
stations[:5]

['70026', '71072', '71074', '71917', '71918']

In [4]:
years = os.listdir( base + '/' + hara )

meta_dic = {}

for v in ['stationid' , 'latitude' , 'longitude' , 'elevation', 'date']:
    meta_dic[v] = []

In [40]:
#year
def read_hara_data(file):
    stat = file.split('/')[-1].split('.')[0]
    year = file.split('/')[-2]
    #print(stat)
    data = {}
    var = ['date_time', 'gph', 'temp', 'dew', 'windsp', 'winddir', 'lat', 'lon', 'pressure','month','day','year','hour', 'elev']
    for v in var:
        data[v] = []
        
    with open(file, 'r', errors='replace') as filee:
        lines = filee.readlines()
    
    for l in lines:
        if stat == l[0:5]:
            lat_deg = l[5:8]
            lat_cent = l[8:10]
            lon_deg = l[10:13]
            lon_cent = l[13:15]
            lat = float( lat_deg + '.' + lat_cent )
            lon = float( lon_deg + '.' + lon_cent )
            
            if '289.97' in str(lat):
                print(file, '  ' , l)
            if '289.97' in str(lon):
                print(file, '  ' , l)
                
            # fix problems when reading files with inconsistent coordinates convention 
            if stat == 20667 and lon == '289.97':
                lon = '70.03'
                
            y,m,d = str(year), str(l[18:20]).replace(' ','') , str(l[20:22]).replace(' ','')
            if len(m) < 2 : m = '0' + m
            if len(d) < 2 : d = '0' + d
            
            hour = str(l[22:24]).replace(' ','')

            date_time = y + '-' + str(m) + '-' + str(d)
        
            elev = l.split(' ')[-1].replace('\n', '')

        else:
            pressure = l[:6]
            gph = l[6:12]
            temp = l[12:17]
            dew = l[17:21]
            
            winddir =l[21:25]
            windsp = l[25:29]
        
            data['date_time'].append(date_time)
            data['pressure'].append(pressure)
            data['gph'].append(gph)
            data['temp'].append(temp)

            data['dew'].append(dew)
            data['windsp'].append(windsp)
            data['winddir'].append(winddir)
            data['lat'].append(lat)
            data['lon'].append(lon)
            
            data['month'].append(m)
            data['year'].append(y)
            data['day'].append(d)
            data['hour'].append(hour)

            data['elev'].append(elev)

    df = pd.DataFrame( data )
    
    return df 
        

In [27]:
ff = '1948/70026.48'
d = read_hara_data(base + '/' + hara + '/1948/70026.48' ) 

In [34]:
d.head(5)

Unnamed: 0,date_time,gph,temp,dew,windsp,winddir,lat,lon,pressure,month,day,year,hour,elev
0,1948-01-01,8,-191,32,7,80,71.28,203.23,9870,1,1,1948,3,4
1,1948-01-01,280,-212,29,10,70,71.28,203.23,9500,1,1,1948,3,4
2,1948-01-01,686,-239,28,12,70,71.28,203.23,9000,1,1,1948,3,4
3,1948-01-01,1101,-251,28,17,80,71.28,203.23,8500,1,1,1948,3,4
4,1948-01-01,1545,-200,31,22,90,71.28,203.23,8000,1,1,1948,3,4


In [31]:
stations_ = stations[30:]

In [38]:
stats = [s for s in stations if '20667' in s ]

In [42]:
for s in stats:
    all_station_df = []
    
    for year in tqdm(years):
        ydir = base + '/' + hara  + '/' + year
        files = os.listdir( ydir )
        for f in files:
            file = ydir+'/'+f
            if s not in file:
                continue

            #print(file)
            data = read_hara_data(file)
            all_station_df.append(data)
            
    station_df = pd.concat(all_station_df)

    # saving the concatenated dataframe to csv 
    out_dir = '/scratch/das/federico/databases_service2/HARA-NSIDC-0008_csv/'
    #station_df.to_csv(out_dir + '/' + s + '_stationfile.csv' , sep = '\t' , index= False )
    
        

 88%|████████▊ | 43/49 [00:00<00:00, 135.20it/s]

/scratch/das/federico/databases_service2//HARA-NSIDC-0008/1973/20667.73    20667 733328997 73 810 0 990 11    612  12 1

/scratch/das/federico/databases_service2//HARA-NSIDC-0008/1973/20667.73    20667 733328997 7310 1 0 990 11    612  14 1



100%|██████████| 49/49 [00:00<00:00, 136.47it/s]


## Analyze inventory directory

In [45]:
inventory_dir = '/users/staff/federico/GitHub/CEUAS_master_JULY2922/CEUAS/CEUAS/meta/inventory_comparison_2/code/inventories/hara'

files = [f for f in os.listdir(inventory_dir) ]

ident  = [f for f in files if '_ident' in f ]
unid   = [f for f in files if 'noDist' in f ]
incons = [f for f in files if 'inco' in f ]


In [47]:
print('IDENTIFIED: ' , len(ident) ) 
print('UNIDENTIFIED: ' , len(unid) ) 
print('INCOSISTENT COORD: ' , len(incons) ) 



IDENTIFIED:  80
UNIDENTIFIED:  2
INCOSISTENT COORD:  7


## Analyze station configuration file 


In [48]:
sc = '/users/staff/federico/GitHub/CEUAS_master_JULY2922/CEUAS/CEUAS/meta/inventory_comparison_2/code/station_configuration/hara_station_configuration_extended.csv'

In [54]:
scdf = pd.read_csv(sc, sep = '\t')
hara_ids = scdf.primary_id.values
ids[:10]

array(['0-20000-0-25282', '0-20000-0-23383', '0-20000-0-22235',
       '0-20700-0-04210', '0-20000-0-04330', '0-20000-0-21908',
       '0-20001-0-01010', '0-20000-0-04231', '0-20000-0-25248',
       '0-20000-0-23552'], dtype=object)

In [53]:
# all ids found in the merged directory
merged_ids = [s.split('_')[0] for s in os.listdir('/scratch/das/federico/MERGED_FEB2023') if '.nc' in s and 'Sensor' not in s ]

In [55]:
# stations in HARA dataset but not in merged
missing_hara = [s for s in hara_ids if s not in merged_ids ]

In [58]:
print('MISSING HARA STATIONS: ' , len(missing_hara) )
for s in missing_hara:
    print(s)

MISSING HARA STATIONS:  9
0-20000-0-23383
0-20700-0-04210
0-20000-0-25248
0-20300-0-04310
0-20300-0-04340
0-20000-0-71090
0-20000-0-71938
0-20300-0-71051
0-20300-0-71072


In [None]:
4	North America, Central America and the Caribbean	Canada	CAN	0-20000-0-71072	71072	0	THUNDER BAY, ONT	48 22 19N	089 19 18W	199.30		199.30		mean sea level														Surface land meteorological station (SYNOP), GOS
