# AMMA Campaign


Utility to read and convert BUFR files from:


https://confluence.ecmwf.int/display/TCBUF/Radiosonde+BUFR+templates

https://confluence.ecmwf.int/display/TCBUF/Data+availability (see radiosondes)

https://www.ncei.noaa.gov/data/ecmwf-global-upper-air-bufr/doc/


For instructions on the BUFR tools, see:
https://confluence.ecmwf.int/display/ECC/BUFR+tools



# AMMA
file:///home/federico/Downloads/1520-0477-2008bams2436_1.pdf  (local PDF; the file name matches the journals.ametsoc.org article linked below) -> check the WMO ids for the inventory

http://www.amma-catch.org/sites/amma-catch.org/IMG/pdf/qjrms_ammafieldcampaigns_486.pdf 

https://journals.ametsoc.org/view/journals/bams/89/7/2008bams2436_1.xml







In [8]:
from eccodes import * 
import os,sys
import pandas as pd
import datetime
from datetime import datetime
import numpy as np

from tqdm import tqdm

In [28]:
def _amma_station_id(bufr):
    """Return the station id of a BUFR message as a 5-character string.

    The id is the block number (zero padded to 2 digits) followed by the
    station number (zero padded to 3 digits).  "00000" is returned when the
    keys cannot be read or when the block number is not in the range 1-99.
    """
    try:
        block = codes_get(bufr, "blockNumber")
        stnum = codes_get(bufr, "stationNumber")
        if 0 < block < 100:
            return "%.2i%.3i" % (block, stnum)
    except Exception:
        pass
    return "00000"


def _amma_array_or_nan(bufr, key, size):
    """Return the values of *key*, or *size* NaNs when the key cannot be read."""
    try:
        return codes_get_array(bufr, key)
    except Exception:
        # np.full (not np.empty): the fallback must not contain
        # uninitialised memory that could be mistaken for real data
        return np.full(size, np.nan)


def _amma_keep_positive(values, size):
    """Return the first *size* values as floats; values not above 0.1 become NaN."""
    values = np.asarray(values, dtype=float)[:size]
    return np.where(values > 0.1, values, np.nan)


def read_AMMA(file):
    """Read all the messages of an AMMA BUFR file into per-station dataframes.

    Each BUFR message is unpacked and converted to a dataframe with one row
    per pressure level.  Messages without the "pressure" key are skipped.
    Missing "airTemperature", "dewpointTemperature" and
    "nonCoordinateGeopotentialHeight" keys are filled with NaN; the other
    keys are required and an error is raised when they cannot be read.

    Parameters
    ----------
    file : str
        Path of the BUFR file.

    Returns
    -------
    dict
        Maps the station id (see ``_amma_station_id``) to the concatenation
        of the dataframes of all the messages of that station.  The columns
        are (not necessarily CDM compliant): 'index' (level counter inside
        the message), 'statid', 'latitude', 'longitude', 'pressure',
        'airTemperature', 'nonCoordinateGeopotentialHeight', 'typicalDate',
        'typicalTime', 'windSpeed', 'windDirection', 'dewpointTemperature',
        'heightOfStation'.
    """
    miss_value = -1.e100  # marker replaced by NaN in the output
    all_data = {}

    # binary mode: BUFR is a binary format; the context manager closes the
    # file also when the decoding of a message fails
    with open(file, 'rb') as f:
        while True:
            bufr = codes_bufr_new_from_file(f)
            if bufr is None:  # no more messages
                break

            try:
                # eccodes must expand all the descriptors and unpack the data section
                codes_set(bufr, 'unpack', 1)

                typicalDate = codes_get_array(bufr, "typicalDate")[0]
                typicalTime = codes_get_array(bufr, "typicalTime")[0]
                statid = _amma_station_id(bufr)

                try:
                    pressure = codes_get_array(bufr, "pressure")
                except Exception:
                    continue  # nothing to store; the handle is released below
                num_lev = len(pressure)  # number of rows of this message

                airTemperature = _amma_array_or_nan(bufr, "airTemperature", num_lev)
                windDirection = codes_get_array(bufr, "windDirection")
                windSpeed = codes_get_array(bufr, "windSpeed")
                heightOfStation = codes_get_array(bufr, "heightOfStation")
                latitude = codes_get_array(bufr, "latitude")
                longitude = codes_get_array(bufr, "longitude")
                # not all the bufr files have the dewpoint
                dewpointTemperature = _amma_array_or_nan(bufr, "dewpointTemperature", num_lev)
                geopotential = _amma_array_or_nan(bufr, "nonCoordinateGeopotentialHeight", num_lev)
            finally:
                # always free the message handle, also on skip or error
                codes_release(bufr)

            # station-wide values are repeated on every level; the key order
            # fixes the column order of the dataframe
            data = {
                'statid': [statid] * num_lev,
                'latitude': [latitude[0]] * num_lev,
                'longitude': [longitude[0]] * num_lev,
                'pressure': pressure,
                'airTemperature': _amma_keep_positive(airTemperature, num_lev),
                'nonCoordinateGeopotentialHeight': geopotential[:num_lev],
                'typicalDate': [typicalDate] * num_lev,
                'typicalTime': [typicalTime] * num_lev,
                'windSpeed': windSpeed[:num_lev],
                'windDirection': windDirection[:num_lev],
                'dewpointTemperature': _amma_keep_positive(dewpointTemperature, num_lev),
                'heightOfStation': [heightOfStation[0]] * num_lev,
            }

            df = pd.DataFrame(data)
            # NaN (not an empty string) keeps the numeric columns numeric
            df = df.replace(miss_value, np.nan)
            df = df.reset_index()

            all_data.setdefault(statid, []).append(df)

    return {statid: pd.concat(frames) for statid, frames in all_data.items()}

In [3]:
#d = read_k('/scratch/das/federico/databases_service2/AMMA_BUFR/HRT2006041118')


In [4]:
#d = read_AMMA('/scratch/das/federico/databases_service2/AMMA_BUFR/HRT2006041118')
#d['61052']

In [5]:
#d['61052'].columns

In [29]:
# Read every BUFR file of the AMMA directory
amma_dir = '/scratch/das/federico/databases_service2/AMMA_BUFR'

# directory entries whose name contains neither 'py' nor 'txt'
files = [name for name in os.listdir(amma_dir) if not ('py' in name or 'txt' in name)]

#files = files[:10]

# station id -> list of dataframes, one per input file containing the station
all_data_stations = {}

# First pass: decode each file and file its per-station dataframes under
# the station id. Entries of the split-csv output directories are ignored.
for f in tqdm(files):
    if 'AMMA_split_csv' not in f:
        for stat, df in read_AMMA(amma_dir + '/' + f).items():
            all_data_stations.setdefault(stat, []).append(df)

100%|██████████| 3566/3566 [05:06<00:00, 11.62it/s]


In [20]:
# Station ids collected by the loop over the BUFR files
all_data_stations.keys()

dict_keys(['60360', '61223', '61291', '61442', '61641', '62721', '62730', '64458', '65330', '65503', '65510', '67027', '00000', '60390', '61052', '64700', '65418', '67002', '67005', '67095', '64750', '61090', '67781', '61024', '65125', '65344', '61687', '67774', '62760', '64400', '64650', '64910', '08589', '64500', '65578', '60680', '64450', '60018', '60571', '60630', '60656', '60715', '60760', '61901', '61902', '62378', '62403', '62423', '63450', '63894', '64870', '67083', '08594', '60155', '62306', '62337', '62414', '63741', '61415', '68032', '62641', '62650', '61831', '65202', '62010', '62640', '65387', '61226', '62600', '67881', '62840', '65046'])

In [12]:
# Number of distinct station ids collected
len(all_data_stations.keys() ) 

72

In [31]:

# Second pass: merge the dataframes of each station into a single table
# ordered by date, time and pressure, and write it as a tab-separated file
out = '/scratch/das/federico/databases_service2/AMMA_BUFR/AMMA_split_csv_22FEB2024'
if not os.path.isdir(out):
    os.mkdir(out)

for station, station_frames in all_data_stations.items():
    print('COMBINING STATION ::: ' , station )

    df_combi = (
        pd.concat(station_frames)
        .sort_values(by=['typicalDate', 'typicalTime', 'pressure'])
        .reset_index()
    )

    # one file per station: <out>/<station>_amma.csv
    out_name = os.path.join(out, station + '_amma.csv')
    df_combi.to_csv(out_name, sep='\t', na_rep='')

    print('DONE STATION ::: ' , station )



        #print(out_name)

COMBINING STATION :::  60360
DONE STATION :::  60360
COMBINING STATION :::  61223
DONE STATION :::  61223
COMBINING STATION :::  61291
DONE STATION :::  61291
COMBINING STATION :::  61442
DONE STATION :::  61442
COMBINING STATION :::  61641
DONE STATION :::  61641
COMBINING STATION :::  62721
DONE STATION :::  62721
COMBINING STATION :::  62730
DONE STATION :::  62730
COMBINING STATION :::  64458
DONE STATION :::  64458
COMBINING STATION :::  65330
DONE STATION :::  65330
COMBINING STATION :::  65503
DONE STATION :::  65503
COMBINING STATION :::  65510
DONE STATION :::  65510
COMBINING STATION :::  67027
DONE STATION :::  67027
COMBINING STATION :::  00000
DONE STATION :::  00000
COMBINING STATION :::  60390
DONE STATION :::  60390
COMBINING STATION :::  61052
DONE STATION :::  61052
COMBINING STATION :::  64700
DONE STATION :::  64700
COMBINING STATION :::  65418
DONE STATION :::  65418
COMBINING STATION :::  67002
DONE STATION :::  67002
COMBINING STATION :::  67005
DONE STATION ::: 

# Simple map plot

In [None]:
def quick_analysis(data_dir=None):
    """Read the station files of a directory and collect their positions.

    Parameters
    ----------
    data_dir : str, optional
        Directory with the tab-separated station files.  When omitted, the
        module-level ``out`` directory (output of the split csv files) is used.

    Returns
    -------
    map_data : pandas.DataFrame
        One row per file with the columns 'station' (file name), 'lat' and
        'lon' (values of the first row of the file, NaN when not available).
    date : str or None
        First six characters of the second '_'-separated token of the path
        of the last file read; None when there is no such token.
        NOTE(review): the token is taken from the full path, so underscores
        in the directory name count as separators too -- confirm that this
        is the intended value.
    """
    if data_dir is None:
        data_dir = out

    stations, lats, lons = [], [], []
    file = None  # stays None when the directory is empty
    for station in tqdm(os.listdir(data_dir)):
        file = data_dir + '/' + station
        df = pd.read_csv(file, sep='\t')
        try:
            lat, lon = df.latitude[0], df.longitude[0]
        except (AttributeError, KeyError, IndexError):
            # missing column or empty file: do not reuse the position of
            # the previous station
            lat, lon = np.nan, np.nan
        stations.append(station)
        lats.append(lat)
        lons.append(lon)

    map_data = pd.DataFrame.from_dict({'station': stations, 'lat': lats, 'lon': lons})

    try:
        date = file.split('_')[1][0:6]
    except (AttributeError, IndexError):
        print(file)
        date = None

    return map_data, date


        

In [None]:
# Table with the name and position of each station file, used by the map below
map_data, date = quick_analysis()

In [None]:
import plotly.express as px

# Scatter plot of the station positions on a map; the station name is shown
# when hovering. The figure is called `fig` so that the builtin `map` is
# not shadowed for the rest of the session.
fig = px.scatter_geo(map_data,
                     lat=map_data.lat,
                     lon=map_data.lon,
                     hover_name="station")

fig.update_layout(
        height=1100, width=1500,
        title= { 'text': 'AMMA Station' ,  "yref": "paper","y": 0.9, "yanchor": "bottom" },

        font=dict( family="Courier New, monospace", size=20, #color="RebeccaPurple"
        ),
        # margin=dict(l=20, r=20, t=0, b=5),
        legend=dict(font=dict(family="Courier New, monospace", size=25, color="black"), title = "Sensor Id"),
        legend_title=dict(font=dict(family="Courier New, monospace", size=25, color="blue"))
    )

fig.show()

# AMMA Metadata from https://journals.ametsoc.org/view/journals/bams/89/7/2008bams2436_1.xml

In [None]:
# Load the digitized AMMA campaign metadata table (comma-separated)
meta = pd.read_csv('/users/staff/federico/GitHub/CEUAS_master_JULY2922/CEUAS/CEUAS/meta/inventory_comparison_2/data/tables/AMMA_campaign_digitized_metadata.csv' , sep = ',')

In [None]:
# Show the metadata table as read from the csv file
meta

# Extend the basic table with information from the files such as min_date, max_date, elevation

In [None]:
# Per-station values read from the split csv files; one entry per row of `meta`
elevation, min_date, max_date, files, lats, lons = [], [], [], [], [], []

stats = meta['WMO station No.'].values
# directory of the split csv files
out = '/scratch/das/federico/databases_service2/AMMA_BUFR/AMMA_split_csv'

# Columns of the split csv files:
#   'index', 'statid', 'latitude', 'longitude', 'pressure',
#   'airTemperature', 'nonCoordinateGeopotentialHeight', 'typicalDate',
#   'typicalTime', 'windSpeed', 'windDirection', 'dewpointTemperature',
#   'heightOfStation'

# List the directory once instead of once per station. An unreadable
# directory leaves every station without a file, i.e. all values NaN.
try:
    csv_files = [f for f in os.listdir(out) if '.csv' in f]
except OSError:
    csv_files = []

for s in stats:
    s = str(s)

    print(s)

    try:
        # first file whose name contains the station number
        file = [f for f in csv_files if s in f][0]
        df = pd.read_csv(out+'/'+file, sep='\t')
        el = df['heightOfStation'].values[0]
        mind = min(df['typicalDate'].values)
        maxd = max(df['typicalDate'].values)
        lat = df['latitude'].values[0]
        lon = df['longitude'].values[0]

    except Exception:
        # no file for this station, or unreadable/incomplete file: keep the
        # row with empty values (Exception, so that Ctrl-C still works)
        el, mind, maxd, file, lat, lon = np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    elevation.append(el)
    min_date.append(mind)
    max_date.append(maxd)
    files.append(file)
    lats.append(lat)
    lons.append(lon)


meta['elevation'] = elevation
meta['min_date'] = min_date
meta['max_date'] = max_date
meta['conv_lat'] = lats
meta['conv_lon'] = lons

meta['files'] = files

meta.to_csv('/users/staff/federico/GitHub/CEUAS_master_JULY2922/CEUAS/CEUAS/meta/inventory_comparison_2/data/tables/AMMA_campaign_digitized_metadata_extended.csv' , sep = '\t')

In [None]:
# Show the metadata table with the columns added above
meta

In [None]:
# Name prefixes (text before the first '_', presumably the station id --
# verify against the file naming) of the harvested AMMA csv files, skipping
# the names that contain '20999', and of the merged netCDF files.
harv = os.listdir('/scratch/das/federico/COP2_HARVEST_JAN2023/amma')

harv = [f.split('_')[0] for f in harv if '.csv' in f and '20999' not in f]
merged = [f.split('_')[0] for f in os.listdir('/scratch/das/federico/MERGED_FEB2023') if '.nc' in f]



In [None]:
# Number of harvested AMMA csv files kept by the filter above
len(harv)

In [None]:
# NOTE(review): `remove` is not assigned in any cell visible here, so this
# raises NameError unless it is defined elsewhere -- confirm.
remove[:10]