# Check AMMA station

In [1]:
from eccodes import * 
import os,sys
import pandas as pd
import datetime
from datetime import datetime
import numpy as np

from tqdm import tqdm

import os,sys





In [61]:
def read_AMMA(file):

    f = open(file)
    all_data = {}
    
    
    """ Name of the columns as they will appear in the pandas dataframe (not necessarily CDM compliant) """
    #column_names = ['report_timestamp' , 'iday',  'station_id', 'latitude', 'longitude', 'pressure', 'value','varno@body']
            
    while 1:
        
        data = { 
        'statid': [] , 
        'latitude': [], 
        'longitude': [] , 

        'pressure': [],
        'airTemperature': [] , 
        'nonCoordinateGeopotentialHeight' : [],
                
        'typicalDate': [], 
        'typicalTime': [],

        'windSpeed': [],
        'windDirection': [],
                
        "airTemperature": [],
        "dewpointTemperature": [],
        'nonCoordinateGeopotentialHeight':[],
        'heightOfStation' : []
                
          }
        
        #lista = [] # temporary list
        bufr = codes_bufr_new_from_file(f)
   
        if bufr is None:
            break
   
        codes_set(bufr, 'unpack', 1) # eCcodes must expand all the descriptors and unpack the data section
    
        typicalDate = codes_get_array(bufr, "typicalDate")[0]
        typicalTime = codes_get_array(bufr, "typicalTime")[0]   
        
        year, month, day =  typicalDate[0:4], typicalDate[4:6] , typicalDate[6:8]
        hour, minutes = typicalTime[0:2] , typicalTime[2:4]
        
        # build station number 
        statid = "00000   "
        try:
            block = codes_get(bufr, "blockNumber")
            stnum = codes_get(bufr, "stationNumber")
            if (block > 0) and (block < 100):  # or block != CODES_MISSING_LONG
                statid = str.format("%.2i%.3i   " % (block, stnum))
        except Exception:
            statid = "00000   "
        if statid == "00000   ":
            statid = statid[0:8]
        statid = statid.replace(' ','')
       
        if '64400' not in statid:
            continue 
            
            
        idate =  datetime.strptime(year + month + day + hour + minutes, '%Y%m%d%H%M')
        iday = int(year + month + day )

        try:
            pressure          = codes_get_array(bufr, "pressure") 
        except:
            continue
        try:
            airTemperature    = codes_get_array(bufr, "airTemperature")  
        except:
            airTemperature = np.empty(len(pressure))
            
        windDirection = codes_get_array(bufr, "windDirection")
        windSpeed     = codes_get_array(bufr, "windSpeed")
        heightOfStation = codes_get_array(bufr, "heightOfStation")
        latitude = codes_get_array(bufr, "latitude")
        longitude = codes_get_array(bufr, "longitude")

        
        try:  # not all the bufr files have the dewpoint 
            dewpointTemperature          = codes_get_array(bufr, "dewpointTemperature")
        except:
            dewpointTemperature= np.empty(len(airTemperature))
            dewpointTemperature[:] = np.nan
            
        num_lev             = len(pressure) # number of  distinct pressure levels 
        
        try:
            geopotential   = codes_get_array(bufr, "nonCoordinateGeopotentialHeight")         
        except:
            geopotential = np.full( (1,len(airTemperature)) , np.nan )[0,:]
                
        """
        if report_id == 0:
            ''' Check again but these values should remain the same for all cnt, so it makes no sense to read them every time '''
            lat                     = codes_get(bufr, "latitude")
            lon                    = codes_get(bufr, "longitude")
            alt                     = float(codes_get(bufr, "heightOfStation"))
            blockNumber    = codes_get(bufr, "blockNumber")
            stationNumber = codes_get(bufr, "stationNumber")
            #statid                = str(blockNumber*1000+stationNumber) # changed to int instead of str
            statid                = blockNumber*1000+stationNumber
            if statid not in     stations_id:
                stations_id.append(statid) 
        """
        
        codes_release(bufr)
   
        miss_value = -1.e100     
        
        #print(statid) 
        for i in range(len(pressure)):
            data['statid'].append(statid)
            
            
            data['heightOfStation'].append(heightOfStation[0])
            data['nonCoordinateGeopotentialHeight'].append(geopotential[i])
            data['latitude'].append(latitude[0])
            data['longitude'].append(longitude[0])
            data['pressure'].append(pressure[i])
            
            
            
            
            if airTemperature[i] >0.1:
                data['airTemperature'].append(airTemperature[i])
            else:
                data['airTemperature'].append(np.nan)

            if dewpointTemperature[i] > 0.1:
                data['dewpointTemperature'].append(dewpointTemperature[i])
            else:
                data['dewpointTemperature'].append(np.nan)
 
            data['windDirection'].append(windDirection[i])
            data['windSpeed'].append(windSpeed[i])
            data['typicalDate'].append(typicalDate)
            data['typicalTime'].append(typicalTime)

            """
            dp = dewpointTemperature[i]
            if press == miss_value:
                press = np.nan 
            if dp == miss_value:
                dp = np.nan
            if airT == miss_value :    # replacing none values with numpy nans
                airT = np.nan 
            if winds == miss_value:
                winds = np.nan
            if gph == miss_value:
                gph = np.nan                
            if windd == 2147483647 or windd == -2147483647:
                windd = np.nan 
            """ 
        
        #for k in data.keys():
        #    print(k, '  ' , len(data[k]) ) 
            
        #print(data)
        df = pd.DataFrame.from_dict(data)
        df = df.fillna(np.nan)
        df = df.replace(miss_value, np.nan)

        df = df.reset_index()
        
        if statid not in all_data.keys():
            all_data[statid] = []
        
        all_data[statid].append(df)
        
    f.close()
    
    all_df = {}
    for k in all_data.keys():
        all_df[k] = pd.concat(all_data[k])
        
    return all_df           

In [62]:
amma_dir= '/scratch/das/federico/databases_service2/AMMA_BUFR/' 
files = [ amma_dir + '/' + f for f in os.listdir(amma_dir) ]



In [63]:
files[:10]

['/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006060812',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//TEMP2006011212',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006081906',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006040406',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006122406',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//HRT2006080618',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006121900',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//HRT2006050818',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//PILOT2006092012',
 '/scratch/das/federico/databases_service2/AMMA_BUFR//HRT2006091518']

In [64]:
a = read_AMMA(files[1] )

In [65]:
all_data_stations = {}

# first loop, extracts station data from single day file
# store each station in a pandas, and in a dictionary
for file in tqdm(files):
    d = read_AMMA(file)
    for stat in d.keys():
        df = d[stat]
        #print(df)
        if stat not in all_data_stations:
            all_data_stations[stat] = []
            
        all_data_stations[stat].append(df)

  0%|          | 10/3568 [00:00<03:27, 17.18it/s]

   index statid  latitude  longitude  pressure  airTemperature  \
0      0  64400     -4.82       11.9   85000.0             NaN   
1      1  64400     -4.82       11.9   70000.0             NaN   
2      2  64400     -4.82       11.9   50000.0             NaN   
3      3  64400     -4.82       11.9   40000.0             NaN   
4      4  64400     -4.82       11.9   30000.0             NaN   
5      5  64400     -4.82       11.9   25000.0             NaN   
6      6  64400     -4.82       11.9   20000.0             NaN   
7      7  64400     -4.82       11.9   15000.0             NaN   
8      8  64400     -4.82       11.9   10000.0             NaN   

   nonCoordinateGeopotentialHeight typicalDate typicalTime  windSpeed  \
0                              NaN    20060920      110000        3.1   
1                              NaN    20060920      110000        8.2   
2                              NaN    20060920      110000        5.7   
3                              NaN    20060920 

  1%|          | 21/3568 [00:01<06:19,  9.34it/s]

    index statid  latitude  longitude  pressure  airTemperature  \
0       0  64400     -4.82       11.9  101000.0           301.8   
1       1  64400     -4.82       11.9  100000.0           298.2   
2       2  64400     -4.82       11.9   92500.0           293.8   
3       3  64400     -4.82       11.9   85000.0           289.2   
4       4  64400     -4.82       11.9   70000.0           282.8   
5       5  64400     -4.82       11.9   50000.0           268.5   
6       6  64400     -4.82       11.9   40000.0           256.5   
7       7  64400     -4.82       11.9   30000.0           241.7   
8       8  64400     -4.82       11.9   25000.0           231.5   
9       9  64400     -4.82       11.9   20000.0           219.7   
10     10  64400     -4.82       11.9   15000.0           204.2   
11     11  64400     -4.82       11.9   10000.0           192.0   

    nonCoordinateGeopotentialHeight typicalDate typicalTime  windSpeed  \
0                               NaN    20061019      1

  2%|▏         | 58/3568 [00:04<03:29, 16.72it/s]

   index statid  latitude  longitude  pressure  airTemperature  \
0      0  64400     -4.82       11.9   85000.0             NaN   
1      1  64400     -4.82       11.9   70000.0             NaN   
2      2  64400     -4.82       11.9   50000.0             NaN   
3      3  64400     -4.82       11.9   40000.0             NaN   
4      4  64400     -4.82       11.9   30000.0             NaN   
5      5  64400     -4.82       11.9   25000.0             NaN   
6      6  64400     -4.82       11.9   20000.0             NaN   
7      7  64400     -4.82       11.9   15000.0             NaN   

   nonCoordinateGeopotentialHeight typicalDate typicalTime  windSpeed  \
0                              NaN    20060623      110000        5.1   
1                              NaN    20060623      110000        7.7   
2                              NaN    20060623      110000        8.7   
3                              NaN    20060623      110000        4.6   
4                              NaN    20

  2%|▏         | 77/3568 [00:05<01:32, 37.92it/s]

   index statid  latitude  longitude  pressure  airTemperature  \
0      0  64400     -4.82       11.9   85000.0             NaN   
1      1  64400     -4.82       11.9   70000.0             NaN   
2      2  64400     -4.82       11.9   50000.0             NaN   
3      3  64400     -4.82       11.9   40000.0             NaN   
4      4  64400     -4.82       11.9   30000.0             NaN   
5      5  64400     -4.82       11.9   25000.0             NaN   
6      6  64400     -4.82       11.9   20000.0             NaN   
7      7  64400     -4.82       11.9   15000.0             NaN   
8      8  64400     -4.82       11.9   10000.0             NaN   

   nonCoordinateGeopotentialHeight typicalDate typicalTime  windSpeed  \
0                              NaN    20061009      110000        5.7   
1                              NaN    20061009      110000       13.4   
2                              NaN    20061009      110000        6.2   
3                              NaN    20061009 

  2%|▏         | 86/3568 [00:05<01:50, 31.64it/s]

   index statid  latitude  longitude  pressure  airTemperature  \
0      0  64400     -4.82       11.9   85000.0             NaN   
1      1  64400     -4.82       11.9   70000.0             NaN   
2      2  64400     -4.82       11.9   50000.0             NaN   
3      3  64400     -4.82       11.9   40000.0             NaN   
4      4  64400     -4.82       11.9   20000.0             NaN   
5      5  64400     -4.82       11.9   15000.0             NaN   
6      6  64400     -4.82       11.9   10000.0             NaN   

   nonCoordinateGeopotentialHeight typicalDate typicalTime  windSpeed  \
0                              NaN    20060901      110000        5.1   
1                              NaN    20060901      110000        8.2   
2                              NaN    20060901      110000        3.1   
3                              NaN    20060901      110000        4.1   
4                              NaN    20060901      110000       13.4   
5                              Na

  3%|▎         | 104/3568 [00:07<03:56, 14.63it/s]


KeyboardInterrupt: 

In [7]:
for station in all_data_stations.keys():       
    #print(all_data_stations[station])
    print('COMBINING STATION ::: ' , station )
    df_combi = pd.concat(all_data_stations[station])
    df_combi = df_combi.sort_values(by=['typicalDate', 'typicalTime', 'pressure'])
    df_combi = df_combi.reset_index() 
    #out_name = out + '/' + station + '_amma' + '.csv'
    
    #df_combi.to_csv(out_name, sep = '\t') 
    print('DONE STATION ::: ' , station )

COMBINING STATION :::  64400
DONE STATION :::  64400


In [8]:
df_combi.to_csv('PROVA_64400', sep = '\t') 


In [9]:
df_combi

Unnamed: 0,level_0,index,statid,latitude,longitude,pressure,airTemperature,nonCoordinateGeopotentialHeight,typicalDate,typicalTime,windSpeed,windDirection,dewpointTemperature,heightOfStation
0,12,12,64400,-4.82,11.9,2000.0,0.0,,20060424,100000,13.9,260,,17
1,11,11,64400,-4.82,11.9,3000.0,0.0,,20060424,100000,2.6,245,,17
2,10,10,64400,-4.82,11.9,5000.0,0.0,,20060424,100000,21.6,95,,17
3,9,9,64400,-4.82,11.9,7000.0,3.5e-323,,20060424,100000,8.2,70,,17
4,8,8,64400,-4.82,11.9,10000.0,0.0,,20060424,100000,5.7,185,,17
5,7,7,64400,-4.82,11.9,15000.0,1.0,,20060424,100000,8.2,270,,17
6,6,6,64400,-4.82,11.9,20000.0,0.0,,20060424,100000,7.2,5,,17
7,5,5,64400,-4.82,11.9,25000.0,2.021579e-315,,20060424,100000,6.2,20,,17
8,4,4,64400,-4.82,11.9,30000.0,2.021579e-315,,20060424,100000,2.6,35,,17
9,3,3,64400,-4.82,11.9,40000.0,6.365987e-314,,20060424,100000,2.1,85,,17


In [10]:
df_combi.to_csv('PROVA_64400_2', sep = '\t') 
