## ISCCP CTD for Third Pole region

TODO:
- define functions for data handling:
* read in nc files and store them in a group-wise dataframe
* extract only locations above 3000 m
* search for CS families by date or location
* create a climatology based on multiple monthly files that have been read in
* plot tracks


In [1]:
import pandas as pd
import xarray as xr
import numpy as np 
import os

In [2]:
## map every month of 1995-2007 ('YYYY_MM') to the text output file of the Fortran program
dir='/media/juli/Elements/ISCCP CTD/TP_DATABASE/'

## one key per month, the month number zero-padded to two digits
keys=[str(year)+'_'+str(month).zfill(2)
      for year in np.arange(1995,2008,1)
      for month in np.arange(1,13,1)]

## full path of the monthly text file that belongs to each key
values=[dir+'CS_TP_'+key+'.txt' for key in keys]

## populate dictionary with keys and values
txt_files=dict(zip(keys, values))

In [3]:
# column names of the tracked CS families, in the order the fields appear in a data line
props = [
    'SAT', 'FAM', 'MAX_SIZE', 'MIN_TEMP',
    'PIXL_<200', 'LIFE_HOURS', 'ELAPS_HOURS', 'YEAR',
    'MONTH', 'DAY', 'GMT', 'CS_RADIUS_KM',
    'CS_CENTER_LAT', 'CS_CENTER_LON', 'INCL_NORTH', 'CS_ECCEN',
    'SQUARE_CORR', 'CON_FRAC', 'NR_CCinCS', 'CC_MAX_RADIUS',
    'CC_MEAN_RADIUS', 'CC_CENTER_LAT', 'CC_CENTER_LON', 'TEMP_GRAD',
    'CS_MEAN_TEMP', 'CS_MIN_TEMP', 'CC_MEAN_TEMP', 'STD_CS_TEMP',
    'CC_WIND_DIRECT', 'CC_WIND_SPEED', 'CS_WIND_DIRECT', 'CS_WIND_SPEED',
    'NON_OVRLAP', 'OVRLAP_%', 'FLAG', 'DIV_FLAG',
    'ISCCP_INTERN', 'LW_FLAG', 'MIN_TAU', 'MAX_TAU',
    'MIN_LAT', 'MAX_LAT', 'MIN_LON', 'MAX_LON',
]

In [28]:
#
## This function reads in .txt files produced by the ISCCP CTD Fortran code
#
# returns data in pandas dataframe object 

def get_data(file, props):
    """Read one text file of the ISCCP CTD Fortran code into a dataframe.

    Only lines inside a data section are considered. A section starts at a
    line containing the key word 'DATABASE' and ends after the next line
    containing '==='. Inside a section a line becomes a row when it splits
    into exactly ``len(props)`` whitespace separated fields and its second
    field consists of digits only. Fields consisting of digits only are
    stored as int, every other field is kept as the string read from the
    file.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    props : list of str
        Column names, one per field of a data line.

    Returns
    -------
    pandas.DataFrame
        One row per accepted data line, columns named after ``props`` and
        rows labelled 1..n. The frame is empty (but still carries the
        columns) when the file contains no accepted data line.
    """
    key = 'DATABASE'        # key word after which lines should be extracted
    nr_cols = len(props)    # a data line must provide one field per column
    rows = []               # accepted data lines, already split into fields
    act = False             # True while inside a data section

    with open(file) as cc:
        for line in cc:
            if key in line:
                act = True
            if act:
                cols = line.split()
                if len(cols) == nr_cols and cols[1].isdigit():
                    rows.append([int(c) if c.isdigit() else c for c in cols])
            # checked last, so the closing line itself is still inspected
            if '===' in line:
                act = False

    # build the frame once: DataFrame.append no longer exists in pandas 2
    # and copied the whole frame for every single row
    cc_data = pd.DataFrame(rows, columns=props)
    cc_data.index = np.arange(1, len(rows) + 1)  # row labels start at 1
    return cc_data
    
    

In [25]:
#
# This function redefines the data types used for the different variables in nc file 
# 
# returns dataframe object with numeric values 

def define_dtypes(cc_data):
    """Convert the columns of a CS dataframe to numeric data types.

    Every column except 'SAT' is converted with ``pd.to_numeric`` (integers
    are downcast to the smallest fitting integer type) and the fill value
    -99 is replaced by NaN. The columns FAM, YEAR, MONTH, DAY, FLAG,
    LW_FLAG and DIV_FLAG are cast to int afterwards, 'SAT' is cast to str.

    The columns are taken from the dataframe itself, so the last column is
    converted as well and no global column list is needed.

    Parameters
    ----------
    cc_data : pandas.DataFrame
        Dataframe that contains at least the columns SAT, FAM, YEAR, MONTH,
        DAY, FLAG, LW_FLAG and DIV_FLAG. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object with numeric values.

    Raises
    ------
    ValueError
        If a column other than 'SAT' holds a value that is not numeric, or
        if one of the integer columns contains NaN after the replacement.
    """
    for col in cc_data.columns:
        if col == 'SAT':
            continue
        numeric = pd.to_numeric(cc_data[col], downcast='integer')
        # mask() may change the dtype, so integer columns that contain the
        # fill value become float instead of being assigned NaN in place
        cc_data[col] = numeric.mask(numeric == -99.0)

    # change float to integer for some cols
    for col in ('FAM', 'YEAR', 'MONTH', 'DAY', 'FLAG', 'LW_FLAG', 'DIV_FLAG'):
        cc_data[col] = cc_data[col].astype(int)
    cc_data['SAT'] = cc_data['SAT'].astype(str)

    return cc_data
       


In [17]:
# report for every monthly text file whether it is present on disk
for month, file in txt_files.items():
    file_found = os.path.exists(file)
    print(file, file_found)

/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_01.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_02.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_03.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_04.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_05.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_06.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_07.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_08.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_09.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_10.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_11.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1995_12.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1996_01.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1996_02.txt True
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_1996_03.txt True
/media/jul

In [31]:
##### MAIN PROGRAM ############################
# Converts every monthly text file into one netCDF file, skipping months
# whose text file is missing or empty and months that were converted before.

path = '/media/juli/Elements/ISCCP CTD/TP_DATABASE/ncfiles/'

for month, file in txt_files.items():
    print('processing for...' + month)
    output_path = path + 'ISCCP_CTD_TP_' +  month +  '.nc'

    # nothing to convert without a text file
    if not os.path.exists(file):
        print(file, 'does not exist in the database')
        continue

    # an existing text file may still have no content
    if os.stat(file).st_size == 0:
        print(file, ' is empty! ')
        continue

    # never overwrite a netCDF file written by an earlier run
    if os.path.exists(output_path):
        print('NetCDF file already created.')
        continue

    cc_data = get_data(file, props)
    cc_data = define_dtypes(cc_data)
    data_as_xr = cc_data.to_xarray()
    data_as_xr.to_netcdf(output_path, mode = 'w', format='NETCDF4', unlimited_dims=['FAM'])
    print('nc file created.')

processing for...1995_01
NetCDF file already created.
processing for...1995_02
NetCDF file already created.
processing for...1995_03
NetCDF file already created.
processing for...1995_04
NetCDF file already created.
processing for...1995_05
NetCDF file already created.
processing for...1995_06
NetCDF file already created.
processing for...1995_07
NetCDF file already created.
processing for...1995_08
NetCDF file already created.
processing for...1995_09
NetCDF file already created.
processing for...1995_10
NetCDF file already created.
processing for...1995_11
NetCDF file already created.
processing for...1995_12
NetCDF file already created.
processing for...1996_01
NetCDF file already created.
processing for...1996_02
NetCDF file already created.
processing for...1996_03
NetCDF file already created.
processing for...1996_04
NetCDF file already created.
processing for...1996_05
NetCDF file already created.
processing for...1996_06
NetCDF file already created.
processing for...1996_07
Net

nc file created.
processing for...2007_10
/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_2007_10.txt  is empty! 
processing for...2007_11
nc file created.
processing for...2007_12
nc file created.


In [11]:
# quick manual check: prints 'yes' when this path exists on disk
if os.path.exists('/media/juli/Elements/ISCCP CTD/TP_DATABASE/CS_TP_2003_'):
    print('yes')

------------------------------------------------------------------------------------------------------------------------

Tested creating NetCDF groups, but this is very inefficient --> longer creation time and much larger file sizes!

In [318]:
#
# divide dataframe into groups based on CS families & write nc files for each group 
#


# group object with one group per CS family, which can be used to apply functions;
# rows are grouped by default, the deprecated `axis` keyword is not needed
fams = cc_data.groupby('FAM')
path = '/media/juli/Elements/ISCCP CTD/TP_DATABASE/'
nc_groups = [] # list which contains the paths of all nc files written for the groups

for index, row in fams:
    # index is the FAM value shared by all rows of the group
    # row is a new dataframe for each group
    xarr = row.to_xarray()
    file = path + str(index) + '.nc'   # one nc file per family, named after its FAM value
    xarr.to_netcdf(file, format='NETCDF4', unlimited_dims=['FAM'])
    nc_groups.append(file)


In [326]:
# Append groups to nc file: one write in append mode per entry of nc_groups
# NOTE(review): `output` is not defined in the visible part of the notebook -- confirm where it is set.
# NOTE(review): every pass writes the complete `data_as_xr` dataset and only the group name
# (the path string taken from nc_groups) changes -- confirm this is intended rather than
# writing the data of the single family.
i = 0 
for file in nc_groups:
    if i%10==0:   # print the counter for every 10th group to track progress
        print(i)
    data_as_xr.to_netcdf(output, format='NETCDF4', mode= 'a', group = file) 
    i += 1 
    
    

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270


In [350]:
## xarray attributes 
#data_as_xr.dims
#data_as_xr.data_vars
#data_as_xr.coords

# conditional statement on column value 
#np.shape(np.where(cc_data['PIXL_<200']>0))[1]


# check datatypes: 
#cc_data.dtypes