# SWOT Data Preparation


## Datasource
- http://redwing.pmel.noaa.gov:8080/erddap


## Steps:

### Data Steps
1) Ingest Prawler Data - convert units if necessary (this is true for conductivity)
2) Ingest Met Data (not currently used)
3) Ingest GPS Data

### Variable Meta Data
1) standard names, units
2) serial numbers, models

### Project Meta Data
1) Project
2) PI
3) Contacts
4) Serials

### Reference Values
1) Clock Date is 20000101T000000Z
2) all values are float64 or char

### Synthesized Data Hosting (L1)

Data Format was suggested by JPL - it's not quite CF but close... rehost on local system as a test

**TODO:**

Wrap successful code in modularized class for convenience and commenting
- FileName Structure: SWOTPOSTLAUNCH_L1_MOORING-S1_CTD-PROFILER_START20190905_END20190905_VER001.nc

Next Steps:  
1 - What frequency of file generation / update do we want  (3-6hr)  
2 - what time base will be within each file  (daily with rewrites is fine)  
3 - how to upload to JPL server  (host on pmel ftp site)  
4 - error checking and comparison against RUDICS / SUMMARY FILE? -> visual confirmation file  


In [2]:
# use yaml to setup the instrument parameters

import yaml

# Load the three YAML attribute tables. Each file is opened in a `with` block so
# the handle is closed as soon as it has been parsed.
with open('SWOTprawler.yaml', 'r') as stream:    # specifies the SWOT prawler/gps netcdf variable attributes
    SWOTprawler = yaml.safe_load(stream)

with open('SWOTmet.yaml', 'r') as stream:    # specifies the SWOT met netcdf variable attributes, separated because it isn't defined by NASA and isn't used
    SWOTmet = yaml.safe_load(stream)

with open('SWOTGlobalAttrs.yaml', 'r') as stream:    # specifies the SWOT global netcdf variable attributes
    SWOTGlobal = yaml.safe_load(stream)

# Deployment-specific information, keyed by SWOT unit id. Only N001 is active;
# the commented entries are placeholders for the remaining units.
SWOTSerialID = {'N001':{'Name':'SWOT N001','DepLatN':47.6062,'DepLonE':-122.3321,'InstMake':'Sea-Bird Scientific','InstModel':'GPCTD','InstSerial':'0000'},
               #  'N201':{'Name':'SWOT N201','DepLat':None,'DepLon':None},
               # 'N202':{'Name':'SWOT N202','DepLat':None,'DepLon':None},
               # 'N203':{'Name':'SWOT N203','DepLat':None,'DepLon':None},
               # 'N204':{'Name':'SWOT N204','DepLat':None,'DepLon':None},
               # 'N205':{'Name':'SWOT N205','DepLat':None,'DepLon':None},
               # 'N206':{'Name':'SWOT N206','DepLat':None,'DepLon':None},
               # 'N207':{'Name':'SWOT N207','DepLat':None,'DepLon':None},
               } #deployment specific information

## Connect to EDD ERDDAP and retrieve all SWOT related datasets

There are erddaps for GPS and Prawler to draw from (the engineering data won't initially be retrieved)
- 7 deployments
- Only daily data will be uploaded to JPL
- there are time conversion issues that prevent the >last_n_days feature

- load into pandas
- adjust cond units
- combine gps time
- calculate float time
- build xarray file (not CF compliant - follow reverse engr template)
- input metadata / translate to metadata format

In [3]:
from erddapy import ERDDAP
import xarray as xa
import pandas as pd
import numpy as np
import datetime
from urllib.error import HTTPError

def get_data(server_url='http://redwing.pmel.noaa.gov:8080/erddap', SWOTUnitID=None, DataType=None, DateRange=('2020-01-01','2021-01-01')):
    """Search an ERDDAP server for one dataset and return its rows as a DataFrame.

    The server is searched for ``"<SWOTUnitID> <DataType>"``; the first dataset
    listed in the search result is then downloaded through tabledap, limited to
    the requested time window.

    Args:
        server_url: Base URL of the ERDDAP server.
        SWOTUnitID: Unit identifier used in the search string (e.g. 'N001').
        DataType: GPS, PRAWC, BARO
        DateRange: Two-element sequence ``(start, end)``, given as
            [yyyy-mm-ddTHH:MM:SSZ,yyyy-mm-ddTHH:MM:SSZ] or [yyyy-mm-dd,yyyy-mm-dd].
            Only the first two elements are read.

    Returns:
        DataFrame indexed by the 'time (UTC)' column, or None when the search
        request raises HTTPError or returns an empty table. Errors raised by
        the data download itself are not caught here.
    """

    e = ERDDAP(server=server_url,
        protocol='tabledap',
        response='csv')

    try:
        df = pd.read_csv(e.get_search_url(response='csv', search_for=f'{SWOTUnitID} {DataType}'))
    except HTTPError:
        print('No search results')
        return

    # An empty result table has no first row to take the dataset id from.
    if df.empty:
        print('No search results')
        return

    # Only the first hit of the search is used.
    e.dataset_id = df['Dataset ID'].iloc[0]
    e.constraints = {
        'time>=': DateRange[0],
        'time<=': DateRange[1],
    }

    pdf = e.to_pandas(
                index_col='time (UTC)',
                parse_dates=True,
                skiprows=(1,)  # units information can be dropped.
            )

    return pdf

class make_base_netcdf():
    """
    Designed for:
        - Ingesting Prawler and GPS data for SWOT Mission

    Holds one prawler DataFrame and one GPS DataFrame and offers helpers to
    convert the prawler data to an xarray Dataset, attach a deployment
    position, derive decimal-degree GPS positions and write a netCDF file.
    """

    def __init__(self, SWOTID='N001', praw_data=None, gps_data=None):
        """Store the input frames after checking that both contain data.

        Args:
            SWOTID (str, optional): SWOT unit identifier. Stored on the
                instance; no method of this class reads it. Defaults to 'N001'.
            praw_data (DataFrame): Pandas DataFrame of prawler measurements.
            gps_data (DataFrame): Pandas DataFrame of GPS fixes.

        Raises:
            ValueError: If either frame is None or empty.
        """
        # `assert ~df.empty` can never fail (~True == -2, ~False == -1, both
        # truthy), so validate explicitly instead.
        if praw_data is None or praw_data.empty:
            raise ValueError("Must have a valid dataframe of prawler data")
        if gps_data is None or gps_data.empty:
            raise ValueError("Must have a valid dataframe of gps data")
        self.SWOTID = SWOTID
        self.praw_data = praw_data
        self.gps_data = gps_data

    @staticmethod
    def _ddmm_to_decimal(values):
        """Convert ddmm.mmmm-style values to unsigned decimal degrees."""
        degrees = np.floor(values / 100)
        minutes = values - degrees * 100
        return degrees + minutes / 60

    def to_xarray(self):
        """Convert the prawler frame to an xarray Dataset.

        Renames the Sea-Bird columns to TEMP/PRES/CNDC, names the index TIME,
        converts to xarray and drops variables other than PRES/TEMP/CNDC as
        well as 'Epoch_Time'. ``self.praw_data`` is replaced by the Dataset.

        Returns:
            tuple: ``(self.praw_data, self.gps_data)`` - the new Dataset and
            the (unchanged) GPS DataFrame.
        """
        self.praw_data = self.praw_data.rename(
            columns={
                        'SB_Temp':'TEMP',
                        'SB_Depth':'PRES',
                        'SB_Conductivity':'CNDC',}
        )
        self.praw_data.index.name = 'TIME'

        # keep a copy of the index values as a regular column
        self.praw_data['TIME'] = self.praw_data.index.values
        #drop all others
        afg = self.praw_data.to_xarray()
        afg = afg.drop_indexes('TIME')
        # NOTE(review): afg.var() is a reduction; the loop iterates over the
        # variable names of its result - presumably a way to list the numeric
        # data variables. Confirm before replacing it.
        for x in afg.var():
            if x not in ['PRES','TEMP','CNDC']:
                afg = afg.drop(x)
        afg = afg.drop('Epoch_Time')

        self.praw_data = afg

        return(self.praw_data,self.gps_data)

    def add_deploy_location(self, dep_lat_n = 34.156113, dep_lon_e = -118.131943):
        """Attach a deployment position to the prawler Dataset.

        Adds LATITUDE/LONGITUDE coordinates with the given values, expands
        them to length-1 dimensions and then indexes PRES, TEMP and CNDC at
        position 0 of those two dimensions.

        Args:
            dep_lat_n (float, optional): Deployment latitude, degrees north.
            dep_lon_e (float, optional): Deployment longitude, degrees east.
        """
        # Use the arguments rather than notebook-level globals so the method
        # works outside the processing loop and honours what the caller passes.
        self.praw_data = self.praw_data.assign_coords({'LATITUDE': dep_lat_n, 'LONGITUDE': dep_lon_e}) \
         .expand_dims({'LATITUDE': 1, 'LONGITUDE':1})
        self.praw_data['PRES']=self.praw_data.PRES.isel({'LATITUDE':0,'LONGITUDE':0})
        self.praw_data['TEMP']=self.praw_data.TEMP.isel({'LATITUDE':0,'LONGITUDE':0})
        self.praw_data['CNDC']=self.praw_data.CNDC.isel({'LATITUDE':0,'LONGITUDE':0})

    def add_gps_data(self):
        """Add signed decimal-degree columns LatDD/LonDD to the GPS frame.

        The 'LatD'/'LonD' hemisphere letters are replaced in place by +1
        ('N'/'E') or -1 (anything else). 'Latitude'/'Longitude' are split into
        whole degrees (value // 100) and minutes (remainder), and combined as
        sign * (degrees + minutes / 60).

        NOTE(review): assumes the feed reports positions as ddmm.mmmm - confirm
        against the GPS data source.

        Returns:
            DataFrame: ``self.gps_data`` (modified in place).
        """
        self.gps_data['LatD'] = self.gps_data['LatD'].apply(lambda x: 1 if x == 'N' else -1)
        self.gps_data['LonD'] = self.gps_data['LonD'].apply(lambda x: 1 if x == 'E' else -1)
        # The sign applies to the whole value for both axes, and only the
        # whole-degree part is added to the minutes term.
        self.gps_data['LatDD'] = self.gps_data['LatD'] * self._ddmm_to_decimal(self.gps_data['Latitude'])
        self.gps_data['LonDD'] = self.gps_data['LonD'] * self._ddmm_to_decimal(self.gps_data['Longitude'])

        return self.gps_data

    def add_variable_meta(self):
        """Placeholder - not implemented."""
        pass

    def inst_meta(self):
        """Placeholder - not implemented."""
        pass

    def global_attr(self):
        """Placeholder - not implemented."""
        pass

    def history(self):
        """Placeholder - not implemented."""
        pass

    def xarray2netcdf_save(self, xdf, filename='temp.nc', **kwargs):
        """Save xarray to netcdf

        Args:
            xdf (xarray dataset): xarray dataset to write. When None, the
                instance's ``praw_data`` is written instead.
            filename (str, optional): Filename. Defaults to 'temp.nc'.
            **kwargs: Only ``format`` is read; it is forwarded to
                ``to_netcdf`` and is None when not supplied.
        """
        dataset = self.praw_data if xdf is None else xdf
        # TIME is encoded relative to the project reference clock.
        dataset.to_netcdf(filename,format=kwargs.get('format'),encoding={'TIME':{'units':'days since 2000-01-01'}})

In [14]:
# datarange = 

# For every configured unit: download prawler (PRAWC) and GPS data, build the
# prawler Dataset, derive GPS positions, merge both, attach metadata and write
# the result to a netCDF file.
for SWOTUnit in SWOTSerialID:
    print(SWOTUnit)
    dfg = {}
    datastatus = 0 #0: no data processed, 1: prawler data processed, 2: prawler and gps data processed
    
    for datatype in ['PRAWC','GPS']:
        pdf = get_data(SWOTUnitID=SWOTUnit, DataType=datatype, DateRange=['2020-1-01','2020-02-01'])
        if pdf is None:
            # get_data returns None when the search fails; both data types are
            # needed for the merged file, so give up on this unit.
            print(f'No {datatype} data for {SWOTUnit} - skipping unit')
            break
        dfg.update({SWOTUnit+datatype:pdf})
        

        # strip the unit suffix, e.g. 'time (UTC)' -> 'time'
        dfg[SWOTUnit+datatype].index.name = dfg[SWOTUnit+datatype].index.name.split(' ')[0]
        dfg[SWOTUnit+datatype].columns = [x.split(' ')[0] for x in dfg[SWOTUnit+datatype].columns]
        
        # dfg[SWOTUnit+datatype] = dfg[SWOTUnit+datatype].loc['2020':'2020'] #<--- hard coded sanity check for SWOT data in period of comparison
        
        if datatype == 'PRAWC':
            dfg[SWOTUnit+datatype] = dfg[SWOTUnit+datatype].rename(
                columns={
                            'SB_Temp':'TEMP',
                            'SB_Depth':'PRES',
                            'SB_Conductivity':'CNDC',}
            )
            dfg[SWOTUnit+datatype].index.name = 'TIME'
            
            dfg[SWOTUnit+datatype]['TIME'] = dfg[SWOTUnit+datatype].index.values
            dfg[SWOTUnit+datatype].sort_index(inplace=True)
            #drop all others
            afg = dfg[SWOTUnit+datatype].to_xarray()
            afg = afg.drop_indexes('TIME')
            
            # NOTE(review): afg.var() is a reduction; the loop iterates over the
            # variable names of its result - presumably a way to list the
            # numeric data variables. Confirm before replacing it.
            for x in afg.var():
                if x not in ['PRES','TEMP','CNDC']:
                    afg = afg.drop(x)
            afg = afg.drop('Epoch_Time')
        
            #add Latitude/Longitude from Deployment
            afg = afg.assign_coords({'LATITUDE': SWOTSerialID[SWOTUnit]['DepLatN'], 'LONGITUDE': SWOTSerialID[SWOTUnit]['DepLonE']}) \
                     .expand_dims({'LATITUDE': 1, 'LONGITUDE':1})
            # index the data variables at position 0 of the two new dimensions
            afg['PRES']=afg.PRES.isel({'LATITUDE':0,'LONGITUDE':0})
            afg['TEMP']=afg.TEMP.isel({'LATITUDE':0,'LONGITUDE':0})
            afg['CNDC']=afg.CNDC.isel({'LATITUDE':0,'LONGITUDE':0})

            #add ancillary info
            afg = afg.assign_coords({'INSTR_INFO':np.array('',dtype='S1')})
            afg = afg.assign_coords({'INSTR_MAKE': np.array(SWOTSerialID[SWOTUnit]['InstMake'],dtype='S19')})
            afg = afg.assign_coords({'INSTR_MODEL': np.array(SWOTSerialID[SWOTUnit]['InstModel'],dtype='S5')})
            # NOTE(review): the serial is stored as an integer, so a string
            # such as '0000' loses its leading zeros - confirm this is intended.
            afg = afg.assign_coords({'INSTR_SN': np.array(SWOTSerialID[SWOTUnit]['InstSerial'],dtype='int')})

            # odd cleanup for formatting
            afg = afg.drop_duplicates(dim='TIME')
            afg['CNDC'] = afg.CNDC  * 10 #CC unit conversion
        
            datastatus += 1
        #add gps info
        if datatype == 'GPS':
            core = make_base_netcdf(praw_data=dfg[SWOTUnit+'PRAWC'], gps_data=dfg[SWOTUnit+'GPS'])

            # g is the same DataFrame object that add_gps_data() extends below,
            # so the LatDD/LonDD columns are available on g afterwards.
            t,g = core.to_xarray()
            gps = core.add_gps_data()

            gps_df = g[['LatDD','LonDD']]

            gps_df.index.name = 'TIME_SURFACE_BUOY'
            gps_df = gps_df.rename(
                            columns={
                                        'LatDD':'LATITUDE_SURFACE_BUOY',
                                        'LonDD':'LONGITUDE_SURFACE_BUOY',}
                            )

            gps_df['TIME_SURFACE_BUOY'] = gps_df.index.values
            gps_df.sort_index(inplace=True)

            gps_xdf = gps_df.to_xarray()
            # odd cleanup for formatting
            gps_xdf = gps_xdf.drop_duplicates(dim='TIME_SURFACE_BUOY')

            datastatus += 1

        # both data types processed -> merge, decorate and write
        if datastatus == 2:    
            afg = afg.merge(gps_xdf)

            #add var meta information
            for var in afg.var():
                afg[var].attrs = SWOTprawler[var]
                afg[var].encoding['_FillValue'] = None
            for var in afg.coords:
                afg[var].attrs = SWOTprawler[var]
                afg[var].encoding['_FillValue'] = None

            #add global meta information
            afg.attrs.update(SWOTGlobal['erddap_globalattributes'])

            #add dynamic meta information
            afg.attrs.update({'date_created':datetime.datetime.utcnow().strftime("%Y-%m-%d")})
            afg.attrs.update({'geospatial_lat_min':afg.LATITUDE_SURFACE_BUOY.min().values})
            afg.attrs.update({'geospatial_lat_max':afg.LATITUDE_SURFACE_BUOY.max().values})
            afg.attrs.update({'geospatial_lon_min':afg.LONGITUDE_SURFACE_BUOY.min().values})
            afg.attrs.update({'geospatial_lon_max':afg.LONGITUDE_SURFACE_BUOY.max().values})
            afg.attrs.update({'geospatial_vertical_min':afg.PRES.min().values})
            afg.attrs.update({'geospatial_vertical_max':afg.PRES.max().values})
            # NOTE(review): this format puts a 'Z' between date and time
            # (e.g. 20200101Z000000Z) while the reference clock elsewhere is
            # written 20000101T000000Z - confirm whether 'T' was intended.
            afg.attrs.update({'time_coverage_start':afg.TIME.min().values.astype('datetime64[s]').item().strftime("%Y%m%dZ%H%M%SZ")})
            afg.attrs.update({'time_coverage_end':afg.TIME.max().values.astype('datetime64[s]').item().strftime("%Y%m%dZ%H%M%SZ")})

            # appends to the 'history' attribute, which must already be present
            # after the global-attribute update above
            afg.attrs.update({'history':afg.attrs['history'] + datetime.datetime.utcnow().strftime("%Y-%m-%d") +': Initial File Creation - RUDICS/FLOT to NETCDF Conversion TEST for SWOT Validation'}) 
            
            # turn the instrument-info coordinates into regular data variables
            afg = afg.reset_coords(names=["INSTR_INFO","INSTR_MAKE","INSTR_MODEL","INSTR_SN"])
            # NOTE(review): the output name is fixed, so each unit that reaches
            # this point overwrites the previous unit's file.
            afg.to_netcdf('PMEL_SWOTtest_dataset.nc',encoding={'TIME':{'units':'days since 20000101T000000Z','_FillValue':None},
                                                               'TIME_SURFACE_BUOY':{'units':'days since 20000101T000000Z','_FillValue':None}})
            

N001
