# Imports

In [1]:
import pandas as pd
import json
import validate
from general import many_to_many, many_cols
import ingest
import clean

# Pull Data into Pandas

In [2]:
data = ingest.main()

In [3]:
excel_data = data['excel_data']
db = data['database']

# Cleaning

In [4]:
db = clean.remove_NaN_columns(db)

In [5]:
db = clean.strip_all_columns(db)

['iopse_id', 'event_type', 'foriegn_campaign_short_name', 'foriegn_deployment_short_name', 'description', 'region_description', 'published_list', 'reports', 'reference_file']


In [6]:
db['gcmd_phenomena']['code'] = db['gcmd_phenomena']['code'].apply(lambda x: str(x))

# Short Name Supplementation

In [7]:
# make unique shortname by combining the campaign name and the table sub short name

db['deployment']['short_name'] = db['deployment']['foriegn_campaign_short_name']+'_'+db['deployment']['deployment_id']

# this is not a true short_name
db['iopse']['iop_short_name'] = db['iopse']['foriegn_campaign_short_name']+'_'+db['iopse']['iopse_id']

db['flight'] = many_to_many(db, 'linking', 'table_instrument', keep_all=True)
db['flight']['foriegn_deployment_short_name'] = db['flight']['foriegn_campaign_short_name']+'_'+db['flight']['foriegn_deployment']

# Find Matching Deployments for IOPSE

In [8]:
db['iopse']['deployment_short_name'] = 'No Matches'

for row in range(len(db['iopse'])):
    iop_start = db['iopse'].iloc[row]['start_date']
    iop_end = db['iopse'].iloc[row]['end_date']
    iop_camp = db['iopse'].iloc[row]['foriegn_campaign_short_name']
    
    campaign_filter = db['deployment']['foriegn_campaign_short_name'].apply(lambda short_name: short_name == iop_camp)
    possible_campaigns = db['deployment'][campaign_filter]
    
    start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
    end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
    matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
    if len(matching_deployments)>1:
        print(f"error on {db['iopse'].iloc[row]}")
    elif len(matching_deployments)==1:
        matching_deployments = matching_deployments[0]
    else:
        matching_deployments = 'None Found'
    db['iopse']['deployment_short_name'].iloc[row]=matching_deployments



# Many to Many Creation

In [9]:
main_table_names = ['campaign', 'platform', 'instrument', 'deployment', 'iopse']

In [10]:
for table in main_table_names:
    print(table)
    for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
        new_table_name = f"{table}_to_{column.replace('table_','')}"
        db[new_table_name]=many_to_many(db, table, column)
        print(f'   {new_table_name} created')

campaign
   campaign_to_focus_area created
   campaign_to_season created
   campaign_to_platform_type created
   campaign_to_gcmd_phenomenas_uuid created
   campaign_to_repository created
   campaign_to_partner_org created
   campaign_to_gcmd_project created
platform
   platform_to_gcmd_platform_uuid created
instrument
   instrument_to_gcmd_instrument created
   instrument_to_instrument_type created
   instrument_to_measurement_keywords_uuid created
   instrument_to_geophysical_concept created
   instrument_to_repository created
   instrument_to_measurement_region created
deployment
   deployment_to_geographical_region created
   deployment_to_platform created
iopse


# Validation

### Short Name Duplicates

In [11]:
for table_name in db.keys():
    if 'short_name' in db[table_name].keys():
        print(table_name)
        duplicates = validate.find_duplicates(db, table_name, 'short_name')

        print(f'    {duplicates}')

platform_type
    []
aircraft_type
    []
home_base
    []
repository
    []
focus_area
    []
season
    []
instrument_type
    ['insitulaboratorychemicalmetersanalyzers']
measurement_region
    []
geographical_region
    []
geophysical_concept
    ['informationnotavailable', 'ocean']
campaign
    ['informationnotavailable']
platform
    []
instrument
    ['atlas']
deployment
    []
gcmd_instrument
    ['atlas', 'informationnotavailable', 'wcr', nan, 'epic', 'opc', 'gnssreceiver', 'cris', 'aa', 'iris', 'particlespectrometers', 'ssies', 'icecube', 'aps']
gcmd_platform
    ['kingair', 'informationnotavailable', 'goes10', 'goes11', 'goes12', 'goes13', 'goes14', 'goes15', 'goes16', 'goes1', 'goes2', 'goes3', 'goes4', 'goes5', 'goes6', 'goes7', 'goes8', 'goes9', 'environmentalmodeling']
gcmd_project
    ['afsisclimate', 'camp', 'informationnotavailable', 'iodp', 'landsat7', 'mcmurdopredatorprey', 'notapplicable']
partner_org
    []


### Foriegn Key Links

### Campaign

In [12]:
pd.set_option('display.max_rows', 500)

In [13]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_gcmd_project', 
                                data_index='campaign', 
                                data_column='gcmd_project', 
                                foriegn_table='gcmd_project', 
                                foriegn_column='uuid')
errors

Unnamed: 0,campaign,gcmd_project,suggestions
2,ACEPOL,e0a48b3c-ab3d-4331-b992-367352d5c09c,[]
16,C3VP,NID,[]
17,CalWater,NID,[]
77,ACTIVATE,TBD,[]
78,CAMP2Ex,TBD,[]
79,DCOTSS,TBD,[]
80,Delta-X,TBD,[]
83,IMPACTS,TBD,[]
84,S-MODE,TBD,[]
90,ARISE,NOT LISTED IN GCMD,[]


In [14]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_focus_area', 
                                data_index='campaign', 
                                data_column='focus_area', 
                                foriegn_table='focus_area', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,focus_area,suggestions
10,ACT-America,Climate Variability and Change,[Climate Variability & Change]
39,CLAMS,Earth Surface and Interior,[Earth Surface & Interior]
40,CLASIC07,Water & Energy Cycle,[Global Water & Energy Cycle]
82,ORACLES,Global Water and Energy Cycles,[Global Water & Energy Cycle]
102,TOGA COARE,Atmospheric Dynamics,[]
131,CASIE,Airborne Science,[]
133,CLPX II,Water & Energy Cycle,[Global Water & Energy Cycle]
134,CLPX III,Water & Energy Cycle,[Global Water & Energy Cycle]
137,DEVOTE,Airborne Science,[]
140,High Winds,Water & Energy Cycle,[Global Water & Energy Cycle]


In [15]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_season', 
                                data_index='campaign', 
                                data_column='season', 
                                foriegn_table='season', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,season,suggestions
106,Delta-X,Overshooting tops,[]
107,Delta-X,deep convection,[]
108,Delta-X,lower stratospheric air chemistry,[]
119,ARISE,boreal summer boreal fall,[]


In [16]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_platform_type', 
                                data_index='campaign', 
                                data_column='platform_type', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,platform_type,suggestions
1,ABoVE,Ground-based Platforms,"[Land-based Platforms, Water-based Platforms]"
10,ACT-America,Ground-based Platforms,"[Land-based Platforms, Water-based Platforms]"
26,ATom,Model Output,[]
31,BOREAS,Model Output,[]
32,BOREAS,Satellite,[Satellites]
33,BROMEX,Ground-based Platforms,"[Land-based Platforms, Water-based Platforms]"
47,CARVE,Model Output,[]
52,CLAMS,In-situ Ground-based Platforms,[]
55,CLASIC07,In-situ Ground-based Platforms,[]
63,CPEX,In-situ Ground-based Platforms,[]


In [17]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_gcmd_phenomenas_uuid', 
                                data_index='campaign', 
                                data_column='gcmd_phenomenas_uuid', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='code')
errors

Unnamed: 0,campaign,gcmd_phenomenas_uuid,suggestions


In [18]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_repository', 
                                data_index='campaign', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,repository,suggestions
16,BROMEX,NID,[]
24,CLPX,NSIDC,[]
67,SnowEx,NSIDC,[]
80,CAMP2Ex,TBD,[]
84,FIREx-AQ,TBD,[]
164,GTE - ABLE 2A/B,ADSC,[ASDC]


In [19]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_partner_org', 
                                data_index='campaign', 
                                data_column='partner_org', 
                                foriegn_table='partner_org', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,partner_org,suggestions
11,ARCTAS,California Air Resources Board,[]
12,ARCTAS,International Polar Year,[]
17,ATTREX,DFG,[]
39,CORAL,NID,[]
40,CPEX,NID,[]
48,FIFE,NAC (Canada),[]
49,FIFE,AAFC (Canada),[]
54,HS3,Environment Canada,[]
61,IPHEx,Duke University,[]
62,IPHEx,NOAA Hydrometeorological Testbed,[]


In [20]:
errors = validate.foriegn_keys(db, 
                                data_table='platform_to_gcmd_platform_uuid', 
                                data_index='platform', 
                                data_column='gcmd_platform_uuid', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='uuid')
errors

Unnamed: 0,platform,gcmd_platform_uuid,suggestions


In [21]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_measurement_keywords_uuid', 
                                data_index='instrument', 
                                data_column='measurement_keywords_uuid', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='code')
errors

Unnamed: 0,instrument,measurement_keywords_uuid,suggestions
86,CPL,0,"[1000, 2000, 3000, 4000]"


In [22]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_gcmd_instrument', 
                                data_index='instrument', 
                                data_column='gcmd_instrument', 
                                foriegn_table='gcmd_instrument', 
                                foriegn_column='uuid')
errors

Unnamed: 0,instrument,gcmd_instrument,suggestions
6,4-STAR,b0f93e6a-c766-4957-8762-5c7709487459,[]
44,AVIRIS,d67afd03-3b79-419c-9289-5dde713ab904\n57854209...,[]
68,CIP,92f99316-b581-4adb-9980-aeb6bed64eee,[]
82,CPL,6238fe2-9a87-4e32-b866-c4a637094b51,[6238f3e2-9a87-4e32-b866-c4a637094b51]
117,EXRAD,a212d36d-2a4e-473f-b16a-6e2104b9dd8f\nba3de3fc...,[]
152,HSRL-2,abdf08cd-03c5-4497-87a4-65493584e2c7,[]
274,W-Band Radar,a90e-4a70-9bcb-93d106c1583f,[dc5ee11d-a90e-4a70-9bcb-93d106c1583f]


In [23]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_instrument_type', 
                                data_index='instrument', 
                                data_column='instrument_type', 
                                foriegn_table='instrument_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,instrument_type,suggestions
1,2D-C/P,Chemical Meters/Analyzers,[]
41,AVAPS,Profilers/Sounders,[]
79,CPL,LIDAR,[]
136,HAMSR,Spectrometers/Radiometers,[In Situ/Laboratory - Spectrometers/Radiometers]
141,HIRAD,Spectrometers/Radiometers,[In Situ/Laboratory - Spectrometers/Radiometers]
231,S-HIS,interferometer/sounder,[]


In [24]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_geophysical_concept', 
                                data_index='instrument', 
                                data_column='geophysical_concept', 
                                foriegn_table='geophysical_concept', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,geophysical_concept,suggestions


In [25]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_repository', 
                                data_index='instrument', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,repository,suggestions


In [26]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_measurement_region', 
                                data_index='instrument', 
                                data_column='measurement_region', 
                                foriegn_table='measurement_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,measurement_region,suggestions
141,HIRAD,boundary layer? sea surface is what I would ca...,[]


In [27]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment_to_geographical_region', 
                                data_index='deployment', 
                                data_column='geographical_region', 
                                foriegn_table='geographical_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,deployment,geographical_region,suggestions
92,ABoVE_dep_2017,Alaska,[]
93,ABoVE_dep_2017,western Canada,[]
94,ABoVE_dep_2018,Alaska,[]
95,ABoVE_dep_2018,western Canada,[]
96,ABoVE_dep_2019,Alaska,[]
97,ABoVE_dep_2019,western Canada,[]
98,CLAMS_dep_2001,Chesapeake Bay,[]
99,CLAMS_dep_2001,Atlantic Ocean,[]


In [28]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment_to_platform', 
                                data_index='deployment', 
                                data_column='platform', 
                                foriegn_table='platform', 
                                foriegn_column='short_name')
print('\n\ndo I really need to validate this?')
errors



do I really need to validate this?


Unnamed: 0,deployment,platform,suggestions
0,HS3_dep_2012,global hawk AV-1,[]
1,HS3_dep_2012,AV-6,[]
2,HS3_dep_2013,global hawk AV-1,[]
3,HS3_dep_2013,AV-6,[]
4,HS3_dep_2014,global hawk AV-6,[]
8,OLYMPEX_dep_2016,UND Citation II,[Citation]
23,ATom_dep_2016,NASA DC-8-AFRC,[]
24,ATom_dep_2017a,NASA DC-8-AFRC,[]
25,ATom_dep_2017b,NASA DC-8-AFRC,[]
26,ATom_dep_2018,NASA DC-8-AFRC,[]


In [29]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment', 
                                data_index='short_name', 
                                data_column='foriegn_campaign_short_name', 
                                foriegn_table='campaign', 
                                foriegn_column='short_name')
print('\n\n TODO this better once you have the data')
errors



 TODO this better once you have the data


6,short_name,foriegn_campaign_short_name,suggestions


In [30]:
errors = validate.foriegn_keys(db, 
                                data_table='platform', 
                                data_index='short_name', 
                                data_column='foriegn_aircraft_type', 
                                foriegn_table='aircraft_type', 
                                foriegn_column='short_name')
errors

1,short_name,foriegn_aircraft_type,suggestions
7,ASO,Prop Plane,[]


In [31]:
errors = validate.foriegn_keys(db, 
                                data_table='platform_to_gcmd_platform_uuid', 
                                data_index='platform', 
                                data_column='gcmd_platform_uuid', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='uuid')
errors

Unnamed: 0,platform,gcmd_platform_uuid,suggestions


In [32]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='deployment_short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

2,iopse_id,deployment_short_name,suggestions
6,iop_1,None Found,[]
7,iop_1,None Found,[]
8,iop_1,None Found,[]
9,iop_1,None Found,[]
10,iop_1,None Found,[]
12,Information Not Available,None Found,[]
13,Information Not Available,None Found,[]
14,Information Not Available,None Found,[]
30,Information Not Available,None Found,[]
40,iop_5,None Found,[]


In [33]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='event_type', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

2,iopse_id,event_type,suggestions
3,iop_1,Y,[]
4,iop_2,Y,[]
5,iop_3,Y,[]
6,iop_1,N,[]
7,iop_1,Y,[]
8,iop_1,Y,[]
9,iop_1,Y,[]
10,iop_1,Y,[]
11,iop_1,Y,[]
15,iop_1,IOP,[]


### Flight

In [34]:
# flight table, instruments

errors = validate.foriegn_keys(db, data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='instrument', 
                      foriegn_table='instrument', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foriegn_campaign_short_name,instrument,suggestions
26,IPHEx,2D-C,[2D-C/P]
33,IPHEx,Nevzorov,[Nevzorov probe]
34,IPHEx,King hot wire,[King hot wire probe]
129,SEAC4RS,Dew Point,[]
130,SEAC4RS,LWC/TWC,[]
131,SEAC4RS,Rosemount temperature,[]
132,SEAC4RS,Rosemount icing rod,[]
133,SEAC4RS,AIMMS-20,[]
134,SEAC4RS,NMASS,[MAS]
249,BOREAS,LI6262,[LICOR 6262]


In [35]:
# Flight table, platforms

errors = validate.foriegn_keys(db, 
                      data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='platform', 
                      foriegn_table='platform', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foriegn_campaign_short_name,platform,suggestions
15,HS3,WB-57f,[WB-57]
16,HS3,WB-57f,[WB-57]
19,OLYMPEX,UND Citation II,[Citation]


In [36]:
errors = validate.foriegn_keys(db, 
                      data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='foriegn_deployment_short_name', 
                      foriegn_table='deployment', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foriegn_campaign_short_name,foriegn_deployment_short_name,suggestions


# Dates

In [37]:
for table_name in db.keys():
    # find tables that have date fields
    if len(col_names := [col for col in db[table_name].columns if 'date' in col])>0:
        print(f"{table_name}\n    {', '.join(col_names)}")

campaign
    start_date, end_date
instrument
    deployment_date, decommision_date
deployment
    start_date, end_date
iopse
    start_date, end_date


In [38]:
table_names = ['campaign', 'deployment', 'iopse'] 
for table_name in table_names:
    print(table_name)
    db[table_name]['valid_date']=False
    
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

campaign
    non date-time detected: 2017-04-26 00:00:00, ongoing
    non date-time detected: 3 week period in june, Information Not Available
    non date-time detected: 2015,  on-going
    non date-time detected: 2016-09-26 00:00:00, ongoing
    non date-time detected: 1905-06-18 00:00:00, Information Not Available
    non date-time detected: 1998, 1998
    non date-time detected: January ?, 2019, December ?, 2023
    non date-time detected: TBD, TBD
    non date-time detected: TBD, TBD
    non date-time detected: TBD, TBD
    non date-time detected: 1905-07-03 00:00:00, ongoing
    non date-time detected: 2007, 2008
    non date-time detected: 2007, 2008
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1983-07-01 00:00:00
    non date-time detected: Inform

In [39]:
table_names = ['instrument'] 
for table_name in table_names:
    
    db[table_name]['valid_date']=False
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

    non date-time detected: 1997, 1997-present
    non date-time detected: 2000, Information Not Available
    non date-time detected: 2001, Information Not Available
    non date-time detected: 2010, Information Not Available
    non date-time detected: 1998, Information Not Available


In [40]:
db['campaign'][['short_name','start_date','end_date','valid_date']][db['campaign']['valid_date'].apply(lambda x: not(x))]

1,short_name,start_date,end_date,valid_date
3,ABoVE,2017-04-26 00:00:00,ongoing,False
4,ABLE,Information Not Available,Information Not Available,False
10,AirMISR,Information Not Available,Information Not Available,False
11,AMSR,Information Not Available,Information Not Available,False
23,CITE,Information Not Available,Information Not Available,False
25,CLASIC07,3 week period in june,Information Not Available,False
32,FIRE,Information Not Available,Information Not Available,False
37,HYMEx,Information Not Available,Information Not Available,False
38,IceBridge,Information Not Available,Information Not Available,False
39,ICEPOP,Information Not Available,Information Not Available,False


# IOPSE

In [41]:
# FILTER 
# TODO: FIX GRIP
db['iopse']=db['iopse'][db['iopse']['iopse_id']!='Information Not Available'][db['iopse']['foriegn_campaign_short_name']!='GRIP'][db['iopse']['foriegn_campaign_short_name']!='ATTREX']


  db['iopse']=db['iopse'][db['iopse']['iopse_id']!='Information Not Available'][db['iopse']['foriegn_campaign_short_name']!='GRIP'][db['iopse']['foriegn_campaign_short_name']!='ATTREX']


In [42]:
# split into two tables

db['significant_event'] = db['iopse'][db['iopse']['event_type']=='Y'].copy()
db['iop'] = db['iopse'][db['iopse']['event_type']!='Y'].copy()

In [43]:
# validate that all iops are unique

db['iop']['iop_short_name'][db['iop']['iop_short_name'].duplicated()]

Series([], Name: iop_short_name, dtype: object)

In [44]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foriegn key

In [45]:
# tag significant events that have IOPs

db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(lambda short_name: short_name in list(db['iop']['iop_short_name']))


In [53]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [52]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    large_date = sig_row['start_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['start_date'].iloc[0]
    val = large_date >= small_date
    return val

def end_val(sig_row, db):
    large_date = sig_row['end_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['end_date'].iloc[0]
    val = large_date <= small_date
    return val

has_iop = db['significant_event'][db['significant_event']['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda row: not(start_val(row, db)), axis=1)
val_iop_date_end = has_iop.apply(lambda row: not(end_val(row, db)), axis=1)

In [48]:
# display incorrect start dates
has_iop[val_iop_date_start]

2,iopse_id,event_type,foriegn_campaign_short_name,foriegn_deployment_short_name,start_date,end_date,description,region_description,published_list,reports,reference_file,iop_short_name,deployment_short_name,valid_date,has_iop


In [54]:
# display incorrect end dates
has_iop[val_iop_date_end]

2,iopse_id,event_type,foriegn_campaign_short_name,foriegn_deployment_short_name,start_date,end_date,description,region_description,published_list,reports,reference_file,iop_short_name,deployment_short_name,valid_date,has_iop


## Remap Limited Fields

In [55]:
from ingest import rename_columns

In [57]:
column_mapping = json.load(open('limited_col_mapping.json', 'r'))

In [58]:
# remaps limited field names
ingest_order = [
 'platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',
 'gcmd_phenomena',
 'gcmd_instrument',
 'gcmd_platform',
 'gcmd_project',
 'partner_org'
]
tables_to_remap = ['platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',]

for table_name in tables_to_remap:
    db[table_name] = rename_columns(db, table_name, column_mapping)

### Ingest Order

In [59]:
ingest_order = json.load(open('ingest_order.json', 'r'))
ingest_order

['platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',
 'gcmd_phenomena',
 'gcmd_instrument',
 'gcmd_platform',
 'gcmd_project',
 'partner_org',
 ['instrument',
  'instrument_to_gcmd_instrument',
  'instrument_to_measurement_keywords_uuid',
  'instrument_to_geophysical_concept',
  'instrument_to_repository',
  'instrument_to_measurement_region',
  'instrument_to_instrument_type'],
 ['platform', 'platform_to_gcmd_platform_uuid'],
 'flight',
 ['deployment', 'deployment_to_geographical_region', 'deployment_to_platform'],
 ['campaign',
  'campaign_to_platform_type',
  'campaign_to_focus_area',
  'campaign_to_season',
  'campaign_to_gcmd_phenomenas_uuid',
  'campaign_to_repository',
  'campaign_to_partner_org',
  'campaign_to_gcmd_project']]

### Pickle the Data

In [63]:
import pickle

In [65]:
pickle.dump(db, open('ingested_data','wb'))