# Imports

In [1]:
import pandas as pd
import json
import validate
from general import many_to_many, many_cols
import ingest
import clean

# Pull Data into Pandas

In [2]:
data = ingest.main('Updated Inventory.xlsx')

In [3]:
excel_data = data['excel_data']
db = data['database']

# Cleaning

In [4]:
db = clean.remove_NaN_columns(db)

In [5]:
db = clean.strip_all_columns(db)

['short_name', 'TYPE (IOP or SE)', 'parent short_name', 'foreign-campaign-short_name', 'ADMG Deployment Name', 'description', 'region_description', 'published_list', 'reports', 'reference_file']


In [6]:
db['gcmd_phenomena']['ignore_code'] = db['gcmd_phenomena']['ignore_code'].apply(lambda x: str(x))

# Short Name Supplementation

In [7]:
# make unique shortname by combining the campaign name and the table sub short name

db['deployment']['short_name'] = db['deployment']['foreign-campaign-short_name']+'_'+db['deployment']['ignore_deployment_id']

db['collection_period'] = many_to_many(db, 'linking', 'table-instrument-short_name', keep_all=True)
db['collection_period']['short_name'] = db['collection_period']['foreign-campaign-short_name']+'_'+db['collection_period']['foreign-deployment-short_name']

# Find Matching Deployments for IOPSE

In [8]:
db['iopse']['deployment_short_name'] = 'No Matches'

for row in range(len(db['iopse'])):
    iop_start = db['iopse'].iloc[row]['start_date']
    iop_end = db['iopse'].iloc[row]['end_date']
    iop_camp = db['iopse'].iloc[row]['foreign-campaign-short_name']
    
    campaign_filter = db['deployment']['foreign-campaign-short_name'].apply(lambda short_name: short_name == iop_camp)
    possible_campaigns = db['deployment'][campaign_filter]
    
    start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
    end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
    matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
    if len(matching_deployments)>1:
        print(f"error on {db['iopse'].iloc[row]}")
    elif len(matching_deployments)==1:
        matching_deployments = matching_deployments[0]
    else:
        matching_deployments = 'None Found'
    db['iopse']['deployment_short_name'].iloc[row]=matching_deployments

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Many to Many Creation

In [9]:
main_table_names = ['campaign', 'platform', 'instrument', 'deployment', 'iopse']

In [10]:
for table in main_table_names:
    print(table)
    for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
        name = column.split('-')[1]
        new_table_name = f"{table}-to-{name}"
        db[new_table_name]=many_to_many(db, table, column)
        print(f'   {new_table_name} created')

campaign
   campaign-to-focus_area created
   campaign-to-season created
   campaign-to-platform_type created
   campaign-to-gcmd_phenomena created
   campaign-to-repository created
   campaign-to-partner_org created
   campaign-to-gcmd_project created
platform
   platform-to-gcmd_platform created
instrument
   instrument-to-gcmd_instrument created
   instrument-to-instrument_type created
   instrument-to-measurement_keywords created
   instrument-to-geophysical_concept created
   instrument-to-repository created
   instrument-to-measurement_region created
deployment
   deployment-to-geographical_region created
iopse


# Validation

### Short Name Duplicates

In [12]:
for table_name in db.keys():
    if 'short_name' in db[table_name].keys():
        print(table_name)
        duplicates = validate.find_duplicates(db, table_name, 'short_name')

        print(f'    {duplicates}')

platform_type
    []
home_base
    []
repository
    []
focus_area
    []
season
    []
instrument_type
    []
measurement_region
    []
geographical_region
    []
geophysical_concept
    ['informationnotavailable']
campaign
    ['informationnotavailable']
platform
    ['informationnotavailable']
instrument
    ['informationnotavailable', 'atlas']
deployment
    []
iopse
    []
gcmd_instrument
    ['atlas', 'informationnotavailable', 'wcr', nan, 'epic', 'opc', 'gnssreceiver', 'cris', 'aa', 'iris', 'particlespectrometers', 'ssies', 'icecube', 'aps']
gcmd_platform
    ['kingair', 'informationnotavailable', 'goes10', 'goes11', 'goes12', 'goes13', 'goes14', 'goes15', 'goes16', 'goes1', 'goes2', 'goes3', 'goes4', 'goes5', 'goes6', 'goes7', 'goes8', 'goes9', 'environmentalmodeling']
gcmd_project
    ['afsisclimate', 'camp', 'informationnotavailable', 'iodp', 'landsat7', 'mcmurdopredatorprey', 'notapplicable']
partner_org
    []
collection_period
    ['hs3dep2012', 'hs3dep2013', 'hs3dep2014',

### Foriegn Key Links

### Campaign

In [16]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_project', 
                                data_index='campaign', 
                                data_column='gcmd_project', 
                                foriegn_table='gcmd_project', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,campaign,gcmd_project,suggestions
2,ACEPOL,e0a48b3c-ab3d-4331-b992-367352d5c09c,[]
16,C3VP,NID,[]
17,CalWater,NID,[]
77,ACTIVATE,TBD,[]
78,CAMP2Ex,TBD,[]
79,DCOTSS,TBD,[]
80,Delta-X,TBD,[]
83,IMPACTS,TBD,[]
84,S-MODE,TBD,[]
90,ARISE,NOT LISTED IN GCMD,[]


In [17]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-focus_area', 
                                data_index='campaign', 
                                data_column='focus_area', 
                                foriegn_table='focus_area', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,focus_area,suggestions
10,ACT-America,Climate Variability and Change,[Climate Variability & Change]
39,CLAMS,Earth Surface and Interior,[Earth Surface & Interior]
83,ORACLES,Global Water and Energy Cycles,[Global Water & Energy Cycle]
103,TOGA COARE,Atmospheric Dynamics,[]
132,CASIE,Airborne Science,[]
134,CLPX II,Water & Energy Cycle,[Global Water & Energy Cycle]
135,CLPX III,Water & Energy Cycle,[Global Water & Energy Cycle]
138,DEVOTE,Airborne Science,[]
141,High Winds,Water & Energy Cycle,[Global Water & Energy Cycle]
150,Methane Sounder,Carbon Cycles and Ecosystems,[Carbon Cycle & Ecosystems]


In [18]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-season', 
                                data_index='campaign', 
                                data_column='season', 
                                foriegn_table='season', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,season,suggestions
107,Delta-X,Overshooting tops,[]
108,Delta-X,deep convection,[]
109,Delta-X,lower stratospheric air chemistry,[]
120,ARISE,boreal summer boreal fall,[]


In [19]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-platform_type', 
                                data_index='campaign', 
                                data_column='platform_type', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,platform_type,suggestions
1,ABoVE,Land-based Platforms,[Land Platforms]
6,ACEPOL,Land-based Platforms,[Land Platforms]
8,ACES,Land-based Platforms,[Land Platforms]
10,ACT-America,Land-based Platforms,[Land Platforms]
13,AfriSAR,Land-based Platforms,[Land Platforms]
15,AirMOSS,Land-based Platforms,[Land Platforms]
19,ARCTAS,Land-based Platforms,[Land Platforms]
23,ARESE,Land-based Platforms,[Land Platforms]
26,ATom,Model Output,[]
30,BOREAS,Land-based Platforms,[Land Platforms]


In [None]:
db['gcmd_phenomena']

In [23]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_phenomena', 
                                data_index='campaign', 
                                data_column='gcmd_phenomena', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

Unnamed: 0,campaign,gcmd_phenomena,suggestions


In [24]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-repository', 
                                data_index='campaign', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,repository,suggestions
16,BROMEX,NID,[]
23,CLASIC07,NSIDC,[]
24,CLPX,NSIDC,[]
67,SnowEx,NSIDC,[]
80,CAMP2Ex,TBD,[]
84,FIREx-AQ,TBD,[]
164,GTE - ABLE 2A/B,ADSC,[ASDC]


In [25]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-partner_org', 
                                data_index='campaign', 
                                data_column='partner_org', 
                                foriegn_table='partner_org', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,partner_org,suggestions
11,ARCTAS,California Air Resources Board,[]
12,ARCTAS,International Polar Year,[]
17,ATTREX,DFG,[]
39,CORAL,NID,[]
40,CPEX,NID,[]
48,FIFE,NAC (Canada),[]
49,FIFE,AAFC (Canada),[]
54,HS3,Environment Canada,[]
61,IPHEx,Duke University,[]
62,IPHEx,NOAA Hydrometeorological Testbed,[]


In [29]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [36]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_keywords', 
                                data_index='instrument', 
                                data_column='measurement_keywords', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

Unnamed: 0,instrument,measurement_keywords,suggestions
98,CPL,0000,"[1000, 2000, 3000, 4000]"
147,GCAS,GEO-CAPE Airborne Simulator,[]


In [38]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-gcmd_instrument', 
                                data_index='instrument', 
                                data_column='gcmd_instrument', 
                                foriegn_table='gcmd_instrument', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,instrument,gcmd_instrument,suggestions
8,4-STAR,b0f93e6a-c766-4957-8762-5c7709487459,[]
49,AVIRIS,d67afd03-3b79-419c-9289-5dde713ab904\n57854209...,[]
75,CIP,92f99316-b581-4adb-9980-aeb6bed64eee,[]
90,CPL,6238fe2-9a87-4e32-b866-c4a637094b51,[6238f3e2-9a87-4e32-b866-c4a637094b51]
126,EXRAD,a212d36d-2a4e-473f-b16a-6e2104b9dd8f\nba3de3fc...,[]
164,HSRL-2,abdf08cd-03c5-4497-87a4-65493584e2c7,[]
298,W-Band Radar,a90e-4a70-9bcb-93d106c1583f,[dc5ee11d-a90e-4a70-9bcb-93d106c1583f]


In [39]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-instrument_type', 
                                data_index='instrument', 
                                data_column='instrument_type', 
                                foriegn_table='instrument_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,instrument_type,suggestions
2,2D-C/P,Chemical Meters/Analyzers,[]
22,AirMOSS,Earth Remote Sensing - Active Remote Sensing,[]
28,AMP,In Situ/Laboratory - Spectrometers/Radiometers,[In Situ - Spectrometer/Radiometer]
35,APR-2,Earth Remote Sensing - Active Remote Sensing,[]
46,AVAPS,Profilers/Sounders,[]
87,CPL,LIDAR,[]
125,FLIR,Earth Remote Sensing - Passive Remote Sensing,[]
148,HAMSR,Spectrometers/Radiometers,[In Situ - Spectrometer/Radiometer]
153,HIRAD,Spectrometers/Radiometers,[In Situ - Spectrometer/Radiometer]
155,HIWRAP,Earth Remote Sensing - Active Remote Sensing,[]


In [40]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-geophysical_concept', 
                                data_index='instrument', 
                                data_column='geophysical_concept', 
                                foriegn_table='geophysical_concept', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,geophysical_concept,suggestions
248,RICE,Cloud Properties,[Soil Properties]
249,RICE,Processes & Dynamics,[]


In [41]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-repository', 
                                data_index='instrument', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,repository,suggestions
88,CPL,ORNL,[]


In [42]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_region', 
                                data_index='instrument', 
                                data_column='measurement_region', 
                                foriegn_table='measurement_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,measurement_region,suggestions
87,CPL,full column,[]
148,HAMSR,full column,[]
153,HIRAD,boundary layer? sea surface is what I would ca...,[]
155,HIWRAP,full column,[]


In [43]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-geographical_region', 
                                data_index='deployment', 
                                data_column='geographical_region', 
                                foriegn_table='geographical_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,deployment,geographical_region,suggestions
94,ABoVE_dep_2017,Alaska,[]
95,ABoVE_dep_2017,western Canada,[]
96,ABoVE_dep_2018,Alaska,[]
97,ABoVE_dep_2018,western Canada,[]
98,ABoVE_dep_2019,Alaska,[]
99,ABoVE_dep_2019,western Canada,[]
100,CLAMS_dep_2001,Chesapeake Bay,[]
101,CLAMS_dep_2001,Atlantic Ocean,[]


In [44]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-platform', 
                                data_index='deployment', 
                                data_column='platform', 
                                foriegn_table='platform', 
                                foriegn_column='short_name')
print('\n\ndo I really need to validate this?')
errors

KeyError: 'deployment-to-platform'

In [45]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment', 
                                data_index='short_name', 
                                data_column='foreign-campaign-short_name', 
                                foriegn_table='campaign', 
                                foriegn_column='short_name')
print('\n\n TODO this better once you have the data')
errors



 TODO this better once you have the data


6,short_name,foreign-campaign-short_name,suggestions


In [51]:
errors = validate.foriegn_keys(db, 
                                data_table='platform', 
                                data_index='short_name', 
                                data_column='foreign-platform_type-short_name', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

2,short_name,foreign-platform_type-short_name,suggestions
8,ASO,Prop Plane,[]


In [57]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [59]:
db['iopse']

2,short_name,TYPE (IOP or SE),parent short_name,foreign-campaign-short_name,ADMG Deployment Name,start_date,end_date,description,region_description,published_list,reports,reference_file,deployment_short_name
3,HS3_SE_1,SE,none,HS3,dep_2014,2014-09-11,2014-09-19,Hurricane Edouard - a period of apparent rapi...,Atlantic,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Flight Reports: http://dx.doi.org/10.5067/HS3/...,Information Not Available,HS3_dep_2014
4,HS3_SE_2,SE,none,HS3,dep_2012,2012-09-11,2012-09-15,Hurricane Nadine - SAL interaction,Atlantic,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Flight Reports: http://dx.doi.org/10.5067/HS3/...,Information Not Available,HS3_dep_2012
5,HS3_SE_3,SE,none,HS3,dep_2014,2014-10-15,2014-10-17,Hurricane Gonzalo - study of eyewall replacem...,Atlantic,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Flight Reports: http://dx.doi.org/10.5067/HS3/...,Information Not Available,HS3_dep_2014
6,OLYMPEX_IOP_1,IOP,none,OLYMPEX,dep_2016,2015-11-10,2015-12-21,(Almost) all instruments operating,Olympic Peninsula,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Flight Reports: http://dx.doi.org/10.5067/GPMG...,Information Not Available,OLYMPEX_dep_2015
7,OLYMPEX_SE_1,SE,OLYMPEX_iop_1,OLYMPEX,dep_2016,2015-11-13,2015-11-17,Atmospheric river - consecutive events,Olympic Peninsula,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Science Summary: https://ghrc.nsstc.nasa.gov/p...,Information Not Available,OLYMPEX_dep_2015
8,OLYMPEX_SE_2,SE,OLYMPEX_iop_1,OLYMPEX,dep_2016,2015-12-08,2015-12-09,Atmospheric river,Olympic Peninsula,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Science Summary: https://ghrc.nsstc.nasa.gov/p...,Information Not Available,OLYMPEX_dep_2015
9,OLYMPEX_SE_3,SE,OLYMPEX_iop_1,OLYMPEX,dep_2016,2015-12-03,2015-12-03,Good satellite/aircraft coordination,Olympic Peninsula,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Science Summary: https://ghrc.nsstc.nasa.gov/...,Information Not Available,OLYMPEX_dep_2015
10,OLYMPEX_SE_4,SE,OLYMPEX_iop_1,OLYMPEX,dep_2016,2015-12-11,2015-12-18,Anomalous trough conditions,Olympic Peninsula,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Science Summary: https://ghrc.nsstc.nasa.gov/p...,Information Not Available,OLYMPEX_dep_2015
11,IPHEX_SE_1,SE,IPHEX_iop_1,IPHEx,dep_2014,2014-05-01,2014-06-15,Primary time frame of the campaign/deployment....,Southern Appalachia,"https://doi.org/10.1175/JHM-D-17-0080.1, https...",Information Not Available,Information Not Available,IPHEx_dep_2014
12,CARVE_IOP_1,IOP,none,CARVE,dep_2012,2012-05-23,2012-10-20,"Multiple significant weather events, experienc...",Alaska,https://www.atmos-chem-phys.net/15/4093/2015/a...,Flight reports: https://doi.org/10.3334/ORNLDA...,Information Not Available,CARVE_dep_2012


In [58]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='deployment_short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

KeyError: "['iopse_id'] not in index"

In [60]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='event_type', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

KeyError: 'event_type'

### Flight

In [62]:
# flight table, instruments

errors = validate.foriegn_keys(db, 
                               data_table='collection_period', 
                              data_index='foreign-campaign-short_name', 
                              data_column='instrument', 
                              foriegn_table='instrument', 
                              foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,instrument,suggestions
27,IPHEx,2D-C,[2D-C/P]
34,IPHEx,Nevzorov,[Nevzorov probe]
35,IPHEx,King hot wire,[King hot wire probe]
130,SEAC4RS,Dew Point,[]
131,SEAC4RS,LWC/TWC,[]
132,SEAC4RS,Rosemount temperature,[]
133,SEAC4RS,Rosemount icing rod,[]
135,SEAC4RS,NMASS,[MAS]
140,ATom,CAPS Vienna,[]
164,ATom,CAPS Vienna,[]


In [63]:
# Flight table, platforms

errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='platform', 
                      foriegn_table='platform', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,platform,suggestions
15,HS3,WB-57f,[WB-57]
16,HS3,WB-57f,[WB-57]
19,OLYMPEX,UND Citation II,[Citation]
21,OLYMPEX,Field_Site,[Field_Sites?]
1021,HyMEx,DO128,[DO-228]
1022,HyMEx,DO128,[DO-228]
1023,HyMEx,DO128,[DO-228]
1024,HyMEx,DO128,[DO-228]
1025,HyMEx,DO128,[DO-228]
1026,HyMEx,DO128,[DO-228]


In [66]:
errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='foreign-deployment-short_name', 
                      foriegn_table='deployment', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,foreign-deployment-short_name,suggestions
0,HS3,dep_2012,"[HS3_dep_2012, DC3_dep_2012]"
1,HS3,dep_2012,"[HS3_dep_2012, DC3_dep_2012]"
2,HS3,dep_2012,"[HS3_dep_2012, DC3_dep_2012]"
3,HS3,dep_2013,[HS3_dep_2013]
4,HS3,dep_2013,[HS3_dep_2013]
...,...,...,...
1051,TCSP,dep_2005,[TCSP_dep_2005]
1052,TCSP,dep_2005,[TCSP_dep_2005]
1053,TCSP,dep_2005,[TCSP_dep_2005]
1054,TCSP,dep_2005,[TCSP_dep_2005]


# Dates

In [67]:
for table_name in db.keys():
    # find tables that have date fields
    if len(col_names := [col for col in db[table_name].columns if 'date' in col])>0:
        print(f"{table_name}\n    {', '.join(col_names)}")

campaign
    start_date, end_date, ignore_metadata_date
deployment
    start_date, end_date
iopse
    start_date, end_date


In [68]:
table_names = ['campaign', 'deployment', 'iopse'] 
for table_name in table_names:
    print(table_name)
    db[table_name]['valid_date']=False
    
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

campaign
    non date-time detected: 2017-04-26 00:00:00, ongoing
    non date-time detected: 2015,  on-going
    non date-time detected: 2016-09-26 00:00:00, ongoing
    non date-time detected: 1905-06-18 00:00:00, Information Not Available
    non date-time detected: 1998, 1998
    non date-time detected: January ?, 2019, December ?, 2023
    non date-time detected: TBD, TBD
    non date-time detected: TBD, TBD
    non date-time detected: TBD, TBD
    non date-time detected: 1905-07-03 00:00:00, ongoing
    non date-time detected: 2007, 2008
    non date-time detected: 2007, 2008
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1984-06-01 00:00:00
    non date-time detected: Information Not Available, 1983-07-01 00:00:00
    non date-time detected: Information Not Available, 1983-11-01 00:00:00
    non date-time detected: Informat

In [70]:
# this field no longer exists?

# table_names = ['instrument'] 
# for table_name in table_names:
    
#     db[table_name]['valid_date']=False
#     db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [71]:
db['campaign'][['short_name','start_date','end_date','valid_date']][db['campaign']['valid_date'].apply(lambda x: not(x))]

Unnamed: 0,short_name,start_date,end_date,valid_date
2,ABoVE,2017-04-26 00:00:00,ongoing,False
3,ABLE,Information Not Available,Information Not Available,False
9,AirMISR,Information Not Available,Information Not Available,False
10,AMSR,Information Not Available,Information Not Available,False
22,CITE,Information Not Available,Information Not Available,False
31,FIRE,Information Not Available,Information Not Available,False
37,IceBridge,Information Not Available,Information Not Available,False
38,ICEPOP,Information Not Available,Information Not Available,False
40,ImPACT-PM,Information Not Available,Information Not Available,False
41,INTEX,Information Not Available,Information Not Available,False


# IOPSE

In [72]:
# FILTER 
# TODO: FIX GRIP
db['iopse']=db['iopse'][db['iopse']['iopse_id']!='Information Not Available'][db['iopse']['foreign-campaign-short_name']!='GRIP'][db['iopse']['foriegn_campaign_short_name']!='ATTREX']


KeyError: 'iopse_id'

In [None]:
# split into two tables

db['significant_event'] = db['iopse'][db['iopse']['event_type']=='Y'].copy()
db['iop'] = db['iopse'][db['iopse']['event_type']!='Y'].copy()

In [None]:
# validate that all iops are unique

db['iop']['iop_short_name'][db['iop']['iop_short_name'].duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foriegn key

In [None]:
# tag significant events that have IOPs

db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(lambda short_name: short_name in list(db['iop']['iop_short_name']))


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [None]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    large_date = sig_row['start_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['start_date'].iloc[0]
    val = large_date >= small_date
    return val

def end_val(sig_row, db):
    large_date = sig_row['end_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['end_date'].iloc[0]
    val = large_date <= small_date
    return val

has_iop = db['significant_event'][db['significant_event']['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda row: not(start_val(row, db)), axis=1)
val_iop_date_end = has_iop.apply(lambda row: not(end_val(row, db)), axis=1)

In [None]:
# display incorrect start dates
has_iop[val_iop_date_start]

In [None]:
# display incorrect end dates
has_iop[val_iop_date_end]

## Remap Limited Fields  I SHOULD DELETE THIS IT RUNS ELSEWHERE

In [None]:
from ingest import rename_columns

In [None]:
column_mapping = json.load(open('limited_col_mapping.json', 'r'))

In [None]:
# remaps limited field names
ingest_order = [
 'platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',
 'gcmd_phenomena',
 'gcmd_instrument',
 'gcmd_platform',
 'gcmd_project',
 'partner_org'
]
tables_to_remap = ['platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',]

for table_name in tables_to_remap:
    db[table_name] = rename_columns(db, table_name, column_mapping)

### Ingest Order

In [None]:
ingest_order = json.load(open('ingest_order.json', 'r'))
ingest_order

### Pickle the Data

In [73]:
import pickle

In [74]:
pickle.dump(db, open('ingested_data','wb'))

In [None]:
db['geographical_region']