# Imports

In [1]:
import pandas as pd
import json
import validate
from general import correct_values, many_to_many, many_cols
import ingest
import clean

In [2]:
def print_full(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)

In [3]:
# def correct_values(db, table_name, column, wrong_value, correct_value):
#     db[table_name][column]=db[table_name][column].apply(lambda x: x if x!=wrong_value else correct_value)

# Pull Data into Pandas

In [4]:
# data = ingest.main('inventory_correct_linking.xlsx')
# data = ingest.main('inventory_2020.05.29.xlsx')
data = ingest.main('inventory_2020-07-01.xlsx')

In [5]:
excel_data = data['excel_data']
db = data['database']

# Cleaning

In [6]:
# change ignore_code field to be string
db['gcmd_phenomena']['ignore_code'] = db['gcmd_phenomena']['ignore_code'].apply(lambda x: str(x))

# convert gcmd_uuid into string
db['instrument']['table-measurement_keywords-gcmd_uuid'] = db['instrument']['table-measurement_keywords-gcmd_uuid'].apply(lambda x: str(x))

In [7]:
db = clean.remove_NaN_columns(db)

In [8]:
db = clean.strip_all_columns(db)

# Short Name Supplementation

In [9]:
# make unique shortname by combining the campaign name and the table sub short name

db['deployment']['short_name'] = db['deployment']['foreign-campaign-short_name']+'_'+db['deployment']['ignore_deployment_id']
db['iopse']['foreign-deployment-short_name']=db['iopse']['foreign-campaign-short_name']+'_'+db['iopse']['ignore_deployment']

db['collection_period'] = many_to_many(db, 'linking', 'table-instrument-short_name', keep_all=True)
db['collection_period']['short_name'] = db['collection_period']['foreign-campaign-short_name']+'_'+db['collection_period']['foreign-deployment-short_name']

# Find Matching Deployments for IOPSE

In [10]:
# original process before good shortname usage on the sheets


# db['iopse']['deployment_short_name'] = 'No Matches'

# for row in range(len(db['iopse'])):
#     iop_start = db['iopse'].iloc[row]['start_date']
#     iop_end = db['iopse'].iloc[row]['end_date']
#     iop_camp = db['iopse'].iloc[row]['foreign-campaign-short_name']
    
#     campaign_filter = db['deployment']['foreign-campaign-short_name'].apply(lambda short_name: short_name == iop_camp)
#     possible_campaigns = db['deployment'][campaign_filter]
    
#     start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
#     end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
#     matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
#     if len(matching_deployments)>1:
#         print(f"error on {db['iopse'].iloc[row]}")
#     elif len(matching_deployments)==1:
#         matching_deployments = matching_deployments[0]
#     else:
#         matching_deployments = 'None Found'
#     db['iopse']['deployment_short_name'].iloc[row]=matching_deployments

In [11]:
# filter out missing rows on the iopse tab
db['iopse'] = db['iopse'][db['iopse']['short_name']!='Information Not Available']

In [12]:
# test for unexpected values in this column
assert set(db['iopse']['type']) == {'IOP', 'SE'}

In [13]:
# convert parent and short name to lower so they will match correctly
db['iopse']['short_name'] = db['iopse']['short_name'].apply(lambda x: x.lower())
db['iopse']['parent short_name'] = db['iopse']['parent short_name'].apply(lambda x: x.lower())

In [14]:
db['iop'] = db['iopse'][db['iopse']['type']=='IOP']
db['significant_event'] = db['iopse'][db['iopse']['type']=='SE']

# Many to Many Creation

In [15]:
# main_table_names = ['campaign', 'platform', 'instrument', 'deployment']

In [16]:
# for table in main_table_names:
#     print(table)
#     for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
#         name = column.split('-')[1]
#         new_table_name = f"{table}-to-{name}"
#         db[new_table_name]=many_to_many(db, table, column)
#         print(f'   {new_table_name} created')

# Campaign Filter

In [17]:
from general import filter_campaigns, log_short_names

In [18]:
ingest_campaign_list = json.load(open('ingest_campaign_list.json', 'r'))

In [19]:
db = filter_campaigns(db, ingest_campaign_list)

In [20]:
log_short_names(db, 'instrument')
log_short_names(db, 'platform')

# Many to Many Creation

In [21]:
main_table_names = ['campaign', 'platform', 'instrument', 'deployment']

In [22]:
for table in main_table_names:
    print(table)
#     print([col for col in db[table].keys() if isinstance(col,str) and 'table' in col])
    for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
        name = column.split('-')[1]
        new_table_name = f"{table}-to-{name}"
        db[new_table_name]=many_to_many(db, table, column)
        print(f'   {new_table_name} created')

campaign
   campaign-to-focus_area created
   campaign-to-season created
   campaign-to-platform_type created
   campaign-to-repository created
   campaign-to-partner_org created
   campaign-to-gcmd_project created
platform
   platform-to-gcmd_platform created
instrument
   instrument-to-gcmd_instrument created
   instrument-to-instrument_type created
   instrument-to-measurement_keywords created
   instrument-to-geophysical_concept created
   instrument-to-repository created
   instrument-to-measurement_region created
deployment
   deployment-to-geographical_region created


In [23]:
assert 5==6

AssertionError: 

# Validation

### Short Name Duplicates

In [24]:
for table_name in db.keys():
    if table_name == 'collection_period':
        # this is being skipped because it has been broken out by instrument
        # and therefore it has duplicate short_names
        continue
        
    if 'short_name' in db[table_name].keys():
        print(table_name)
        duplicates = validate.find_duplicates(db, table_name, 'short_name')

        print(f'    {duplicates}')

platform_type
    []
home_base
    []
repository
    []
focus_area
    []
season
    []
instrument_type
    []
measurement_region
    []
geographical_region
    []
geophysical_concept
    []
campaign
    []
platform
    []
instrument
    []
deployment
    []
iopse
    []
gcmd_instrument
    ['atlas', 'informationnotavailable', 'wcr', nan, 'epic', 'opc', 'gnssreceiver', 'cris', 'aa', 'iris', 'particlespectrometers', 'ssies', 'icecube', 'aps']
gcmd_platform
    ['kingair', 'informationnotavailable', 'goes10', 'goes11', 'goes12', 'goes13', 'goes14', 'goes15', 'goes16', 'goes1', 'goes2', 'goes3', 'goes4', 'goes5', 'goes6', 'goes7', 'goes8', 'goes9', 'environmentalmodeling']
gcmd_project
    ['afsisclimate', 'camp', 'informationnotavailable', 'iodp', 'landsat7', 'mcmurdopredatorprey', 'notapplicable']
partner_org
    []
iop
    []
significant_event
    []


### Foriegn Key Links

In [25]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_project', 
                                data_index='campaign', 
                                data_column='gcmd_project', 
                                foriegn_table='gcmd_project', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,campaign,gcmd_project,suggestions
9,OLYMPEX,NID,[]


In [26]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-focus_area', 
                                data_index='campaign', 
                                data_column='focus_area', 
                                foriegn_table='focus_area', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,focus_area,suggestions


In [27]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-season', 
                                data_index='campaign', 
                                data_column='season', 
                                foriegn_table='season', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,season,suggestions


In [28]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-platform_type', 
                                data_index='campaign', 
                                data_column='platform_type', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,platform_type,suggestions
4,ACES,Permanent Land Sites,[Permanant Land Sites]
20,DC3,Permanent Land Sites,[Permanant Land Sites]
25,GOES-R PLT,Permanent Land Sites,[Permanant Land Sites]
31,HS3,Permanent Land Sites,[Permanant Land Sites]
34,OLYMPEX,Permanent Land Sites,[Permanant Land Sites]


In [56]:
[key for key in db.keys() if 'gcmd' in key]

['gcmd_phenomena',
 'gcmd_instrument',
 'gcmd_platform',
 'gcmd_project',
 'campaign-to-gcmd_project',
 'platform-to-gcmd_platform',
 'instrument-to-gcmd_instrument']

In [29]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_phenomena', 
                                data_index='campaign', 
                                data_column='gcmd_phenomena', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

KeyError: 'campaign-to-gcmd_phenomena'

In [30]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-repository', 
                                data_index='campaign', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,repository,suggestions


In [31]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-partner_org', 
                                data_index='campaign', 
                                data_column='partner_org', 
                                foriegn_table='partner_org', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,partner_org,suggestions


In [32]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [33]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_keywords', 
                                data_index='instrument', 
                                data_column='measurement_keywords', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

Unnamed: 0,instrument,measurement_keywords,suggestions
148,FTS,1154 1159,[]


In [34]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-gcmd_instrument', 
                                data_index='instrument', 
                                data_column='gcmd_instrument', 
                                foriegn_table='gcmd_instrument', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,instrument,gcmd_instrument,suggestions
4,3V-CPI,NID,[]
9,AirMOSS,NID,[]
13,AOP,NID,[]
23,BAT probe,NID,[]
25,CAFS,NID,[]
34,CI-ITMS,NID,[]
36,CIP,92f99316-b581-4adb-9980-aeb6bed64eee,[]
37,CIT-CIMS,NID,[]
45,CPSD,NID,[]
88,HD-SP2,NID,[]


In [None]:
correct_values(
    db=db,
    table_name = 'instrument-to-gcmd_instrument',
    column = 'gcmd_instrument',
    wrong_value = 'NID',
    correct_value = '6238f3e2-9a87-4e32-b866-c4a637094b51')
correct_values(
    db=db,
    table_name = 'instrument-to-gcmd_instrument',
    column = 'gcmd_instrument',
    wrong_value = '92f99316-b581-4adb-9980-aeb6bed64eee',
    correct_value = 'None')
correct_values(
    db=db,
    table_name = 'instrument-to-gcmd_instrument',
    column = 'gcmd_instrument',
    wrong_value = '6238fe2-9a87-4e32-b866-c4a637094b51',
    correct_value = '6238f3e2-9a87-4e32-b866-c4a637094b51')
correct_values(
    db=db,
    table_name = 'instrument-to-gcmd_instrument',
    column = 'gcmd_instrument',
    wrong_value = 'None',
    correct_value = '6238f3e2-9a87-4e32-b866-c4a637094b51')

In [35]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-instrument_type', 
                                data_index='instrument', 
                                data_column='instrument_type', 
                                foriegn_table='instrument_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,instrument_type,suggestions


In [None]:
correct_values(
    db=db,
    table_name = 'instrument-to-instrument_type',
    column = 'instrument_type',
    wrong_value = 'Passive - Remote Sensing',
    correct_value = 'Remote - Passive')
correct_values(
    db=db,
    table_name = 'instrument-to-instrument_type',
    column = 'instrument_type',
    wrong_value = 'Earth Remote Sensing - Active Remote Sensing',
    correct_value = 'Remote - Active')

In [36]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-geophysical_concept', 
                                data_index='instrument', 
                                data_column='geophysical_concept', 
                                foriegn_table='geophysical_concept', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,geophysical_concept,suggestions
0,2D-C/P,Clouds-Properties & Processes & Dynamics,[]
1,2D-C/P,Precipitation,[]
3,3V-CPI,Precipitation,[]
4,3V-CPI,Clouds-Properties & Processes & Dynamics,[]
9,AirMOSS,Terrestrial Hydrology,[]
11,AirMSPI,Radiation & Energy Budget,[Radiation & Energy]
15,APR-2,Precipitation,[]
20,AVAPS,Boundary Layer Processes,[]
21,AVIRIS,Radiation & Energy Budget,[Radiation & Energy]
23,BAT probe,Boundary Layer Processes,[]


In [None]:
correct_values(
    db=db,
    table_name = 'instrument-to-geophysical_concept',
    column = 'geophysical_concept',
    wrong_value = 'Precipitation',
    correct_value = 'Precipitation (Precip)')
correct_values(
    db=db,
    table_name = 'instrument-to-geophysical_concept',
    column = 'geophysical_concept',
    wrong_value = 'Terrestrial Hydrology',
    correct_value = 'Terrestrial Hydrology (TerrHydrol)')
correct_values(
    db=db,
    table_name = 'instrument-to-geophysical_concept',
    column = 'geophysical_concept',
    wrong_value = 'Atmospheric Chemicals & Trace Gasses',
    correct_value = 'Atmospheric Chemicals & Trace Gases')
correct_values(
    db=db,
    table_name = 'instrument-to-geophysical_concept',
    column = 'geophysical_concept',
    wrong_value = 'Cloud Properties',
    correct_value = 'Clouds-Properties & Processes & Dynamics	')
correct_values(
    db=db,
    table_name = 'instrument-to-geophysical_concept',
    column = 'geophysical_concept',
    wrong_value = 'Processes & Dynamics',
    correct_value = 'Boundary Layer Processes')

In [37]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-repository', 
                                data_index='instrument', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,repository,suggestions
82,FGM,NSIDC,[]


In [None]:
correct_values(
    db=db,
    table_name = 'instrument-to-repository',
    column = 'repository',
    wrong_value = 'ORNL',
    correct_value = 'OB.DAAC')

In [38]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_region', 
                                data_index='instrument', 
                                data_column='measurement_region', 
                                foriegn_table='measurement_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,measurement_region,suggestions
158,WAS,Troposhere,"[Troposphere, Mid-Troposphere]"


In [None]:
correct_values(
    db=db,
    table_name = 'instrument-to-measurement_region',
    column = 'measurement_region',
    wrong_value = 'Troposphere',
    correct_value = 'mid-troposphere')
correct_values(
    db=db,
    table_name = 'instrument-to-measurement_region',
    column = 'measurement_region',
    wrong_value = 'troposphere',
    correct_value = 'mid-troposphere')
correct_values(
    db=db,
    table_name = 'instrument-to-measurement_region',
    column = 'measurement_region',
    wrong_value = 'subsurface',
    correct_value = 'subsurface - water')

In [None]:
db['instrument-to-measurement_region'][db['instrument-to-measurement_region']['measurement_region']=='subsurface - water']

In [39]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-geographical_region', 
                                data_index='deployment', 
                                data_column='geographical_region', 
                                foriegn_table='geographical_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,deployment,geographical_region,suggestions
23,DC3_dep_2012,,[]


In [None]:
[key for key in db.keys() if 'deployment' in key]

In [40]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-platform', 
                                data_index='deployment', 
                                data_column='platform', 
                                foriegn_table='platform', 
                                foriegn_column='short_name')
print('\n\ndo I really need to validate this?')
errors

KeyError: 'deployment-to-platform'

In [41]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment', 
                                data_index='short_name', 
                                data_column='foreign-campaign-short_name', 
                                foriegn_table='campaign', 
                                foriegn_column='short_name')
print('\n\n TODO this better once you have the data')
errors



 TODO this better once you have the data


6,short_name,foreign-campaign-short_name,suggestions


In [42]:
errors = validate.foriegn_keys(db, 
                                data_table='platform', 
                                data_index='short_name', 
                                data_column='foreign-platform_type-short_name', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

2,short_name,foreign-platform_type-short_name,suggestions
58,Campaign PL,Permanent Land Sites,[Permanant Land Sites]


In [43]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [44]:
errors = validate.foriegn_keys(db, 
                                data_table='iop', 
                                data_index='short_name', 
                                data_column='foreign-deployment-short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

2,short_name,foreign-deployment-short_name,suggestions


In [None]:
correct_values(
    db=db,
    table_name = 'iop',
    column = 'foreign-deployment-short_name',
    wrong_value = 'AirMOSS_deb_2014b',
    correct_value = 'AirMOSS_dep_2014b')

In [45]:
errors = validate.foriegn_keys(db, 
                                data_table='significant_event', 
                                data_index='short_name', 
                                data_column='foreign-deployment-short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

2,short_name,foreign-deployment-short_name,suggestions


### Flight

In [46]:
# flight table, instruments

errors = validate.foriegn_keys(db, 
                               data_table='collection_period', 
                              data_index='foreign-campaign-short_name', 
                              data_column='instrument', 
                              foriegn_table='instrument', 
                              foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,instrument,suggestions
35,OLYMPEX,2DC,[2D-C/P]
36,OLYMPEX,Nevzorov,[Nevzorov probe]
41,OLYMPEX,imaging spectrometer,[Spectrometer]
42,OLYMPEX,DVD,[DV]
45,OLYMPEX,snow poles and cameras,[]
46,OLYMPEX,radiosondes,[Radiosonde]
51,OLYMPEX,X band radar,[]
94,CARVE,LI-COR(CRV),[]
103,CARVE,LI-COR(CRV),[]
112,CARVE,LI-COR(CRV),[]


In [None]:
db['collection_period']=db['collection_period'][db['collection_period']['instrument']!='NAWX radar']

In [47]:
# Flight table, platforms

errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='foreign-platform-short_name', 
                      foriegn_table='platform', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,foreign-platform-short_name,suggestions


In [None]:
correct_values(
    db=db,
    table_name = 'collection_period',
    column = 'foreign-platform-short_name',
    wrong_value = 'UND Citation II',
    correct_value = 'Citation')
db['collection_period']=db['collection_period'][db['collection_period']['foreign-platform-short_name']!='Field_Site']

In [48]:
# I think to ignore this???

errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='short_name',#'foreign-deployment-short_name', 
                      foriegn_table='deployment', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,short_name,suggestions


In [None]:
correct_values(
    db=db,
    table_name = 'collection_period',
    column = 'short_name',
    wrong_value = 'OLYMPEX_dep_2016',
    correct_value = 'OLYMPEX_dep_2015')

db['collection_period']=db['collection_period'][db['collection_period']['short_name']!='Citation']

# Dates

In [49]:
for table_name in db.keys():
    # find tables that have date fields
    if len(col_names := [col for col in db[table_name].columns if 'date' in col])>0:
        print(f"{table_name}\n    {', '.join(col_names)}")

campaign
    start_date, end_date, ignore_metadata_date
deployment
    start_date, end_date
iopse
    end_date
iop
    end_date
significant_event
    end_date


In [50]:
table_names = ['campaign', 'deployment', 'iopse'] 
for table_name in table_names:
    print(table_name)
    db[table_name]['valid_date']=False
    
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

campaign
deployment
iopse


KeyError: 'start_date'

In [None]:
# this field no longer exists?

# table_names = ['instrument'] 
# for table_name in table_names:
    
#     db[table_name]['valid_date']=False
#     db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [51]:
db['campaign'][['short_name','start_date','end_date','valid_date']][db['campaign']['valid_date'].apply(lambda x: not(x))]

Unnamed: 0,short_name,start_date,end_date,valid_date


# IOPSE

In [None]:
db['iop']

In [None]:
# validate that all iops are unique, should return an empty list

db['iop']['short_name'][db['iop']['short_name'].duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foriegn key

In [None]:
# tag significant events that have IOPs

db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(lambda short_name: short_name in list(db['iop']['iop_short_name']))


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [None]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    large_date = sig_row['start_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['start_date'].iloc[0]
    val = large_date >= small_date
    return val

def end_val(sig_row, db):
    large_date = sig_row['end_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['end_date'].iloc[0]
    val = large_date <= small_date
    return val

has_iop = db['significant_event'][db['significant_event']['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda row: not(start_val(row, db)), axis=1)
val_iop_date_end = has_iop.apply(lambda row: not(end_val(row, db)), axis=1)

In [None]:
# display incorrect start dates
has_iop[val_iop_date_start]

In [None]:
# display incorrect end dates
has_iop[val_iop_date_end]

## Sanity Check

In [52]:
db['nasa_mission']

KeyError: 'nasa_mission'

In [53]:
db['collection_period']

Unnamed: 0,foreign-campaign-short_name,foreign-deployment-short_name,foreign-platform-short_name,number_collection_periods,asp_long_name,platform_identifier,home_base,campaign_deployment_base,platform_owner,platform_technical_contact,instrument,instrument_information_source,location_info,notes_internal,short_name
0,HS3,dep_2012,GH,,Information Not Available,#872,AFRC,WFF,NASA,"Frank Cutler, David Fratello",CPL,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Information Not Available,environment GH,HS3_dep_2012
1,HS3,dep_2012,GH,,Information Not Available,#872,AFRC,WFF,NASA,"Frank Cutler, David Fratello",AVAPS,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Information Not Available,environment GH,HS3_dep_2012
2,HS3,dep_2012,GH,,Information Not Available,#872,AFRC,WFF,NASA,"Frank Cutler, David Fratello",S-HIS,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Information Not Available,environment GH,HS3_dep_2012
3,HS3,dep_2013,GH,,Information Not Available,#872,AFRC,WFF,NASA,"Frank Cutler, David Fratello",CPL,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Information Not Available,environment GH,HS3_dep_2013
4,HS3,dep_2013,GH,,Information Not Available,#872,AFRC,WFF,NASA,"Frank Cutler, David Fratello",AVAPS,https://journals.ametsoc.org/doi/pdf/10.1175/B...,Information Not Available,environment GH,HS3_dep_2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,ARCTAS,dep_2008c,B-200,,NASA Beechcraft B-200,N529NA,LARC,"Yellowknife, NT, Canada",NASA,Bruce Fisher,HSRL,https://doi.org/10.5194/acp-10-5191-2010,Information Not Available,Information Not Available,ARCTAS_dep_2008c
943,ARCTAS,dep_2008c,B-200,,NASA Beechcraft B-200,N529NA,LARC,"Yellowknife, NT, Canada",NASA,Bruce Fisher,RSP,https://doi.org/10.5194/acp-10-5191-2010,Information Not Available,Information Not Available,ARCTAS_dep_2008c
944,ARCTAS,dep_2008c,Campaign FS,,Information Not Available,Information Not Available,Information Not Available,"Barrow, AK",Information Not Available,Information Not Available,LIDAR,https://ro.uow.edu.au/cgi/viewcontent.cgi?arti...,Information Not Available,Information Not Available,ARCTAS_dep_2008c
945,ARCTAS,dep_2008c,Campaign FS,,Information Not Available,Information Not Available,Information Not Available,"Barrow, AK",Information Not Available,Information Not Available,AERONET,https://ro.uow.edu.au/cgi/viewcontent.cgi?arti...,Information Not Available,Information Not Available,ARCTAS_dep_2008c


In [54]:
db['platform_type']

Unnamed: 0,ignore_ingest_label,short_name,long_name,ignore_gcmd_translation,gcmd_uuid,example,notes_public,foreign-platform_type-short_name
3,Platform Type,Aircraft,Aircraft Platforms,Aircraft,227d9c3d-f631-402d-84ed-b8c5a562fc27,Information Not Available,Platform type for instances when the type of a...,none
4,Platform Type,Balloons,Balloons,Balloons/Rockets,2196cc92-a5da-4233-9509-5523385da1d7,ground-released radiosondes,Weather (sounding) balloons launched from the ...,none
5,Platform Type,Rockets,Rockets,Balloons/Rockets,2196cc92-a5da-4233-9509-5523385da1d7,"Loki, Black Brant",Sounding rockets carry instrument/sensor paylo...,none
6,Platform Type,UAV,Unmanned Aerial Vehicle,Aircraft,227d9c3d-f631-402d-84ed-b8c5a562fc27,"Global Hawk, small quadcopter",Aircraft that operate without a human pilot ab...,Aircraft
7,Platform Type,Jet,Fixed-wing Jet Aircraft,Aircraft,227d9c3d-f631-402d-84ed-b8c5a562fc27,"DC-8, ER-2",Fixed-wing jet engine thrust aircraft,Aircraft
8,Platform Type,Prop,Fixed-wing Propeller Aircraft,Aircraft,227d9c3d-f631-402d-84ed-b8c5a562fc27,"B-200, Citation",Fixed-wing propeller thrust aircraft,Aircraft
9,Platform Type,Rotorcraft/Helicopter,Rotorcraft/Helicopter,"Aircraft, Helicopter","227d9c3d-f631-402d-84ed-b8c5a562fc27, 06e037ed...",Information Not Available,Aircraft that use vertically oriented spinning...,Aircraft
10,Platform Type,Other-Air,Other Aircraft Platforms,Aircraft,227d9c3d-f631-402d-84ed-b8c5a562fc27,"Airship, Tiltrotor",Aircraft for which the other Aircraft Sub-Type...,Aircraft
11,Platform Type,Land Platforms,Land-based Platforms,In Situ Land-based Platforms,4f396ff6-7bea-4ba4-afa3-198ebd914a4a,"NWS NEXRAD/88D radar towers, ground/field obse...","Any platform that operates on the Earth, excep...",none
12,Platform Type,Field Sites,Field Site/Ground Sites,"Weather Stations/Networks, Ground Stations, Fi...","57b7373d-5c21-4abb-8097-a410adc2a074, 491d3fcc...","snow pole locations, soil sample sites, rain g...",Instrument site(s) or sample location(s) on th...,Land Platforms


In [None]:
links = list(set(list(db['platform_type']['foreign-platform_type-short_name'])))
full = list(db['platform_type']['short_name'])
[link for link in links if link not in full]

### Pickle the Data

In [None]:
import pickle

In [None]:
pickle.dump(db, open('db_after_corrections','wb'))

In [None]:
db['geographical_region']

In [None]:
db['significant_event']