# Imports

In [1]:
import pandas as pd
import json
from automated_ingest import ingest_2

In [2]:
import validate
from general import filter_gcmd_tables
# from general import correct_values, many_to_many, many_cols, filter_gcmd_tables
# import ingest
# import clean

In [3]:
def print_full(df):
    """Print ``df`` with pandas' row and column display limits lifted.

    The limits are removed only inside the ``option_context`` block, so the
    notebook-wide display options are unchanged after the call.
    """
    no_truncation = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*no_truncation):
        print(df)

# Pull Data into Pandas

In [4]:
# Load the inventory workbook through the project's ingest_2 helper. The cells
# below treat `db` as a mapping of table name -> DataFrame (db.keys(),
# db[table].columns). The "... created" lines in the output appear while this
# call runs.
db = ingest_2('inventory_data/inventory - 2020.10.26.xlsx')

campaign
   campaign-to-focus_area created
   campaign-to-season created
   campaign-to-platform_type created
   campaign-to-geophysical_concept created
   campaign-to-repository created
   campaign-to-partner_org created
   campaign-to-gcmd_project created
platform
   platform-to-gcmd_platform created
instrument
   instrument-to-gcmd_instrument created
   instrument-to-measurement_style created
   instrument-to-measurement_type created
   instrument-to-gcmd_phenomena created
   instrument-to-repository created
   instrument-to-measurement_region created
deployment
   deployment-to-geographical_region created


## Check for completeness

In [5]:
# List every table together with its column names so that any new or
# unrecognised column in the workbook stands out before validation starts.

indent = '    '
for table in db.keys():
    fields = db[table].columns
    print(table)
    for field in fields:
        print(indent, field)
    print()

platform_type
     ignore_ingest_label
     short_name
     long_name
     ignore_gcmd_translation
     gcmd_uuid
     example
     notes_public
     foreign-platform_type-short_name

home_base
     ignore_ingest_label
     short_name
     long_name
     location
     notes_public
     additional_info
     ignore_TODO
     ignore_parent

repository
     ignore_ingest_label
     short_name
     long_name
     ignore_gcmd_translation
     gcmd_uuid
     ignore_blank
     notes_public
     ignore_parent

focus_area
     ignore_ingest_label
     short_name
     long_name
     url
     notes_public
     ignore_blank_1
     ignore_blank_2
     ignore_parent

season
     ignore_ingest_label
     short_name
     long_name
     ignore_blank_1
     ignore_blank_2
     ignore_blank_3
     notes_public
     ignore_parent

measurement_type
     ignore_ingest_label
     short_name
     long_name
     description
     gcmd_translation
     examples
     notes_public
     foreign-measurement_type-shor

# Validation

### Short Name Duplicates

In [6]:
# Report duplicated short_name values for every table that has that column.
for table_name in db.keys():
    # collection_period has been broken out by instrument, so duplicate
    # short_names are expected there; tables without a short_name column
    # have nothing to check.
    if table_name == 'collection_period' or 'short_name' not in db[table_name].keys():
        continue

    print(table_name)
    duplicates = validate.find_duplicates(db, table_name, 'short_name')
    print(f'    {duplicates}')

platform_type
    []
home_base
    []
repository
    []
focus_area
    []
season
    []
measurement_type
    []
measurement_style
    []
measurement_region
    []
geographical_region
    []
geophysical_concept
    []
campaign
    []
platform
    ['informationnotavailable']
instrument
    ['atsp', 'informationnotavailable']
deployment
    []
gcmd_instrument
    ['atlas', 'informationnotavailable', 'wcr', 'epic', 'opc', 'gnssreceiver', 'cris', 'aa', 'iris', 'particlespectrometers', 'ssies', 'icecube', 'aps']
gcmd_platform
    ['kingair', 'informationnotavailable', 'goes10', 'goes11', 'goes12', 'goes13', 'goes14', 'goes15', 'goes16', 'goes1', 'goes2', 'goes3', 'goes4', 'goes5', 'goes6', 'goes7', 'goes8', 'goes9', 'environmentalmodeling']
gcmd_project
    ['afsisclimate', 'camp', 'informationnotavailable', 'iodp', 'landsat7', 'mcmurdopredatorprey', 'notapplicable']
partner_org
    []
iop
    []
significant_event
    []


### Foreign Key Links

In [7]:
# Link tables are the ones whose name contains '-to-'
# (e.g. 'campaign-to-focus_area').
foreign_data_tables = [table_name for table_name in db.keys() if '-to-' in table_name]

# primary_mapping is indexed by foreign table name further down to obtain the
# column to match against. Read it inside a context manager so the file
# handle is closed deterministically instead of being left open.
with open('config/mapping_primary.json', 'r') as mapping_file:
    primary_mapping = json.load(mapping_file)

In [8]:
# Run validate.foriegn_keys over every '<table>-to-<foreign table>' link table
# and keep what it returns as [link table name, errors] pairs. Judging by the
# printed reports, `errors` is a DataFrame of problem rows.
error_log = []
for data_table in foreign_data_tables:
    name_parts = data_table.split('-')
    data_index = name_parts[0]
    # the link column carries the same name as the table it points at
    data_column = foriegn_table = name_parts[2]
    foriegn_column = primary_mapping[foriegn_table]

    key_check = dict(
        data_table=data_table,
        data_index=data_index,
        data_column=data_column,
        foriegn_table=foriegn_table,
        foriegn_column=foriegn_column,
    )
    errors = validate.foriegn_keys(db, **key_check)
    error_log.append([data_table, errors])

In [9]:
# Print each link-table report: two blank lines, then the entry number, the
# link table name and its errors frame.
for index, log in enumerate(error_log):
    print('\n')
    print(index, log[0], log[1], sep='\n')



0
campaign-to-focus_area
Empty DataFrame
Columns: [campaign, focus_area, suggestions]
Index: []


1
campaign-to-season
Empty DataFrame
Columns: [campaign, season, suggestions]
Index: []


2
campaign-to-platform_type
Empty DataFrame
Columns: [campaign, platform_type, suggestions]
Index: []


3
campaign-to-geophysical_concept
Empty DataFrame
Columns: [campaign, geophysical_concept, suggestions]
Index: []


4
campaign-to-repository
Empty DataFrame
Columns: [campaign, repository, suggestions]
Index: []


5
campaign-to-partner_org
Empty DataFrame
Columns: [campaign, partner_org, suggestions]
Index: []


6
campaign-to-gcmd_project
   campaign                          gcmd_project suggestions
0    ACEPOL  e0a48b3c-ab3d-4331-b992-367352d5c09c          []
21      OMG  5a8bb977-73f5-4454-a072-0ee0483868da          []


7
platform-to-gcmd_platform
Empty DataFrame
Columns: [platform, gcmd_platform, suggestions]
Index: []


8
instrument-to-gcmd_instrument
              instrument                 

In [10]:
#########################################################
# run with caution, removes all errors indiscriminately #
#########################################################

# Every log entry is [link table name, errors frame]; each row listed in the
# errors frame is deleted from that link table in place.
for index, log in enumerate(error_log):
    table_name, errors = log
    db[table_name].drop(index=list(errors.index), inplace=True)

In [11]:
# correct_values(
#     db=db,
#     table_name = 'instrument-to-instrument_type',
#     column = 'instrument_type',
#     wrong_value = 'Passive - Remote Sensing',
#     correct_value = 'Remote - Passive')

In [18]:
# Check the 'foreign-<table>-short_name' columns that sit directly on the data
# tables, collecting [table name, errors] pairs (one pair per foreign column).
error_log_2 = []
for table in db.keys():
    # linking is no longer important after collection period is made
    if table == 'linking':
        continue

    foreign_columns = [name for name in db[table].keys() if 'foreign' in name]
    if foreign_columns:
        print(table)

    for column in foreign_columns:
        print('   ', column)

        data_table, data_index, data_column = table, 'short_name', column
        # column names look like 'foreign-<table>-short_name'
        foriegn_table, foriegn_column = column.split('-')[1], 'short_name'

        errors = validate.foriegn_keys(
            db,
            data_table=data_table,
            data_index=data_index,
            data_column=data_column,
            foriegn_table=foriegn_table,
            foriegn_column=foriegn_column,
        )
        error_log_2.append([table, errors])

platform_type
    foreign-platform_type-short_name
measurement_type
    foreign-measurement_type-short_name
measurement_style
    foreign-measurement_style-short_name
platform
    foreign-platform_type-short_name
deployment
    foreign-campaign-short_name
collection_period
    foreign-campaign-short_name
    foreign-deployment-short_name
    foreign-platform-short_name
    foreign-instrument-short_name
iop
    foreign-campaign-short_name
    foreign-deployment-short_name
significant_event
    foreign-campaign-short_name
    foreign-deployment-short_name


In [19]:
# Print each report: two blank lines, the entry number, the table name and
# the errors frame found for one of its foreign columns.
for index, log in enumerate(error_log_2):
    report_lines = ['', '', index, log[0], log[1]]
    print(*report_lines, sep='\n')



0
platform_type
                   short_name foreign-platform_type-short_name suggestions
3               Air Platforms                             none          []
11             Land Platforms                             none          []
16            Water Platforms                             none          []
28            Living organism                             none          []
29                    Visuals                             none          []
30                 Satellites                             none          []
31          Manned Spacecraft                             none          []
32                     Models                             none          []
33  Information Not Available                             none          []


1
measurement_type
Empty DataFrame
Columns: [short_name, foreign-measurement_type-short_name, suggestions]
Index: []


2
measurement_style
Empty DataFrame
Columns: [short_name, foreign-measurement_style-short_name, suggestions]
In

In [20]:
# NOTE(review): shows the value an earlier cell left in `table_name`; what is
# displayed depends on which cell ran last, so it is not reproducible on a
# fresh Run All. Presumably a debugging leftover — confirm and remove.
table_name

'collection_period'

In [21]:
#########################################################
# run with caution, removes all errors indiscriminately #
#########################################################

# error_log_2 holds one entry per foreign column, so a table with several
# foreign columns (e.g. collection_period) is visited several times. A row
# that fails more than one check appears in more than one errors frame and is
# already gone when the later entry is processed; errors='ignore' skips such
# labels instead of raising KeyError half-way through the clean-up.
for index, log in enumerate(error_log_2):
    # measurement_* and platform_type tables are left untouched: their foreign
    # columns point back at their own table (see the listing printed above).
    if 'measurement' not in log[0] and 'platform_type' not in log[0]:
        print(log[0])
        table_name = log[0]
        errors = log[1]
        db[table_name].drop(list(errors.index), inplace=True, errors='ignore')

platform
deployment
collection_period
collection_period
collection_period
collection_period
iop
iop
significant_event
significant_event


In [None]:
# Always-false assertion: raises AssertionError, so a Run All stops here.
# NOTE(review): presumably a guard so the manual clean-up cells below are only
# executed deliberately — confirm.
assert 5==6

In [None]:
# run with caution, removes all errors indiscriminately
# NOTE(review): `errors` is whatever the most recently executed cell bound it
# to; nothing in this cell recomputes it for the iop table — verify before
# running.
db['iop'].drop(list(errors.index), inplace=True)

In [23]:
# Re-run the significant_event -> deployment check on its own and display the
# result as the cell output.
significant_event_check = {
    'data_table': 'significant_event',
    'data_index': 'short_name',
    'data_column': 'foreign-deployment-short_name',
    'foriegn_table': 'deployment',
    'foriegn_column': 'short_name',
}
errors = validate.foriegn_keys(db, **significant_event_check)
errors

2,short_name,foreign-deployment-short_name,suggestions


### Flight

In [None]:
# remove collection periods with bad instrument links
# Recompute the errors for this specific link here instead of relying on
# whatever `errors` was left over from a previously executed cell.
errors = validate.foriegn_keys(db,
                                data_table='collection_period',
                                data_index='short_name',
                                data_column='foreign-instrument-short_name',
                                foriegn_table='instrument',
                                foriegn_column='short_name')
db['collection_period'].drop(list(errors.index), inplace=True)

In [None]:
# remove collection periods with bad platform links
# Recompute the errors for this specific link here instead of relying on
# whatever `errors` was left over from a previously executed cell.
errors = validate.foriegn_keys(db,
                                data_table='collection_period',
                                data_index='short_name',
                                data_column='foreign-platform-short_name',
                                foriegn_table='platform',
                                foriegn_column='short_name')
db['collection_period'].drop(list(errors.index), inplace=True)

In [None]:
# remove collection periods with bad deployment links
# Recompute the errors for this specific link here instead of relying on
# whatever `errors` was left over from a previously executed cell.
errors = validate.foriegn_keys(db,
                                data_table='collection_period',
                                data_index='short_name',
                                data_column='foreign-deployment-short_name',
                                foriegn_table='deployment',
                                foriegn_column='short_name')
db['collection_period'].drop(list(errors.index), inplace=True)

# Dates

In [None]:
# Show, per table, the columns whose name contains 'date'; tables without any
# such column are not printed.
for table_name in db.keys():
    col_names = [col for col in db[table_name].columns if 'date' in col]
    if col_names:
        print(f"{table_name}\n    {', '.join(col_names)}")

In [None]:
# Tag each row with the result of validate.vali_date(start_date, end_date).
# The combined 'iopse' name is not one of the tables this notebook shows in
# db; the IOP and significant-event rows live in 'iop' and 'significant_event',
# both of which are read for start_date and end_date further down.
table_names = ['campaign', 'deployment', 'iop', 'significant_event']
for table_name in table_names:
    print(table_name)
    # create the column first, then fill it row by row
    db[table_name]['valid_date'] = False
    db[table_name]['valid_date'] = db[table_name].apply(
        lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

In [None]:
# this field no longer exists?

# table_names = ['instrument'] 
# for table_name in table_names:
    
#     db[table_name]['valid_date']=False
#     db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [None]:
# Campaign rows whose valid_date flag is falsy, with the columns needed to
# see what is wrong with their dates.
campaign = db['campaign']
failed_date_check = campaign['valid_date'].apply(lambda is_valid: not is_valid)
campaign.loc[failed_date_check, ['short_name', 'start_date', 'end_date', 'valid_date']]

# IOPSE

In [None]:
# Display the iop table for a visual check.
db['iop']

In [None]:
# Validate that all iop short_names are unique: the result lists every
# repeated occurrence and should be an empty Series.

iop_short_names = db['iop']['short_name']
iop_short_names[iop_short_names.duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foreign key

In [None]:
# tag significant events that have IOPs

# Build the lookup once: the previous version rebuilt the full list of IOP
# names for every significant-event row.
# NOTE(review): other cells read db['iop']['short_name'] — confirm that
# 'iop_short_name' is the right column on the iop table.
known_iop_names = set(db['iop']['iop_short_name'])
db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(
    lambda short_name: short_name in known_iop_names)


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [None]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    """Return whether a significant event starts on or after its IOP's start.

    ``sig_row`` is indexed by 'start_date' and 'iop_short_name'. ``db['iop']``
    is a DataFrame with 'iop_short_name' and 'start_date' columns. The first
    IOP row with a matching name is used; when none matches, ``.iloc[0]``
    raises IndexError.
    """
    event_start = sig_row['start_date']
    iops = db['iop']
    is_parent_iop = iops['iop_short_name'] == sig_row['iop_short_name']
    iop_start = iops.loc[is_parent_iop, 'start_date'].iloc[0]
    return event_start >= iop_start

def end_val(sig_row, db):
    """Return whether a significant event ends on or before its IOP's end.

    ``sig_row`` is indexed by 'end_date' and 'iop_short_name'. ``db['iop']``
    is a DataFrame with 'iop_short_name' and 'end_date' columns. The first
    IOP row with a matching name is used; when none matches, ``.iloc[0]``
    raises IndexError.
    """
    event_end = sig_row['end_date']
    iops = db['iop']
    is_parent_iop = iops['iop_short_name'] == sig_row['iop_short_name']
    iop_end = iops.loc[is_parent_iop, 'end_date'].iloc[0]
    return event_end <= iop_end

# Work on a copy of the significant events flagged as having an IOP, and build
# one boolean flag per row for each failed check (True means the check failed).
significant_events = db['significant_event']
has_iop = significant_events.loc[significant_events['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda event: not start_val(event, db), axis=1)
val_iop_date_end = has_iop.apply(lambda event: not end_val(event, db), axis=1)

In [None]:
# display incorrect start dates: significant events whose start_date is
# earlier than the start_date of their matching IOP
has_iop[val_iop_date_start]

In [None]:
# display incorrect end dates: significant events whose end_date is later
# than the end_date of their matching IOP
has_iop[val_iop_date_end]

# Filter GCMD Tables

In [None]:
# Distinct values of the platform_type self-reference column that do not
# appear as a platform_type short_name.
platform_types = db['platform_type']
links = list(set(platform_types['foreign-platform_type-short_name']))
full = list(platform_types['short_name'])
[link for link in links if link not in full]

In [24]:
# Replace db with the result of the project's filter_gcmd_tables helper.
# NOTE(review): db is rebound to a value derived from itself, so re-running
# this cell feeds already-filtered data back in — confirm the helper is safe
# to apply twice.
db = filter_gcmd_tables(db)

### Pickle the Data

In [25]:
import pickle

In [26]:
# Write the validated tables to disk. The context manager closes (and thereby
# flushes) the file as soon as the dump finishes; a bare open() left that to
# the garbage collector.
# NOTE(review): the file name says 20201025 while the workbook loaded at the
# top is dated 2020.10.26 — confirm the intended date stamp.
with open('ingest_data/db_20201025', 'wb') as pickle_file:
    pickle.dump(db, pickle_file)

In [None]:
# Display the nasa_led column of the campaign table for a visual check.
db['campaign']['nasa_led']

In [None]:
# Names of the tables still present in db whose name mentions 'gcmd'.
[name for name in db.keys() if 'gcmd' in name]