# Imports

In [1]:
import pandas as pd
import json
import validate
from general import correct_values, many_to_many, many_cols, filter_gcmd_tables
import ingest
import clean

In [2]:
def print_full(df):
    """Print ``df`` without pandas truncating rows or columns.

    The row and column display limits are lifted only inside the
    ``pd.option_context`` block, so they apply to this one ``print`` call.
    """
    unlimited_display = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*unlimited_display):
        print(df)

In [3]:
# def correct_values(db, table_name, column, wrong_value, correct_value):
#     db[table_name][column]=db[table_name][column].apply(lambda x: x if x!=wrong_value else correct_value)

# Pull Data into Pandas

In [4]:
# run the ingest step on the inventory workbook; the next cell reads the
# 'excel_data' and 'database' entries out of the returned object
data = ingest.main('inventory_data/inventory_2020.07.29.xlsx')

In [None]:
# split the ingest result into its two parts; db is indexed by table name
# throughout the rest of the notebook
excel_data, db = data['excel_data'], data['database']

# Cleaning

In [None]:
# make sure the gcmd_phenomena codes are stored as strings
db['gcmd_phenomena']['ignore_code'] = db['gcmd_phenomena']['ignore_code'].apply(str)

# # convert gcmd_uuid into string
# db['instrument']['table-measurement_keywords-gcmd_uuid'] = db['instrument']['table-measurement_keywords-gcmd_uuid'].apply(lambda x: str(x))

# likewise stringify the phenomena codes listed on the instrument table
# (presumably so both sides of the instrument -> gcmd_phenomena link compare as text)
db['instrument']['table-gcmd_phenomena-ignore_code'] = db['instrument']['table-gcmd_phenomena-ignore_code'].apply(str)

# remove extra NID from platform type
db['platform_type'] = db['platform_type'].loc[lambda table: table['short_name'] != 'NID']

In [None]:
# presumably drops columns that contain only NaN -- confirm in clean.py
db = clean.remove_NaN_columns(db)

In [None]:
# presumably strips surrounding whitespace from the tables' values/labels -- confirm in clean.py
db = clean.strip_all_columns(db)

In [None]:
# NOTE(review): 'NID' handling is defined in clean.replace_nid; its exact replacement rule is not visible here
db = clean.replace_nid(db)

# Short Name Supplementation

In [None]:
# make unique shortname by combining the campaign name and the table sub short name


def _joined(table, left, right):
    """Return the element-wise concatenation table[left] + '_' + table[right]."""
    return table[left] + '_' + table[right]


db['deployment']['short_name'] = _joined(
    db['deployment'], 'foreign-campaign-short_name', 'ignore_deployment_id')
db['iopse']['foreign-deployment-short_name'] = _joined(
    db['iopse'], 'foreign-campaign-short_name', 'ignore_deployment')

# collection_period is derived from the 'linking' table, broken out by instrument
db['collection_period'] = many_to_many(db, 'linking', 'table-instrument-short_name', keep_all=True)
db['collection_period']['short_name'] = _joined(
    db['collection_period'], 'foreign-campaign-short_name', 'foreign-deployment-short_name')

# Find Matching Deployments for IOPSE

In [None]:
# original process before good shortname usage on the sheets


# db['iopse']['deployment_short_name'] = 'No Matches'

# for row in range(len(db['iopse'])):
#     iop_start = db['iopse'].iloc[row]['start_date']
#     iop_end = db['iopse'].iloc[row]['end_date']
#     iop_camp = db['iopse'].iloc[row]['foreign-campaign-short_name']
    
#     campaign_filter = db['deployment']['foreign-campaign-short_name'].apply(lambda short_name: short_name == iop_camp)
#     possible_campaigns = db['deployment'][campaign_filter]
    
#     start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
#     end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
#     matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
#     if len(matching_deployments)>1:
#         print(f"error on {db['iopse'].iloc[row]}")
#     elif len(matching_deployments)==1:
#         matching_deployments = matching_deployments[0]
#     else:
#         matching_deployments = 'None Found'
#     db['iopse']['deployment_short_name'].iloc[row]=matching_deployments

In [None]:
# filter out missing rows on the iopse tab
# .copy() makes the result an independent frame: later cells assign columns on
# db['iopse'], which on a filtered slice triggers pandas' SettingWithCopyWarning
db['iopse'] = db['iopse'][db['iopse']['short_name'] != 'Information Not Available'].copy()

In [None]:
# test for unexpected values in this column: the set of types must be exactly
# {'IOP', 'SE'}; on failure the message says which values are unexpected or missing
expected_iopse_types = {'IOP', 'SE'}
found_iopse_types = set(db['iopse']['type'])
assert found_iopse_types == expected_iopse_types, (
    f"iopse 'type' values differ from {expected_iopse_types}: "
    f"unexpected={found_iopse_types - expected_iopse_types}, "
    f"missing={expected_iopse_types - found_iopse_types}"
)

In [None]:
# lower-case both name columns so parent and child short names match
# regardless of capitalisation
for name_column in ('short_name', 'parent short_name'):
    db['iopse'][name_column] = db['iopse'][name_column].apply(lambda name: name.lower())

In [None]:
# split the combined iopse table by type
# .copy() gives each table its own data: db['significant_event'] later gets a
# 'has_iop' column assigned, which on a filtered slice raises SettingWithCopyWarning
db['iop'] = db['iopse'][db['iopse']['type'] == 'IOP'].copy()
db['significant_event'] = db['iopse'][db['iopse']['type'] == 'SE'].copy()

# Campaign Filter

In [None]:
from general import filter_campaigns, log_short_names

In [None]:
# load the list of campaigns to ingest; the with-block closes the file handle,
# which a bare open() passed straight to json.load would leave open
with open('config/ingest_campaign_list.json', 'r') as campaign_list_file:
    ingest_campaign_list = json.load(campaign_list_file)

In [None]:
# presumably restricts db to the campaigns named in ingest_campaign_list -- confirm in general.filter_campaigns
db = filter_campaigns(db, ingest_campaign_list)

In [None]:
# log the short names of the instrument and platform tables (in that order)
for logged_table in ('instrument', 'platform'):
    log_short_names(db, logged_table)

# Many to Many Creation

In [None]:
# tables whose 'table' columns are expanded into '<table>-to-<name>' link tables in the next cell
main_table_names = ['campaign', 'platform', 'instrument', 'deployment']

In [None]:
# for every main table, turn each string-labelled column containing 'table'
# into its own link table named '<table>-to-<second dash-separated part>'
for table in main_table_names:
    print(table)
    link_columns = [
        col for col in db[table].keys()
        if isinstance(col, str) and 'table' in col
    ]
    for column in link_columns:
        name = column.split('-')[1]
        new_table_name = f"{table}-to-{name}"
        db[new_table_name] = many_to_many(db, table, column)
        print(f'   {new_table_name} created')

# Validation

### Short Name Duplicates

In [None]:
# print the duplicate short_name report for every table that has a short_name column
for table_name in db.keys():
    # collection_period has been broken out by instrument, so duplicate
    # short_names are expected there and the table is skipped
    if table_name == 'collection_period':
        continue

    if 'short_name' not in db[table_name].keys():
        continue

    print(table_name)
    duplicates = validate.find_duplicates(db, table_name, 'short_name')

    print(f'    {duplicates}')

### Foreign Key Links

In [None]:
# check campaign-to-gcmd_project.gcmd_project against gcmd_project.gcmd_uuid
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-gcmd_project',
    data_index='campaign',
    data_column='gcmd_project',
    foriegn_table='gcmd_project',
    foriegn_column='gcmd_uuid',
)
errors

In [None]:
# check campaign-to-focus_area.focus_area against focus_area.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-focus_area',
    data_index='campaign',
    data_column='focus_area',
    foriegn_table='focus_area',
    foriegn_column='short_name',
)
errors

In [None]:
# check campaign-to-season.season against season.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-season',
    data_index='campaign',
    data_column='season',
    foriegn_table='season',
    foriegn_column='short_name',
)
errors

In [None]:
# check campaign-to-platform_type.platform_type against platform_type.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-platform_type',
    data_index='campaign',
    data_column='platform_type',
    foriegn_table='platform_type',
    foriegn_column='short_name',
)
errors

In [None]:
# check campaign-to-geophysical_concept.geophysical_concept against
# geophysical_concept.short_name (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-geophysical_concept',
    data_index='campaign',
    data_column='geophysical_concept',
    foriegn_table='geophysical_concept',
    foriegn_column='short_name',
)
errors

In [None]:
# check campaign-to-repository.repository against repository.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-repository',
    data_index='campaign',
    data_column='repository',
    foriegn_table='repository',
    foriegn_column='short_name',
)
errors

In [None]:
# check campaign-to-partner_org.partner_org against partner_org.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='campaign-to-partner_org',
    data_index='campaign',
    data_column='partner_org',
    foriegn_table='partner_org',
    foriegn_column='short_name',
)
errors

In [None]:
# check platform-to-gcmd_platform.gcmd_platform against gcmd_platform.gcmd_uuid
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='platform-to-gcmd_platform',
    data_index='platform',
    data_column='gcmd_platform',
    foriegn_table='gcmd_platform',
    foriegn_column='gcmd_uuid',
)
errors

In [None]:
# check instrument-to-gcmd_phenomena.gcmd_phenomena against
# gcmd_phenomena.ignore_code (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='instrument-to-gcmd_phenomena',
    data_index='instrument',
    data_column='gcmd_phenomena',
    foriegn_table='gcmd_phenomena',
    foriegn_column='ignore_code',
)
errors

In [None]:
# check instrument-to-gcmd_instrument.gcmd_instrument against
# gcmd_instrument.gcmd_uuid (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='instrument-to-gcmd_instrument',
    data_index='instrument',
    data_column='gcmd_instrument',
    foriegn_table='gcmd_instrument',
    foriegn_column='gcmd_uuid',
)
errors

In [None]:
# swap one gcmd_instrument UUID for another in the instrument-to-gcmd_instrument
# link table (correct_values replaces wrong_value with correct_value in the
# given column, per the commented-out reference implementation near the top)
correct_values(
    db=db,
    table_name='instrument-to-gcmd_instrument',
    column='gcmd_instrument',
    wrong_value='92f99316-b581-4adb-9980-aeb6bed64eee',
    correct_value='6238f3e2-9a87-4e32-b866-c4a637094b51',
)

In [None]:
# check instrument-to-instrument_type.instrument_type against
# instrument_type.short_name (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='instrument-to-instrument_type',
    data_index='instrument',
    data_column='instrument_type',
    foriegn_table='instrument_type',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'instrument-to-instrument_type',
#     column = 'instrument_type',
#     wrong_value = 'Passive - Remote Sensing',
#     correct_value = 'Remote - Passive')
# correct_values(
#     db=db,
#     table_name = 'instrument-to-instrument_type',
#     column = 'instrument_type',
#     wrong_value = 'Earth Remote Sensing - Active Remote Sensing',
#     correct_value = 'Remote - Active')

In [None]:
# list the table names in db that contain 'geo'
[key for key in db.keys() if 'geo' in key]

In [None]:
# errors = validate.foriegn_keys(db, 
#                                 data_table='instrument-to-geophysical_concept', 
#                                 data_index='instrument', 
#                                 data_column='geophysical_concept', 
#                                 foriegn_table='geophysical_concept', 
#                                 foriegn_column='short_name')
# errors

In [None]:
# check instrument-to-repository.repository against repository.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='instrument-to-repository',
    data_index='instrument',
    data_column='repository',
    foriegn_table='repository',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'instrument-to-repository',
#     column = 'repository',
#     wrong_value = 'ORNL',
#     correct_value = 'OB.DAAC')

In [None]:
# check instrument-to-measurement_region.measurement_region against
# measurement_region.short_name (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='instrument-to-measurement_region',
    data_index='instrument',
    data_column='measurement_region',
    foriegn_table='measurement_region',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'instrument-to-measurement_region',
#     column = 'measurement_region',
#     wrong_value = 'Troposphere',
#     correct_value = 'mid-troposphere')
# correct_values(
#     db=db,
#     table_name = 'instrument-to-measurement_region',
#     column = 'measurement_region',
#     wrong_value = 'troposphere',
#     correct_value = 'mid-troposphere')
# correct_values(
#     db=db,
#     table_name = 'instrument-to-measurement_region',
#     column = 'measurement_region',
#     wrong_value = 'subsurface',
#     correct_value = 'subsurface - water')

In [None]:
# db['instrument-to-measurement_region'][db['instrument-to-measurement_region']['measurement_region']=='subsurface - water']

In [None]:
# check deployment-to-geographical_region.geographical_region against
# geographical_region.short_name (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='deployment-to-geographical_region',
    data_index='deployment',
    data_column='geographical_region',
    foriegn_table='geographical_region',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'deployment-to-geographical_region',
#     column = 'geographical_region',
#     wrong_value = '',
#     correct_value = 'continental')

In [None]:
# errors = validate.foriegn_keys(db, 
#                                 data_table='deployment-to-platform', 
#                                 data_index='deployment', 
#                                 data_column='platform', 
#                                 foriegn_table='platform', 
#                                 foriegn_column='short_name')
# print('\n\ndo I really need to validate this?')
# errors

In [None]:
# check deployment.'foreign-campaign-short_name' against campaign.short_name
# (matching rules are those of validate.foriegn_keys); the printed TODO marks
# this check as still to be revisited
errors = validate.foriegn_keys(
    db,
    data_table='deployment',
    data_index='short_name',
    data_column='foreign-campaign-short_name',
    foriegn_table='campaign',
    foriegn_column='short_name',
)
print('\n\n TODO this better once you have the data')
errors

In [None]:
# check platform.'foreign-platform_type-short_name' against
# platform_type.short_name (matching rules are those of
# validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='platform',
    data_index='short_name',
    data_column='foreign-platform_type-short_name',
    foriegn_table='platform_type',
    foriegn_column='short_name',
)
errors

In [None]:
# check platform-to-gcmd_platform.gcmd_platform against gcmd_platform.gcmd_uuid
# NOTE(review): this call has the same arguments as the platform-to-gcmd_platform
# check earlier in this section -- confirm whether the repeat is intentional
errors = validate.foriegn_keys(
    db,
    data_table='platform-to-gcmd_platform',
    data_index='platform',
    data_column='gcmd_platform',
    foriegn_table='gcmd_platform',
    foriegn_column='gcmd_uuid',
)
errors

In [None]:
# check iop.'foreign-deployment-short_name' against deployment.short_name
# (matching rules are those of validate.foriegn_keys); display the result
errors = validate.foriegn_keys(
    db,
    data_table='iop',
    data_index='short_name',
    data_column='foreign-deployment-short_name',
    foriegn_table='deployment',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'iop',
#     column = 'foreign-deployment-short_name',
#     wrong_value = 'AirMOSS_deb_2014b',
#     correct_value = 'AirMOSS_dep_2014b')

In [None]:
# check significant_event.'foreign-deployment-short_name' against
# deployment.short_name (matching rules are those of validate.foriegn_keys);
# display the result
errors = validate.foriegn_keys(
    db,
    data_table='significant_event',
    data_index='short_name',
    data_column='foreign-deployment-short_name',
    foriegn_table='deployment',
    foriegn_column='short_name',
)
errors

### Flight

In [None]:
# flight table, instruments
# check collection_period.instrument against instrument.short_name; the
# resulting `errors` is what the next cell drops from collection_period
errors = validate.foriegn_keys(
    db,
    data_table='collection_period',
    data_index='foreign-campaign-short_name',
    data_column='instrument',
    foriegn_table='instrument',
    foriegn_column='short_name',
)
errors

In [None]:
# run with caution: indiscriminately removes every collection_period row whose
# index label appears in the current `errors` result
db['collection_period'].drop(index=list(errors.index), inplace=True)

In [None]:
# db['collection_period']=db['collection_period'][db['collection_period']['instrument']!='NAWX radar']

In [None]:
# Flight table, platforms
# check collection_period.'foreign-platform-short_name' against
# platform.short_name (matching rules are those of validate.foriegn_keys)
errors = validate.foriegn_keys(
    db,
    data_table='collection_period',
    data_index='foreign-campaign-short_name',
    data_column='foreign-platform-short_name',
    foriegn_table='platform',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'collection_period',
#     column = 'foreign-platform-short_name',
#     wrong_value = 'UND Citation II',
#     correct_value = 'Citation')
# db['collection_period']=db['collection_period'][db['collection_period']['foreign-platform-short_name']!='Field_Site']

In [None]:
# I think to ignore this???
# collection_period['short_name'] is built earlier as
# '<campaign short name>_<foreign-deployment-short_name>', and it is that
# combined value which is compared with deployment.short_name here
errors = validate.foriegn_keys(
    db,
    data_table='collection_period',
    data_index='foreign-campaign-short_name',
    data_column='short_name',  # alternative kept for reference: 'foreign-deployment-short_name'
    foriegn_table='deployment',
    foriegn_column='short_name',
)
errors

In [None]:
# correct_values(
#     db=db,
#     table_name = 'collection_period',
#     column = 'short_name',
#     wrong_value = 'OLYMPEX_dep_2016',
#     correct_value = 'OLYMPEX_dep_2015')

# db['collection_period']=db['collection_period'][db['collection_period']['short_name']!='Citation']

# Dates

In [None]:
# print each table that has date fields, followed by the names of those fields
for table_name in db.keys():
    # only string labels can be searched for 'date'; the isinstance guard
    # mirrors the many-to-many cell, which skips non-string column labels for
    # the same reason ('date' in <int> would raise TypeError)
    if col_names := [col for col in db[table_name].columns
                     if isinstance(col, str) and 'date' in col]:
        print(f"{table_name}\n    {', '.join(col_names)}")

In [None]:
table_names = ['campaign', 'deployment', 'iopse']
for table_name in table_names:
    print(table_name)
    # flag every row with the result of validate.vali_date(start_date, end_date);
    # the column is created by this single assignment, so the former
    # placeholder initialisation to False (immediately overwritten) is dropped
    db[table_name]['valid_date'] = db[table_name].apply(
        lambda row: validate.vali_date(row['start_date'], row['end_date']),
        axis=1,
    )
        

In [None]:
# this field no longer exists?

# table_names = ['instrument'] 
# for table_name in table_names:
    
#     db[table_name]['valid_date']=False
#     db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [None]:
# show the campaigns whose valid_date flag is falsy, with their date columns
invalid_dates = db['campaign']['valid_date'].apply(lambda is_valid: not is_valid)
db['campaign'][['short_name', 'start_date', 'end_date', 'valid_date']][invalid_dates]

# IOPSE

In [None]:
# display the iop table for inspection
db['iop']

In [None]:
# validate that all iops are unique: the displayed Series holds every repeated
# short_name (second and later occurrences) and should be empty
iop_short_names = db['iop']['short_name']
iop_short_names[iop_short_names.duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have a matching iop, delete it so that no broken foreign key is left behind

In [None]:
# tag significant events that have IOPs
# isin() does one vectorised membership test instead of rebuilding the list of
# IOP names for every significant-event row
# NOTE(review): earlier cells only show 'short_name' / 'parent short_name' on
# the iopse-derived tables -- confirm that an 'iop_short_name' column exists
db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].isin(db['iop']['iop_short_name'])


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [None]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    """Return whether a significant event starts on or after its IOP's start.

    ``sig_row`` supplies 'start_date' and 'iop_short_name'; the IOP is the
    first row of ``db['iop']`` whose 'iop_short_name' equals the event's.
    Raises IndexError (from ``.iloc[0]``) when no IOP row matches.
    """
    event_start = sig_row['start_date']
    iops = db['iop']
    matching_iops = iops[iops['iop_short_name'] == sig_row['iop_short_name']]
    iop_start = matching_iops['start_date'].iloc[0]
    return event_start >= iop_start

def end_val(sig_row, db):
    """Return whether a significant event ends on or before its IOP's end.

    ``sig_row`` supplies 'end_date' and 'iop_short_name'; the IOP is the
    first row of ``db['iop']`` whose 'iop_short_name' equals the event's.
    Raises IndexError (from ``.iloc[0]``) when no IOP row matches.
    """
    event_end = sig_row['end_date']
    iops = db['iop']
    matching_iops = iops[iops['iop_short_name'] == sig_row['iop_short_name']]
    iop_end = matching_iops['end_date'].iloc[0]
    return event_end <= iop_end

# independent copy of the significant events flagged as having an IOP
has_iop = db['significant_event'].loc[db['significant_event']['has_iop']].copy()
# NOTE: despite their names, both masks are True where the date check FAILS
val_iop_date_start = has_iop.apply(lambda event: not start_val(event, db), axis=1)
val_iop_date_end = has_iop.apply(lambda event: not end_val(event, db), axis=1)

In [None]:
# display incorrect start dates: significant events that start before their IOP starts
has_iop[val_iop_date_start]

In [None]:
# display incorrect end dates: significant events that end after their IOP ends
has_iop[val_iop_date_end]

# Filter GCMD Tables

In [None]:
# list the distinct platform-type short names referenced by platform_type
# that are missing from platform_type.short_name
# NOTE(review): 'foreign-platform_type-short_name' is validated on the
# `platform` table elsewhere -- confirm platform_type is the intended table
links = list(set(db['platform_type']['foreign-platform_type-short_name']))
full = list(db['platform_type']['short_name'])
[link for link in links if link not in full]

In [None]:
# NOTE(review): filtering rule lives in general.filter_gcmd_tables; presumably trims the gcmd_* tables -- confirm
db = filter_gcmd_tables(db)

### Pickle the Data

In [None]:
import pickle

In [None]:
# write the cleaned database to disk; the with-block closes (and so flushes)
# the file, which a bare open() passed straight to pickle.dump never did
with open('ingest_data/db_10_camp', 'wb') as pickle_file:
    pickle.dump(db, pickle_file)

In [None]:
# display the campaign table's nasa_led column for inspection
db['campaign']['nasa_led']

In [None]:
# list the table names in db that contain 'gcmd'
[key for key in db.keys() if 'gcmd' in key]