# Imports

In [1]:
import pandas as pd
import json
import validate
from general import many_to_many, many_cols
import ingest
import clean

# Pull Data into Pandas

In [2]:
data = ingest.main('Updated Inventory.xlsx')

In [3]:
excel_data = data['excel_data']
db = data['database']

# Cleaning

In [4]:
db = clean.remove_NaN_columns(db)

In [5]:
db = clean.strip_all_columns(db)

['short_name', 'TYPE (IOP or SE)', 'parent short_name', 'foreign_campaign_short_name', 'ADMG Deployment Name', 'description', 'region_description', 'published_list', 'reports', 'reference_file']


In [6]:
db['gcmd_phenomena']['table_code'] = db['gcmd_phenomena']['table_code'].apply(lambda x: str(x))

# Short Name Supplementation

In [22]:
# make unique shortname by combining the campaign name and the table sub short name

db['deployment']['short_name'] = db['deployment']['foreign_campaign_short_name']+'_'+db['deployment']['ignore_deployment_id']

db['collection_period'] = many_to_many(db, 'linking', 'table_instrument', keep_all=True)
db['collection_period']['foreign_campaign_short_name'] = db['collection_period']['foreign_campaign_short_name']+'_'+db['collection_period']['foreign_deployment']

# Find Matching Deployments for IOPSE

In [11]:
db['iopse']['deployment_short_name'] = 'No Matches'

for row in range(len(db['iopse'])):
    iop_start = db['iopse'].iloc[row]['start_date']
    iop_end = db['iopse'].iloc[row]['end_date']
    iop_camp = db['iopse'].iloc[row]['foreign_campaign_short_name']
    
    campaign_filter = db['deployment']['foreign_campaign_short_name'].apply(lambda short_name: short_name == iop_camp)
    possible_campaigns = db['deployment'][campaign_filter]
    
    start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
    end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
    matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
    if len(matching_deployments)>1:
        print(f"error on {db['iopse'].iloc[row]}")
    elif len(matching_deployments)==1:
        matching_deployments = matching_deployments[0]
    else:
        matching_deployments = 'None Found'
    db['iopse']['deployment_short_name'].iloc[row]=matching_deployments

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Many to Many Creation

In [12]:
main_table_names = ['campaign', 'platform', 'instrument', 'deployment', 'iopse']

In [13]:
for table in main_table_names:
    print(table)
    for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
        new_table_name = f"{table}_to_{column.replace('table_','')}"
        db[new_table_name]=many_to_many(db, table, column)
        print(f'   {new_table_name} created')

campaign
   campaign_to_alias created
   campaign_to_focus_area created
   campaign_to_season created
   campaign_to_platform_type created
   campaign_to_gcmd_phenomenas_uuid created
   campaign_to_repository created
   campaign_to_partner_org created
   campaign_to_gcmd_project created
platform
   platform_to_aliases created
   platform_to_gcmd_platform_uuid created
instrument
   instrument_to_gcmd_instrument created
   instrument_to_instrument_type created
   instrument_to_aliases created
   instrument_to_measurement_keywords_uuid created
   instrument_to_geophysical_concept created
   instrument_to_repository created
   instrument_to_measurement_region created
deployment
   deployment_to_aliases created
   deployment_to_geographical_region created
iopse


In [21]:
db['gcmd_phenomena']

5,table_code,category,topic,term,variable_1,variable_2,variable_3,gcmd_uuid
6,1000,EARTH SCIENCE,Information Not Available,Information Not Available,Information Not Available,Information Not Available,Information Not Available,e9f67a66-e9fc-435c-b720-ae32a2c3d8f5
7,1001,EARTH SCIENCE,AGRICULTURE,AGRICULTURAL AQUATIC SCIENCES,AQUACULTURE,Information Not Available,Information Not Available,8916dafb-5ad5-45c6-ab64-3500ea1e9577
8,1002,EARTH SCIENCE,AGRICULTURE,AGRICULTURAL AQUATIC SCIENCES,FISHERIES,Information Not Available,Information Not Available,c7112a64-be39-414a-9125-f63ab44ecb5b
9,1003,EARTH SCIENCE,AGRICULTURE,AGRICULTURAL AQUATIC SCIENCES,TEST,Information Not Available,Information Not Available,0916afef-a0b7-4ecd-85ba-cc24070470a7
10,1004,EARTH SCIENCE,AGRICULTURE,AGRICULTURAL AQUATIC SCIENCES,Information Not Available,Information Not Available,Information Not Available,ca227ff0-4742-4e51-a763-4582fa28291c
...,...,...,...,...,...,...,...,...
3022,4016,EARTH SCIENCE,TERRESTRIAL HYDROSPHERE,WATER QUALITY/WATER CHEMISTRY,WATER QUALITY INDEXES,Information Not Available,Information Not Available,f2130ca3-3587-4312-b6d4-138456b5ea78
3023,4017,EARTH SCIENCE,TERRESTRIAL HYDROSPHERE,WATER QUALITY/WATER CHEMISTRY,Information Not Available,Information Not Available,Information Not Available,8c02f5d1-ce86-4bf5-84d5-b3496cdba6ad
3024,4018,EARTH SCIENCE,TERRESTRIAL HYDROSPHERE,Information Not Available,Information Not Available,Information Not Available,Information Not Available,885735f3-121e-4ca0-ac8b-f37dbc972f03
3025,1000,EARTH SCIENCE,Information Not Available,Information Not Available,Information Not Available,Information Not Available,Information Not Available,e9f67a66-e9fc-435c-b720-ae32a2c3d8f5


# Validation

### Short Name Duplicates

In [None]:
for table_name in db.keys():
    if 'short_name' in db[table_name].keys():
        print(table_name)
        duplicates = validate.find_duplicates(db, table_name, 'short_name')

        print(f'    {duplicates}')

### Foriegn Key Links

### Campaign

In [None]:
pd.set_option('display.max_rows', 500)

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_gcmd_project', 
                                data_index='campaign', 
                                data_column='gcmd_project', 
                                foriegn_table='gcmd_project', 
                                foriegn_column='uuid')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_focus_area', 
                                data_index='campaign', 
                                data_column='focus_area', 
                                foriegn_table='focus_area', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_season', 
                                data_index='campaign', 
                                data_column='season', 
                                foriegn_table='season', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_platform_type', 
                                data_index='campaign', 
                                data_column='platform_type', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_gcmd_phenomenas_uuid', 
                                data_index='campaign', 
                                data_column='gcmd_phenomenas_uuid', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='code')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_repository', 
                                data_index='campaign', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign_to_partner_org', 
                                data_index='campaign', 
                                data_column='partner_org', 
                                foriegn_table='partner_org', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='platform_to_gcmd_platform_uuid', 
                                data_index='platform', 
                                data_column='gcmd_platform_uuid', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='uuid')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_measurement_keywords_uuid', 
                                data_index='instrument', 
                                data_column='measurement_keywords_uuid', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='code')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_gcmd_instrument', 
                                data_index='instrument', 
                                data_column='gcmd_instrument', 
                                foriegn_table='gcmd_instrument', 
                                foriegn_column='uuid')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_instrument_type', 
                                data_index='instrument', 
                                data_column='instrument_type', 
                                foriegn_table='instrument_type', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_geophysical_concept', 
                                data_index='instrument', 
                                data_column='geophysical_concept', 
                                foriegn_table='geophysical_concept', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_repository', 
                                data_index='instrument', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument_to_measurement_region', 
                                data_index='instrument', 
                                data_column='measurement_region', 
                                foriegn_table='measurement_region', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment_to_geographical_region', 
                                data_index='deployment', 
                                data_column='geographical_region', 
                                foriegn_table='geographical_region', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment_to_platform', 
                                data_index='deployment', 
                                data_column='platform', 
                                foriegn_table='platform', 
                                foriegn_column='short_name')
print('\n\ndo I really need to validate this?')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment', 
                                data_index='short_name', 
                                data_column='foriegn_campaign_short_name', 
                                foriegn_table='campaign', 
                                foriegn_column='short_name')
print('\n\n TODO this better once you have the data')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='platform', 
                                data_index='short_name', 
                                data_column='foriegn_aircraft_type', 
                                foriegn_table='aircraft_type', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='platform_to_gcmd_platform_uuid', 
                                data_index='platform', 
                                data_column='gcmd_platform_uuid', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='uuid')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='deployment_short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='event_type', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

### Flight

In [None]:
# flight table, instruments

errors = validate.foriegn_keys(db, data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='instrument', 
                      foriegn_table='instrument', 
                      foriegn_column='short_name')
errors

In [None]:
# Flight table, platforms

errors = validate.foriegn_keys(db, 
                      data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='platform', 
                      foriegn_table='platform', 
                      foriegn_column='short_name')
errors

In [None]:
errors = validate.foriegn_keys(db, 
                      data_table='flight', 
                      data_index='foriegn_campaign_short_name', 
                      data_column='foriegn_deployment_short_name', 
                      foriegn_table='deployment', 
                      foriegn_column='short_name')
errors

# Dates

In [None]:
for table_name in db.keys():
    # find tables that have date fields
    if len(col_names := [col for col in db[table_name].columns if 'date' in col])>0:
        print(f"{table_name}\n    {', '.join(col_names)}")

In [None]:
table_names = ['campaign', 'deployment', 'iopse'] 
for table_name in table_names:
    print(table_name)
    db[table_name]['valid_date']=False
    
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

In [None]:
table_names = ['instrument'] 
for table_name in table_names:
    
    db[table_name]['valid_date']=False
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [None]:
db['campaign'][['short_name','start_date','end_date','valid_date']][db['campaign']['valid_date'].apply(lambda x: not(x))]

# IOPSE

In [None]:
# FILTER 
# TODO: FIX GRIP
db['iopse']=db['iopse'][db['iopse']['iopse_id']!='Information Not Available'][db['iopse']['foriegn_campaign_short_name']!='GRIP'][db['iopse']['foriegn_campaign_short_name']!='ATTREX']


In [None]:
# split into two tables

db['significant_event'] = db['iopse'][db['iopse']['event_type']=='Y'].copy()
db['iop'] = db['iopse'][db['iopse']['event_type']!='Y'].copy()

In [None]:
# validate that all iops are unique

db['iop']['iop_short_name'][db['iop']['iop_short_name'].duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foriegn key

In [None]:
# tag significant events that have IOPs

db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(lambda short_name: short_name in list(db['iop']['iop_short_name']))


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [None]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    large_date = sig_row['start_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['start_date'].iloc[0]
    val = large_date >= small_date
    return val

def end_val(sig_row, db):
    large_date = sig_row['end_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['end_date'].iloc[0]
    val = large_date <= small_date
    return val

has_iop = db['significant_event'][db['significant_event']['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda row: not(start_val(row, db)), axis=1)
val_iop_date_end = has_iop.apply(lambda row: not(end_val(row, db)), axis=1)

In [None]:
# display incorrect start dates
has_iop[val_iop_date_start]

In [None]:
# display incorrect end dates
has_iop[val_iop_date_end]

## Remap Limited Fields  I SHOULD DELETE THIS IT RUNS ELSEWHERE

In [None]:
from ingest import rename_columns

In [None]:
column_mapping = json.load(open('limited_col_mapping.json', 'r'))

In [None]:
# remaps limited field names
ingest_order = [
 'platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',
 'gcmd_phenomena',
 'gcmd_instrument',
 'gcmd_platform',
 'gcmd_project',
 'partner_org'
]
tables_to_remap = ['platform_type',
 'aircraft_type',
 'instrument_type',
 'home_base',
 'repository',
 'focus_area',
 'season',
 'measurement_region',
 'geographical_region',
 'geophysical_concept',]

for table_name in tables_to_remap:
    db[table_name] = rename_columns(db, table_name, column_mapping)

### Ingest Order

In [None]:
ingest_order = json.load(open('ingest_order.json', 'r'))
ingest_order

### Pickle the Data

In [None]:
import pickle

In [None]:
pickle.dump(db, open('ingested_data','wb'))