# Imports

In [1]:
import pandas as pd
import json
import validate
from general import many_to_many, many_cols
import ingest
import clean

# Pull Data into Pandas

In [2]:
data = ingest.main('fresh data.xlsx')

In [3]:
excel_data = data['excel_data']
db = data['database']

# Cleaning

In [4]:
db = clean.remove_NaN_columns(db)

In [5]:
db = clean.strip_all_columns(db)

In [6]:
# change ignore_code field to be string
db['gcmd_phenomena']['ignore_code'] = db['gcmd_phenomena']['ignore_code'].apply(lambda x: str(x))

# Short Name Supplementation

In [7]:
# make unique shortname by combining the campaign name and the table sub short name

db['deployment']['short_name'] = db['deployment']['foreign-campaign-short_name']+'_'+db['deployment']['ignore_deployment_id']

db['collection_period'] = many_to_many(db, 'linking', 'table-instrument-short_name', keep_all=True)
db['collection_period']['short_name'] = db['collection_period']['foreign-campaign-short_name']+'_'+db['collection_period']['foreign-deployment-short_name']

# Find Matching Deployments for IOPSE

In [8]:
# original process before good shortname usage on the sheets


# db['iopse']['deployment_short_name'] = 'No Matches'

# for row in range(len(db['iopse'])):
#     iop_start = db['iopse'].iloc[row]['start_date']
#     iop_end = db['iopse'].iloc[row]['end_date']
#     iop_camp = db['iopse'].iloc[row]['foreign-campaign-short_name']
    
#     campaign_filter = db['deployment']['foreign-campaign-short_name'].apply(lambda short_name: short_name == iop_camp)
#     possible_campaigns = db['deployment'][campaign_filter]
    
#     start_filter = possible_campaigns['start_date'].apply(lambda dep_start: validate.vali_date(dep_start, iop_start))
#     end_filter = possible_campaigns['end_date'].apply(lambda dep_end: validate.vali_date(iop_end, dep_end))
    
#     matching_deployments = list(possible_campaigns[start_filter*end_filter]['short_name'])
#     if len(matching_deployments)>1:
#         print(f"error on {db['iopse'].iloc[row]}")
#     elif len(matching_deployments)==1:
#         matching_deployments = matching_deployments[0]
#     else:
#         matching_deployments = 'None Found'
#     db['iopse']['deployment_short_name'].iloc[row]=matching_deployments

In [9]:
# test for unexpected values in this column
assert set(db['iopse']['type']) == {'IOP', 'SE'}

In [10]:
db['iop'] = db['iopse'][db['iopse']['type']=='IOP']
db['significant_event'] = db['iopse'][db['iopse']['type']=='SE']

# Many to Many Creation

In [11]:
# main_table_names = ['campaign', 'platform', 'instrument', 'deployment']

In [12]:
# for table in main_table_names:
#     print(table)
#     for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
#         name = column.split('-')[1]
#         new_table_name = f"{table}-to-{name}"
#         db[new_table_name]=many_to_many(db, table, column)
#         print(f'   {new_table_name} created')

# Campaign Filter

In [13]:
campaign_filter = [
    "ACES",
    "AirMOSS",
    "ARCTAS",
    "CARVE",
    "DC3",
    "GCPEx",
    "GOES-R PLT",
    "GRIP",
    "HS3",
    "OLYMPEX"
]

In [14]:
# campaigns
db['campaign'] = db['campaign'][db['campaign']['short_name'].apply(lambda x: x in campaign_filter)]
assert set(campaign_filter) == set(db['campaign']['short_name'])

In [15]:
# deployments
db['deployment']=db['deployment'][db['deployment']['foreign-campaign-short_name'].apply(lambda short: short in campaign_filter)]

In [16]:
# collection periods
db['collection_period']=db['collection_period'][db['collection_period']['foreign-campaign-short_name'].apply(lambda short: short in campaign_filter)]

In [17]:
# platforms
platform_filter = list(set(list(db['collection_period']['foreign-platform-short_name'])))
db['platform']=db['platform'][db['platform']['short_name'].apply(lambda short: short in platform_filter)]
print('\ncopy these platforms into a file for inventory folks\n')
[print(thing) for thing in platform_filter]


copy these platforms into a file for inventory folks

WB-57f
C-23 Sherpa
CV-580
G-III
Falcon
G-V
Field_Site
UND Citation II
ER-2
ASO
ALTUS II
P-3
DC-8
B-200
GH
WB-57
Citation
ALAR


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [18]:
instrument_filter = list(set(list(db['collection_period']['instrument'])))
db['instrument']=db['instrument'][db['instrument']['short_name'].apply(lambda short: short in instrument_filter)]
print('\ncopy these instruments into a file for inventory folks\n')
[print(thing) for thing in instrument_filter]


copy these instruments into a file for inventory folks

CIT-CIMS
PSAP
PILS
HIWRAP
CSD CL
DFGAS
RSP
Electric Field Mill
NAWX radar
AMS
Nevzorov probe
CCN
CPSD
CI-ITMS
ATLAS
WS-CRDS
Dropsonde
PCASP
PI-Neph
PALMS
FTS
FPDS
CAFS
AirMoss
CPL
CDP
HSRL
Information Not Available
slow antenna
ISAF
SP-2
CSI
CNC
Canisters
LIP
CoSMIR
SMPS
CIMS
APR-2
P-CIMS
CCP
Gerdien Conductivity Probe
Gen-AtmsState
S-HIS
FEGS
MMS
CLH
EFCS (GSFC and MSFC versions)
TE49C
CPI
CAMS
SAGA
CIP
TDL
Gen-Chemistry
ATHOS
WCN
CAR
IR-CO2
DADS
PALS
2D-C/P
PTR-MS
GT-CIMS
OPC
DIAL-HSRL
TOGA
BAT probe
SR
LARGE
Aerolaser
King hot wire probe
TDMA
3V-CPI
Nephelometer
UHSAS
GCAS
Spectrometer
VCSEL
HR-AMS
DACOM
HARP
OAP-2G-P
DV
AVIRIS
Aethalometer
FGM
SPEC
SP2
PIP probe
GC-MS
Picarro
DASH
AVAPS
FLIR
DLH
HVPS-3
EXRAD
CRS
SSFR
PFP
FSSP
AVOCET
ATSP
MSC
RICE
DAWN
AOP
HAMSR
HIRAD
DIAL
LASE
BBR
PCAP
DOPS
HD-SP2
WAS
CPC
CAPS
Accelerometer
TD-LIF


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

# Many to Many Creation

In [19]:
main_table_names = ['campaign', 'platform', 'instrument', 'deployment']

In [20]:
for table in main_table_names:
    print(table)
    for column in [col for col in db[table].keys() if isinstance(col,str) and 'table' in col]:
        name = column.split('-')[1]
        new_table_name = f"{table}-to-{name}"
        db[new_table_name]=many_to_many(db, table, column)
        print(f'   {new_table_name} created')

campaign
   campaign-to-focus_area created
   campaign-to-season created
   campaign-to-platform_type created
   campaign-to-gcmd_phenomena created
   campaign-to-repository created
   campaign-to-partner_org created
   campaign-to-gcmd_project created
platform
   platform-to-gcmd_platform created
instrument
   instrument-to-gcmd_instrument created
   instrument-to-instrument_type created
   instrument-to-measurement_keywords created
   instrument-to-geophysical_concept created
   instrument-to-repository created
   instrument-to-measurement_region created
deployment
   deployment-to-geographical_region created


# Validation

### Short Name Duplicates

In [23]:
for table_name in db.keys():
    if table_name == 'collection_period':
        # this is being skipped because it has been broken out by instrument
        # and therefore it has duplicate short_names
        continue
        
    if 'short_name' in db[table_name].keys():
        print(table_name)
        duplicates = validate.find_duplicates(db, table_name, 'short_name')

        print(f'    {duplicates}')

platform_type
    []
home_base
    []
repository
    []
focus_area
    []
season
    []
instrument_type
    []
measurement_region
    []
geographical_region
    []
geophysical_concept
    ['informationnotavailable']
campaign
    []
platform
    []
instrument
    ['atlas']
deployment
    []
iopse
    []
gcmd_instrument
    ['atlas', 'informationnotavailable', 'wcr', nan, 'epic', 'opc', 'gnssreceiver', 'cris', 'aa', 'iris', 'particlespectrometers', 'ssies', 'icecube', 'aps']
gcmd_platform
    ['kingair', 'informationnotavailable', 'goes10', 'goes11', 'goes12', 'goes13', 'goes14', 'goes15', 'goes16', 'goes1', 'goes2', 'goes3', 'goes4', 'goes5', 'goes6', 'goes7', 'goes8', 'goes9', 'environmentalmodeling']
gcmd_project
    ['afsisclimate', 'camp', 'informationnotavailable', 'iodp', 'landsat7', 'mcmurdopredatorprey', 'notapplicable']
partner_org
    []
iop
    []
significant_event
    []


### Foriegn Key Links

### Campaign

In [24]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_project', 
                                data_index='campaign', 
                                data_column='gcmd_project', 
                                foriegn_table='gcmd_project', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,campaign,gcmd_project,suggestions


In [25]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-focus_area', 
                                data_index='campaign', 
                                data_column='focus_area', 
                                foriegn_table='focus_area', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,focus_area,suggestions


In [26]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-season', 
                                data_index='campaign', 
                                data_column='season', 
                                foriegn_table='season', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,season,suggestions


In [27]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-platform_type', 
                                data_index='campaign', 
                                data_column='platform_type', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,platform_type,suggestions
17,DC3,Ballons,[Balloons]
25,HS3,remote sensing,[]
26,HS3,in-situ,[]
28,HS3,ground-based platforms,[]


In [28]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-gcmd_phenomena', 
                                data_index='campaign', 
                                data_column='gcmd_phenomena', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

Unnamed: 0,campaign,gcmd_phenomena,suggestions


In [29]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-repository', 
                                data_index='campaign', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,repository,suggestions


In [30]:
errors = validate.foriegn_keys(db, 
                                data_table='campaign-to-partner_org', 
                                data_index='campaign', 
                                data_column='partner_org', 
                                foriegn_table='partner_org', 
                                foriegn_column='short_name')
errors

Unnamed: 0,campaign,partner_org,suggestions
2,ARCTAS,California Air Resources Board,[]
3,ARCTAS,International Polar Year,[]
11,HS3,Environment Canada,[]


In [31]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [32]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_keywords', 
                                data_index='instrument', 
                                data_column='measurement_keywords', 
                                foriegn_table='gcmd_phenomena', 
                                foriegn_column='ignore_code')
errors

Unnamed: 0,instrument,measurement_keywords,suggestions
41,CPL,0000,"[1000, 2000, 3000, 4000]"
65,GCAS,GEO-CAPE Airborne Simulator,[]


In [33]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-gcmd_instrument', 
                                data_index='instrument', 
                                data_column='gcmd_instrument', 
                                foriegn_table='gcmd_instrument', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,instrument,gcmd_instrument,suggestions
15,AVIRIS,d67afd03-3b79-419c-9289-5dde713ab904\n57854209...,[]
29,CIP,92f99316-b581-4adb-9980-aeb6bed64eee,[]
37,CPL,6238fe2-9a87-4e32-b866-c4a637094b51,[6238f3e2-9a87-4e32-b866-c4a637094b51]
56,EXRAD,a212d36d-2a4e-473f-b16a-6e2104b9dd8f\nba3de3fc...,[]


In [34]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-instrument_type', 
                                data_index='instrument', 
                                data_column='instrument_type', 
                                foriegn_table='instrument_type', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,instrument_type,suggestions
7,APR-2,Earth Remote Sensing - Active Remote Sensing,[]
12,AVAPS,Profilers/Sounders,[]
34,CPL,LIDAR,[]
53,FLIR,Earth Remote Sensing - Passive Remote Sensing,[]
62,HAMSR,Spectrometers/Radiometers,[In Situ - Spectrometer/Radiometer]
65,HIRAD,Spectrometers/Radiometers,[In Situ - Spectrometer/Radiometer]
66,HIWRAP,Earth Remote Sensing - Active Remote Sensing,[]
96,S-HIS,interferometer/sounder,[]
113,WS-CRDS,Earth Remote Sensing - Active Remote Sensing,[]


In [35]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-geophysical_concept', 
                                data_index='instrument', 
                                data_column='geophysical_concept', 
                                foriegn_table='geophysical_concept', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,geophysical_concept,suggestions
95,RICE,Cloud Properties,[Soil Properties]
96,RICE,Processes & Dynamics,[]
106,TD-LIF,Atmospheric Chemicals & Trace Gases,[Atmospheric Chemicals & Trace Gasses]


In [36]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-repository', 
                                data_index='instrument', 
                                data_column='repository', 
                                foriegn_table='repository', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,repository,suggestions
12,AVAPS,UCAR/NCAR - Earth Observing Laboratory,[]
95,RICE,NID,[]
105,TD-LIF,NID,[]


In [37]:
errors = validate.foriegn_keys(db, 
                                data_table='instrument-to-measurement_region', 
                                data_index='instrument', 
                                data_column='measurement_region', 
                                foriegn_table='measurement_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,instrument,measurement_region,suggestions
34,CPL,full column,[]
62,HAMSR,full column,[]
65,HIRAD,boundary layer? sea surface is what I would ca...,[]
66,HIWRAP,full column,[]


In [38]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-geographical_region', 
                                data_index='deployment', 
                                data_column='geographical_region', 
                                foriegn_table='geographical_region', 
                                foriegn_column='short_name')
errors

Unnamed: 0,deployment,geographical_region,suggestions


In [41]:
[key for key in db.keys() if 'deployment' in key]

['deployment', 'deployment-to-geographical_region']

In [39]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment-to-platform', 
                                data_index='deployment', 
                                data_column='platform', 
                                foriegn_table='platform', 
                                foriegn_column='short_name')
print('\n\ndo I really need to validate this?')
errors

KeyError: 'deployment-to-platform'

In [42]:
errors = validate.foriegn_keys(db, 
                                data_table='deployment', 
                                data_index='short_name', 
                                data_column='foreign-campaign-short_name', 
                                foriegn_table='campaign', 
                                foriegn_column='short_name')
print('\n\n TODO this better once you have the data')
errors



 TODO this better once you have the data


6,short_name,foreign-campaign-short_name,suggestions


In [43]:
errors = validate.foriegn_keys(db, 
                                data_table='platform', 
                                data_index='short_name', 
                                data_column='foreign-platform_type-short_name', 
                                foriegn_table='platform_type', 
                                foriegn_column='short_name')
errors

2,short_name,foreign-platform_type-short_name,suggestions
8,ASO,Prop Plane,[]


In [44]:
errors = validate.foriegn_keys(db, 
                                data_table='platform-to-gcmd_platform', 
                                data_index='platform', 
                                data_column='gcmd_platform', 
                                foriegn_table='gcmd_platform', 
                                foriegn_column='gcmd_uuid')
errors

Unnamed: 0,platform,gcmd_platform,suggestions


In [None]:
db['iopse']

In [45]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='deployment_short_name', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

KeyError: 'deployment_short_name'

In [None]:
errors = validate.foriegn_keys(db, 
                                data_table='iopse', 
                                data_index='iopse_id', 
                                data_column='event_type', 
                                foriegn_table='deployment', 
                                foriegn_column='short_name')
errors

### Flight

In [46]:
# flight table, instruments

errors = validate.foriegn_keys(db, 
                               data_table='collection_period', 
                              data_index='foreign-campaign-short_name', 
                              data_column='instrument', 
                              foriegn_table='instrument', 
                              foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,instrument,suggestions
81,GCPEx,NAWX radar,[]
284,ACES,Electric Field Mill,[]
285,ACES,EFCS (GSFC and MSFC versions),[]
286,ACES,Gerdien Conductivity Probe,[]
292,ACES,slow antenna,[]
308,AirMOSS,AirMoss,[]
309,AirMOSS,AirMoss,[]
310,AirMOSS,AirMoss,[]
311,AirMOSS,AirMoss,[]
312,AirMOSS,AirMoss,[]


In [49]:
# Flight table, platforms

errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='foreign-platform-short_name', 
                      foriegn_table='platform', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,foreign-platform-short_name,suggestions
15,HS3,WB-57f,[WB-57]
16,HS3,WB-57f,[WB-57]
19,OLYMPEX,UND Citation II,[Citation]
21,OLYMPEX,Field_Site,[]


In [57]:
# I think to ignore this???

errors = validate.foriegn_keys(db, 
                      data_table='collection_period', 
                      data_index='foreign-campaign-short_name', 
                      data_column='short_name',#'foreign-deployment-short_name', 
                      foriegn_table='deployment', 
                      foriegn_column='short_name')
errors

Unnamed: 0,foreign-campaign-short_name,short_name,suggestions
17,OLYMPEX,OLYMPEX_dep_2016,[OLYMPEX_dep_2015]
18,OLYMPEX,OLYMPEX_dep_2016,[OLYMPEX_dep_2015]
19,OLYMPEX,OLYMPEX_dep_2016,[OLYMPEX_dep_2015]
20,OLYMPEX,OLYMPEX_dep_2016,[OLYMPEX_dep_2015]
21,OLYMPEX,OLYMPEX_dep_2016,[OLYMPEX_dep_2015]


# Dates

In [58]:
for table_name in db.keys():
    # find tables that have date fields
    if len(col_names := [col for col in db[table_name].columns if 'date' in col])>0:
        print(f"{table_name}\n    {', '.join(col_names)}")

campaign
    start_date, end_date, ignore_metadata_date
deployment
    start_date, end_date
iopse
    start_date, end_date
iop
    start_date, end_date
significant_event
    start_date, end_date


In [59]:
table_names = ['campaign', 'deployment', 'iopse'] 
for table_name in table_names:
    print(table_name)
    db[table_name]['valid_date']=False
    
    db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['start_date'], row['end_date']), axis=1)
        

campaign
deployment
iopse


In [60]:
# this field no longer exists?

# table_names = ['instrument'] 
# for table_name in table_names:
    
#     db[table_name]['valid_date']=False
#     db[table_name]['valid_date'] = db[table_name].apply(lambda row: validate.vali_date(row['deployment_date'], row['decommision_date']), axis=1)
       

In [61]:
db['campaign'][['short_name','start_date','end_date','valid_date']][db['campaign']['valid_date'].apply(lambda x: not(x))]

Unnamed: 0,short_name,start_date,end_date,valid_date


# IOPSE

In [None]:
# validate that all iops are unique

db['iop']['iop_short_name'][db['iop']['iop_short_name'].duplicated()]

In [None]:
# if sig event has matching iop, link them

# if sig event doesn't have matching iop, delete it so there will be no foriegn key

In [None]:
# tag significant events that have IOPs

db['significant_event']['has_iop'] = db['significant_event']['iop_short_name'].apply(lambda short_name: short_name in list(db['iop']['iop_short_name']))


In [None]:
# # check that all significant events with an iop have an iop

# s_i = set(db['iop']['iop_short_name'])
# s_s = set(has_iop['iop_short_name'])
# [s for s in s_s if s not in s_i]

In [62]:
from datetime import datetime
# sig event start >= iop start

def start_val(sig_row, db):
    large_date = sig_row['start_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['start_date'].iloc[0]
    val = large_date >= small_date
    return val

def end_val(sig_row, db):
    large_date = sig_row['end_date']
    small_date = db['iop'][db['iop']['iop_short_name']==sig_row['iop_short_name']]['end_date'].iloc[0]
    val = large_date <= small_date
    return val

has_iop = db['significant_event'][db['significant_event']['has_iop']].copy()
val_iop_date_start = has_iop.apply(lambda row: not(start_val(row, db)), axis=1)
val_iop_date_end = has_iop.apply(lambda row: not(end_val(row, db)), axis=1)

KeyError: 'has_iop'

In [63]:
# display incorrect start dates
has_iop[val_iop_date_start]

NameError: name 'has_iop' is not defined

In [64]:
# display incorrect end dates
has_iop[val_iop_date_end]

NameError: name 'has_iop' is not defined

### Pickle the Data

In [None]:
import pickle

In [None]:
pickle.dump(db, open('fresh_data','wb'))

In [None]:
db['geographical_region']