### Imports

In [None]:
import pickle
import json
import requests
import numpy as np
import datetime
import pandas as pd
from config.server import (client_id, client_secret, data)

### Load Data Files

In [None]:
# Load the pickled ingest database (fresh_data_filtered); the rest of the
# notebook indexes it as db[table_name][column].
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('ingest_data/db_10_camp', 'rb') as db_file:
    db = pickle.load(db_file)

In [None]:
# table name -> name of the column used as that table's primary key
with open("config/mapping_primary.json") as primary_key_file:
    primary_key_map = json.load(primary_key_file)

# table names in the order they are ingested
with open("config/ingest_order.json") as ingest_order_file:
    ingest_order = json.load(ingest_order_file)

In [None]:
# this stuff shouldn't be ingested anyway, what is it?

# db["campaign"][db["campaign"]["short_name"] == "OLYMPEX"].iloc[0]["ignore_number_deployments"] = 58
# db["campaign"][db["campaign"]["short_name"] == "OLYMPEX"].iloc[0]["ignore_number_deployments"]

### Correct Bad and Missing Data

In [None]:
# flag every collection_period row as auto generated
db['collection_period']['auto_generated']=True

In [None]:
# remove gcmd_project rows whose short_name is duplicated
# NOTE: keep=False drops every row of a duplicated group, not only the extra copies
db["gcmd_project"].drop_duplicates(subset ="short_name", keep = False, inplace = True)

# remove missing instrument.short_name
# db["instrument"] = db["instrument"][db["instrument"]["short_name"] != "Information Not Available"]

#TODO: this should ingest as the default null value
# change nan to 0
db["campaign"]["number_collection_periods"] = db["campaign"]["number_collection_periods"].fillna(0)
db["campaign"]["number_data_products"] = db["campaign"]["number_data_products"].fillna(0)

# there is missing data for campaign.ongoing, fill all in to False
db["campaign"]["ongoing"] = db["campaign"]["ongoing"].fillna(False)

# there is missing data for deployment.number_flights, fill all in to 0
db["deployment"]["number_flights"] = db["deployment"]["number_flights"].fillna(0)

# correct column naming in collection_period table
db['collection_period'].rename(columns={'instrument':'foreign-instrument-short_name'}, inplace=True)

# correct column naming in instrument_type table: the parent reference column
# was labelled with platform_type instead of instrument_type
db['instrument_type'].rename(columns={'foreign-platform_type-short_name':'foreign-instrument_type-short_name'}, inplace=True)

# filter out non-matching short_names
# db['collection_period'] = db['collection_period'][db['collection_period']['foreign-instrument-short_name'] != 'Information Not Available']
# db['collection_period']= db['collection_period'][db['collection_period']['foreign-instrument-short_name']!='NAWX radar']
# db['collection_period']= db['collection_period'][db['collection_period']['foreign-instrument-short_name']!='Electric Field Mill']
# db['collection_period']= db['collection_period'][db['collection_period']['foreign-instrument-short_name']!='EFCS (GSFC and MSFC versions)']

# TODO: why does this field exist at all?
# fill in missing number_collection_periods
db['collection_period']["number_collection_periods"] = db['collection_period']["number_collection_periods"].fillna(0)

# fill in missing tail numbers
db['collection_period']["platform_identifier"] = db['collection_period']["platform_identifier"].fillna(0)

# create a valid deployment short name by prefixing it with the campaign short name
# NOTE: not idempotent -- re-running this cell adds the campaign prefix again
db['collection_period']['foreign-deployment-short_name']=db['collection_period']['foreign-campaign-short_name']+'_'+db['collection_period']['foreign-deployment-short_name']

## Specific Value Correction

In [None]:
def correct_values(db, table_name, column, wrong_value, correct_value):
    """Swap every cell equal to ``wrong_value`` in one column for ``correct_value``.

    The column ``db[table_name][column]`` is rebuilt element by element and
    assigned back onto the same table; nothing is returned.
    """
    def _swap(cell):
        if cell != wrong_value:
            return cell
        return correct_value

    table = db[table_name]
    table[column] = table[column].apply(_swap)

In [None]:
# campaign.number_collection_periods: replace the placeholder text with 0
correct_values(
    db=db, table_name='campaign', column='number_collection_periods',
    wrong_value='Information Not Available', correct_value=0,
)

In [None]:
# iop.region_description: replace the placeholder text with a stand-in location
correct_values(
    db=db, table_name='iop', column='region_description',
    wrong_value='Information Not Available', correct_value='Undisclosed Location',
)

In [None]:
# remove multiple gcmd links. This will need to be properly implemented in the future
# Each entry is (table, cell value as found in the data, single uuid to keep).
gcmd_uuid_fixes = [
    (
        'platform_type',
        '227d9c3d-f631-402d-84ed-b8c5a562fc27, 06e037ed-f463-4fa3-a23e-8f694b321eb1',
        '227d9c3d-f631-402d-84ed-b8c5a562fc27',
    ),
    (
        'platform_type',
        '57b7373d-5c21-4abb-8097-a410adc2a074, 491d3fcc-c097-4357-b1cf-39ccf359234, 2219e7fa-9fd0-443d-ab1b-62d1ccf41a89',
        '57b7373d-5c21-4abb-8097-a410adc2a074',
    ),
    (
        'instrument_type',
        '3d25724b-832f-4a61-b0b2-4f2ccecdba94, ebfff02c-2e5a-476e-aafb-c00167bf2daa,  def72d78-3c2f-4f46-91e7-259a0e63e2de',
        '3d25724b-832f-4a61-b0b2-4f2ccecdba94',
    ),
    (
        'instrument_type',
        '78c70202-ab05-40d6-90db-563be2a8dc90, 2315cd93-18c9-4553-a7d2-650d65d95505',
        '78c70202-ab05-40d6-90db-563be2a8dc90',
    ),
    (
        'instrument_type',
        '2724649a-5bae-4b34-89c0-2e5ca6d3203b, 02a7fb42-6ff5-493f-a447-b687f841b2c1, b5d7c2cb-60c4-4dfe-bdc9-31e9fcc97dd0',
        '2724649a-5bae-4b34-89c0-2e5ca6d3203b',
    ),
    (
        'geographical_region',
        'd40d9651-aa19-4b2c-9764-7371bb64b9a7, 3fedcf7c-7b0c-4b51-abd2-2c54de713061',
        'd40d9651-aa19-4b2c-9764-7371bb64b9a7',
    ),
    (
        'geophysical_concept',
        '0611b9fd-fd92-4c4d-87bb-bc2f22c548bc, 4dd22dc9-1db4-4187-a2b7-f5b76d666055',
        '0611b9fd-fd92-4c4d-87bb-bc2f22c548bc',
    ),
    (
        'geophysical_concept',
        'c9e429cb-eff0-4dd3-9eca-527e0081f65c, 62019831-aaba-4d63-a5cd-73138ccfa5d0',
        'c9e429cb-eff0-4dd3-9eca-527e0081f65c',
    ),
    (
        'geophysical_concept',
        '0af72e0e-52a5-4695-9eaf-d6fbb7991039, 637ac172-e624-4ae0-aac4-0d1adcc889a2',
        '0af72e0e-52a5-4695-9eaf-d6fbb7991039',
    ),
]

for uuid_table, uuid_found, uuid_kept in gcmd_uuid_fixes:
    correct_values(
        db=db,
        table_name=uuid_table,
        column='gcmd_uuid',
        wrong_value=uuid_found,
        correct_value=uuid_kept,
    )

In [None]:
# supplement missing instrument data
# inventory team needs to actually fill this stuff out correctly
# Each entry is (table, column, value to replace, replacement); the entries are
# applied in the order listed.
instrument_fixes = [
    # normalise capitalisation of the measurement region
    ('instrument', 'table-measurement_region-short_name', 'troposphere', 'Troposphere'),
    ('instrument-to-measurement_region', 'measurement_region', 'troposphere', 'Troposphere'),

    # placeholder text for descriptive instrument fields
    ('instrument', 'technical_contact', 'Information Not Available', 'Fake Contact'),
    ('instrument', 'spatial_resolution', 'Information Not Available', 'Fake spatial_resolution'),
    ('instrument', 'temporal_resolution', 'Information Not Available', 'Fake temporal_resolution'),
    ('instrument', 'radiometric_frequency', 'Information Not Available', 'Fake radiometric_frequency'),
    ('instrument', 'description', 'Information Not Available', 'Fake description'),
    ('instrument', 'long_name', 'Information Not Available', 'Fake long_name'),

    # linking tables
    ('instrument-to-instrument_type', 'instrument_type', 'Information Not Available', 'In Situ - Magnetic/Electric'),
    ('instrument-to-gcmd_phenomena', 'gcmd_phenomena', 'Information Not Available', '1212'),
    ('instrument-to-measurement_region', 'measurement_region', 'Information Not Available', 'Troposphere'),

    # the matching "table-" columns on the instrument table itself
    ('instrument', 'table-instrument_type-short_name', 'Information Not Available', 'In Situ - Magnetic/Electric'),
    ('instrument', 'table-gcmd_phenomena-ignore_code', 'Information Not Available', '1212'),
    ('instrument', 'table-measurement_region-short_name', 'Information Not Available', 'Troposphere'),
]

for fix_table, fix_column, fix_wrong, fix_correct in instrument_fixes:
    correct_values(
        db=db,
        table_name=fix_table,
        column=fix_column,
        wrong_value=fix_wrong,
        correct_value=fix_correct,
    )

In [None]:
# supplement gcmd_platform: placeholder description where none was provided
correct_values(
    db=db, table_name='gcmd_platform', column='description',
    wrong_value='Information Not Available', correct_value='fake description',
)

In [None]:
# add instrument and gcmd phenomena pairs to the instrument-to-gcmd_phenomena where they are missing, because this
# is a required field and I don't want to change the models.
# The same is done for instrument-to-measurement_region.
# Each entry is (linking table, column holding the linked value, placeholder value).
for link_table, link_column, default_value in (
    ('instrument-to-gcmd_phenomena', 'gcmd_phenomena', '1212'),
    ('instrument-to-measurement_region', 'measurement_region', 'Troposphere'),
):
    # build the lookup once instead of scanning the linking table per instrument
    already_linked = set(db[link_table]['instrument'])
    missing = [
        instrument
        for instrument in db['instrument']['short_name'].drop_duplicates()
        if instrument not in already_linked
    ]
    if missing:
        # pd.concat is used because DataFrame.append no longer exists in pandas 2.x
        placeholder_rows = pd.DataFrame({'instrument': missing, link_column: default_value})
        db[link_table] = pd.concat([db[link_table], placeholder_rows], ignore_index=True)

In [None]:
# supplement platform values: placeholder description where none was provided
correct_values(
    db=db, table_name='platform', column='description',
    wrong_value='Information Not Available', correct_value='fake description',
)


In [None]:
# supplement campaign values: nasa_led placeholder becomes the string 'True'
correct_values(
    db=db, table_name='campaign', column='nasa_led',
    wrong_value='Information Not Available', correct_value='True',
)

In [None]:
# fix a mapping error
# should be able to remove next time I run
for dated_table in ('iop', 'significant_event'):
    db[dated_table].rename(columns={'Start': 'start_date'}, inplace=True)

In [None]:
# correct_values(
#     db=db,
#     table_name = 'collection_period',
#     column = 'foreign-deployment-short_name',
#     wrong_value = 'OLYMPEX_dep_2016',
#     correct_value = 'OLYMPEX_dep_2015')

In [None]:
# # delete collection periods from Olympex that had no instruments on the platforms....
# db['collection_period'] = db['collection_period'][db['collection_period']['short_name']!='OLYMPEX_dep_2015']

## Correctly Order Hierarchical Types

In [None]:
# Rows whose parent reference is 'none' are placed ahead of the rows that
# reference a parent, for each self-referencing type table in turn.
hierarchical_tables = (
    ('platform_type', 'foreign-platform_type-short_name'),
    ('instrument_type', 'foreign-instrument_type-short_name'),
)

for type_table, parent_column in hierarchical_tables:
    type_frame = db[type_table]

    # ingest these first
    first = type_frame[type_frame[parent_column] == 'none']

    # ingest these second
    second = type_frame[type_frame[parent_column] != 'none']

    # correctly ordered
    db[type_table] = pd.concat([first, second])

### API STUFF

In [None]:
# !ping admg.nasa-impact.net

In [None]:
# how to get the access token for using the api

# NOTE(review): the server URL is plain http, so the credentials and the token
# travel unencrypted -- confirm whether https is available
server = 'http://admg.nasa-impact.net'
# server = 'http://localhost:8000'
base_url = f'{server}/api/'

url = f'{server}/authenticate/token/'

# request a token: client_id/client_secret go in as HTTP basic auth and `data`
# (imported from config.server) is the request body
response = requests.post(url, data=data, auth=(client_id, client_secret))
# raises KeyError when the response body has no 'access_token' entry
access_token = json.loads(response.text)['access_token']
# headers reused by every later API call in this notebook
headers = {
    'Authorization': f'Bearer {access_token}',
    'Content-Type': 'application/json',
}
# NOTE(review): displaying headers leaves the bearer token in the cell output
headers

In [None]:
def get_api(url):
    """GET ``base_url`` + ``url`` with the shared auth headers and return the decoded JSON body."""
    endpoint = f'{base_url}{url}'
    body = requests.get(endpoint, headers=headers).text
    return json.loads(body)

In [None]:
# use this cell if you are rerunning the ingest halfway through



## Foreign Key Map

In [None]:
# True: start with an empty map. False: resume from the map pickled by an earlier run.
reset_map = True

if reset_map:
    # one empty {primary-key value: uuid} mapping per table; filled in by call_api
    foreign_key_uuid_map = {
        mapped_table: {}
        for mapped_table in (
            'platform_type',
            'home_base',
            'repository',
            'focus_area',
            'season',
            'instrument_type',
            'measurement_region',
            'geographical_region',
            'geophysical_concept',
            'campaign',
            'platform',
            'instrument',
            'deployment',
            'iop',
            'significant_event',
            'partner_org',
            'collection_period',
            'gcmd_phenomena',
            'gcmd_project',
            'gcmd_platform',
            'gcmd_instrument',
            'measurement_keywords',
        )
    }
else:
    # NOTE: pickle.load can execute arbitrary code -- only load a file this notebook wrote
    with open('foreign_key_uuid_map', 'rb') as saved_map_file:
        foreign_key_uuid_map = pickle.load(saved_map_file)

In [None]:

def call_api(table_name, data):
    """
    Create one record through the API and remember the uuid it was given.

    POSTs ``data`` to ``{base_url}{table_name}``, then pushes and approves the
    resulting change request. The uuid reported by the approval is stored in
    ``foreign_key_uuid_map[table_name]`` under the record's primary-key value.

    Returns the decoded approval response, or a message string when the API
    reports that the short name already exists (nothing is stored in that case).
    ``data`` is modified in place when it carries a 'repositorys' key.
    """

    # handle the spelling mishaps of repository...
    # I think this needs to be a permanent change?
    if data.get('repositorys'):
        data['repositories'] = data.pop('repositorys')

    print('\n ----- Calling API')

    create_response = requests.post(
        f'{base_url}{table_name}',
        data=json.dumps(data),
        headers=headers,
    )
    response_text = create_response.text

    rejected = '"success": false' in response_text
    duplicate = 'this short name already exists' in response_text
    if rejected and duplicate:
        return f'the following entry already existed {table_name=} {data=}'

    print(f'{table_name=}, {data=}')
    print(f'something_response.text={response_text!r}')

    # the change-request uuid is taken from the text after the fourth ':' of the raw response
    after_fourth_colon = response_text.split(':')[4]
    uuid = after_fourth_colon.strip().split(' ')[0]

    change_request_url = f'{base_url}change_request/{uuid}'
    requests.post(f'{change_request_url}/push', headers=headers)
    approve_response = requests.post(f'{change_request_url}/approve', headers=headers)
    approved = json.loads(approve_response.text)
    print(f'{approved=}')

    # put the uuid obtained as the uuid for the primary value of the data
    primary_value = data[primary_key_map[table_name]]  # usually the short_name
    created_uuid = approved["data"]["action_info"]["uuid_changed"]
    foreign_key_uuid_map[table_name][primary_value] = created_uuid

    return approved

In [None]:
def _is_nan(value):
    """Return True when ``value`` is a scalar NaN; values numpy cannot test are not NaN."""
    try:
        return bool(np.isnan(value))
    except (TypeError, ValueError):
        # strings, None, dates and multi-element sequences end up here
        return False


def remove_ignored_data(data):
    """
    Return a cleaned copy of one row dict, ready to be sent to the API.

    - keys containing 'ignore' are dropped, except the real field 'ignore_code'
    - NaN values become 0
    - datetime values become their ISO date string (YYYY-MM-DD)

    Dropped keys stay dropped whatever their value is; previously an ignored
    key whose value was NaN or a datetime was written back into the result.
    ``data`` itself is not modified.
    """
    print('\n ----- Removing Ignored Data')

    retval = {}
    for key, value in data.items():
        if 'ignore' in key and key != "ignore_code":
            continue

        if isinstance(value, datetime.datetime):
            # keep only the date part of the ISO timestamp
            retval[key] = value.isoformat().split('T')[0]
        elif _is_nan(value):
            retval[key] = 0
        else:
            retval[key] = value

    return retval

In [None]:
# ignore_tables = [
#     "instrument-to-instrument_type",
#     "instrument-to-measurement_keywords",
#     "instrument-to-gcmd_instrument"
# ]

def resolve_many_to_many_keys(table_name, data):
    """
    Replace the many-to-many placeholder columns of one row with lists of uuids.

    For every column of ``db[table_name]`` whose name contains "table-" (named
    ``table-<foreign_table>-<foreign_key>``), the linked values are read from
    the linking table ``<table_name>-to-<foreign_table>`` and translated
    through ``foreign_key_uuid_map``. The resulting list is stored in ``data``
    under ``<foreign_table>s`` and the raw "table-" key is removed.

    ``data`` is the row as a dict and is modified in place; nothing is returned.
    Linked values that are "Information Not Available" or have no uuid in the
    map are skipped.
    """
    print('\n ----- Resolving Many to Many')

    primary_key = primary_key_map[table_name]
    primary_value = data[primary_key]
    tables = [key for key in db[table_name].keys() if "table-" in key]

    print(f'{primary_key=}')
    print(f'{primary_value=}')
    print(f'{tables=}')

    print('foriegn info -----')
    for table in tables:
        _, foreign_table, foreign_key = table.split("-")
        linking_table = f"{table_name}-to-{foreign_table}"

        links = db[linking_table]
        foreign_values = links[links[table_name] == primary_value][foreign_table]

        uuid_lookup = foreign_key_uuid_map[foreign_table]
        mapped_uuids = [
            uuid_lookup[val]
            for val in foreign_values
            if val != "Information Not Available" and uuid_lookup.get(val)
        ]
        data[f"{foreign_table}s"] = mapped_uuids

        # Always drop the raw column. The previous truthiness check left the
        # key in place when its value was falsy (for example 0).
        data.pop(table, None)

        print(f'{foreign_table=}')
        print(f'{foreign_key=}')
        print(f'{linking_table=}')
        print(f'{foreign_values=}')
        print(f'{mapped_uuids=}')
        print()

In [None]:
# def resolve_foreign_keys(table_name, data):
#     # data should be json of the row
#     fields = [key for key in data.keys() if "foreign-" in key]
#     for field in fields:
#         _, foreign_table, foreign_key = field.split("-")
#         if table_name in ["platform_type", "instrument_type"]:
#             pass
#         else:
#             foreign_value = data[field]
#             if foreign_key_uuid_map[foreign_table].get(foreign_value):
#                 mapped_uuid = foreign_key_uuid_map[foreign_table][foreign_value]      
#                 data[foreign_table] = mapped_uuid
#         del data[field]
        
def resolve_foreign_keys(table_name, data):
    """
    Swap the ``foreign-<table>-<key>`` entries of one row dict for uuids.

    Each such entry is looked up in ``foreign_key_uuid_map[<table>]``. When a
    uuid is found it is stored under ``<table>``, or under 'parent' for the
    platform_type and instrument_type tables. The ``foreign-`` entry itself is
    always removed. ``data`` is modified in place; nothing is returned.
    """
    print('\n ----- Resolving Foreign Keys')

    # stored under 'parent' rather than under their own table name
    parent_linked_tables = ['platform_type', 'instrument_type']

    # collected up front because data is modified inside the loop
    fields = [name for name in data.keys() if "foreign-" in name]

    for field in fields:
        _, foreign_table, _ = field.split("-")
        foreign_value = data[field]

        print()
        print(f'{foreign_value=}')
        print(f'{fields=}')

        mapped_uuid = foreign_key_uuid_map[foreign_table].get(foreign_value)
        if mapped_uuid:
            if foreign_table in parent_linked_tables:
                target_key = 'parent'
            else:
                target_key = foreign_table
            data[target_key] = mapped_uuid

        del data[field]

In [None]:
def remove_nones(data):
    """Return a copy of ``data`` without entries whose value is 'none' or "Information Not Available"."""
    print('\n ----- Removing Nones')

    cleaned = {}
    for key, value in data.items():
        keep = value != 'none' and value != "Information Not Available"
        if keep:
            cleaned[key] = value
    return cleaned

## Ingest into Database

In [None]:
ingest_order

In [None]:
# ingest_order = [
#  'platform_type',
# #  'instrument_type',
# #  'home_base',
# #  'repository',
# #  'focus_area',
# #  'season',
# #  'measurement_region',
# #  'geographical_region',
# #  'geophysical_concept',
# #  'gcmd_phenomena',
# #  'gcmd_instrument',
# #  'gcmd_platform',
# #  'gcmd_project',
# #  'partner_org',
# #  'instrument',
# #  'platform',
# #  'campaign',
# #  'deployment',
# #  'iop',
# #  'significant_event'
# ]

In [None]:
# ingests everything except for collection period
# Every row of every table in ingest_order is cleaned, has its keys resolved to
# uuids and is posted; one line per row is written to result.txt.

with open("result.txt", "w") as f:
    for table_name in ingest_order:
        # to debug a single table, iterate over e.g. ["platform_type"] instead
        for index, row in db[table_name].iterrows():
            print(table_name, index)
            api_data = row.to_dict()
            print(api_data)
            api_data = remove_nones(remove_ignored_data(api_data))

            primary_key = primary_key_map[table_name]
            primary_value = api_data.get(primary_key)
            if not primary_value:
                # no usable primary key: record the row instead of posting it
                f.write(f"{table_name}: {primary_key}, {json.dumps(api_data)}\n")
                continue

            resolve_many_to_many_keys(table_name, api_data)
            resolve_foreign_keys(table_name, api_data)
            result = call_api(table_name, api_data)
            f.write(f"{json.dumps(result)}\n")

In [None]:
db['instrument'][db['instrument']['short_name']=='DIAL']['table-instrument_type-short_name']

In [None]:
list(db['instrument'][db['instrument']['short_name']=='CoSMIR']['table-measurement_region-short_name'])

In [None]:
# Ingest collection periods. Consecutive rows that share a short_name describe
# one collection period with several instruments, so they are merged into a
# single payload whose "instruments" list holds every resolved instrument uuid.
# NOTE(review): mode "w" truncates result.txt, which the main ingest cell also writes.
with open("result.txt", "w") as f:
    rows = []
    temp = {}
    temp_short_name = None
    for index, row in db["collection_period"].iterrows():
        dict_row = row.to_dict()

        # a new short_name closes the group collected so far
        if temp and temp_short_name != dict_row["short_name"]:
            rows.append(temp)
            temp = {}
        temp_short_name = dict_row["short_name"]

        # clean in two steps; the second step works on the output of the first
        api_data = remove_ignored_data(dict_row)
        api_data = remove_nones(api_data)
        resolve_foreign_keys("collection_period", api_data)

        temp = {
            **temp,
            **api_data,
            "instruments": [
                *temp.get("instruments", []),
                api_data.get("instrument"),
            ],
        }
        # the single-instrument key is replaced by the "instruments" list
        temp.pop("instrument", None)

    # the last group has no following row to close it
    if temp:
        rows.append(temp)

    for row in rows:
        row["instruments"] = [val for val in row["instruments"] if val is not None]
        # collection periods without any resolved instrument are not ingested
        if not row["instruments"]:
            continue
        result = call_api("collection_period", row)
        f.write(f"{json.dumps(result)}\n")

In [None]:
# set([i.short_name for i in Instrument.object.all()] ).difference(set(foreign_key_uuid_map["instrument"].keys()))

In [None]:
# Save the uuid map twice: as a pickle (read back when reset_map is False) and
# as JSON. The with-blocks make sure both files are flushed and closed.
with open('foreign_key_uuid_map', 'wb') as map_pickle_file:
    pickle.dump(foreign_key_uuid_map, map_pickle_file)

with open("foreign_key_uuid_map.json", "w") as map_json_file:
    json.dump(foreign_key_uuid_map, map_json_file)
        

# Play with Data

In [None]:
import validate

In [None]:
################################################
# TODO: ADD THIS TO THE INGEST VALIDATION FILE #
################################################

# check that every collection_period row references a known deployment short_name
deployment_key_check = {
    'data_table': 'collection_period',
    'data_index': 'short_name',
    'data_column': 'foreign-deployment-short_name',
    'foriegn_table': 'deployment',
    'foriegn_column': 'short_name',
}
errors = validate.foriegn_keys(db, **deployment_key_check)
errors

In [None]:
# check that every collection_period row references a known instrument short_name
instrument_key_check = {
    'data_table': 'collection_period',
    'data_index': 'short_name',
    'data_column': 'foreign-instrument-short_name',
    'foriegn_table': 'instrument',
    'foriegn_column': 'short_name',
}
errors = validate.foriegn_keys(db, **instrument_key_check)
errors

In [None]:
db['collection_period'][db['collection_period']['short_name']=='OLYMPEX_dep_2015']

In [None]:
# (removed a stray, incomplete input "fori": the name is not defined anywhere, so the cell raised NameError)

In [None]:
# Save the uuid map again; the with-block makes sure the file is flushed and closed.
with open('foreign_key_uuid_map', 'wb') as final_map_file:
    pickle.dump(foreign_key_uuid_map, final_map_file)