In [None]:
import pandas as pd
import internetarchive
import json, re, os
from datetime import datetime

In [None]:
# Credentials passed to item.upload() in the upload cell. They are read from the
# environment so real keys never have to be saved inside the notebook; the
# original "xxx" placeholders remain as the fallback when the variables are unset.
access_key = os.environ.get("IA_ACCESS_KEY_ID", "xxx")
secret_key = os.environ.get("IA_SECRET_ACCESS_KEY", "xxx")
# Base directory of the export: manifest.json and archive/ are read from here.
# NOTE(review): the trailing "" looks like a slot for a sub-directory under the
# home directory - confirm and fill in as needed.
file_path = os.path.expanduser('~') + ""

In [None]:
# From document_exporter
# Load the exported manifest and index the tag and custom-field definitions.
# The file is opened as UTF-8 explicitly so non-ASCII names decode the same way
# on every platform instead of depending on the locale's default encoding.
with open(file_path + '/manifest.json', encoding='utf-8') as f:
    d = json.load(f)

tags = []                   # tag names, in manifest order
tags_key = {}               # tag pk -> tag name with spaces replaced by "_"
fields = []                 # custom-field names, in manifest order
fields_key = {}             # custom-field pk -> custom-field name
fields_select_options = {}  # custom-field name -> its select options ([] if none)

# Single pass over the manifest; entries of any other model are ignored here.
for item in d:
    model = item['model']
    if model == "documents.tag":
        name = item['fields']['name']
        tags.append(name)
        tags_key[item['pk']] = str(name).replace(" ", "_")
    elif model == "documents.customfield":
        name = item['fields']['name']
        fields.append(name)
        fields_key[item['pk']] = name
        # extra_data may be null or lack select_options for non-select fields;
        # fall back to an empty option list instead of raising.
        extra_data = item['fields'].get('extra_data') or {}
        fields_select_options[name] = extra_data.get('select_options') or []


def fields_select_find(field_name:str, id:str):
    """Return the label of the select option `id` belonging to custom field `field_name`.

    The options are looked up in the module-level `fields_select_options`
    mapping; the first option whose 'id' equals `id` wins.

    Raises:
        KeyError: if `field_name` has no entry in `fields_select_options`.
        ValueError: if none of the field's options has the given `id`.
    """
    not_found = object()  # sentinel, so that any real label value can be returned
    label = next(
        (option['label'] for option in fields_select_options[field_name] if option['id'] == id),
        not_found,
    )
    if label is not_found:
        raise ValueError('field_name and id not valid')
    return label

def metadata_key_transform(metadata_key):
    """Turn a field name into a prefixed metadata key.

    The name is lower-cased, every run of non-word characters (anything other
    than letters, digits and underscore) is removed, and the result is
    prefixed with "qsla.hadleyso.com_".
    """
    cleaned = re.sub(r'\W+', '', metadata_key.lower())
    return "qsla.hadleyso.com_" + cleaned


In [None]:

# One flat record per "documents.document" entry of the manifest.
documents = []
for item in d:
    if item['model'] != "documents.document":
        continue

    doc_fields = item['fields']
    documents.append({
        "pk": item['pk'],
        "pk_value": item['pk'],
        "archive_filename": doc_fields['archive_filename'],
        # thumbnail name is the pk zero-padded to 7 digits
        "thumbnail_filename": str(item['pk']).zfill(7) + ".webp",
        # first 10 characters of the created timestamp
        "scandate": doc_fields['created'][:10],
        "checksum": doc_fields['checksum'],
        "archive_checksum": doc_fields['archive_checksum'],
        # every tag name is prefixed with a space, so a non-empty string
        # starts with a space
        "tags": "".join(" " + tags_key[tag] for tag in doc_fields['tags']),
    })

# Per-document frame: one row per record in `documents`, indexed by document pk
# (the index is left unnamed; the pk stays available as the 'pk_value' column).
df = pd.DataFrame(documents).set_index('pk').rename_axis(None)

# Add one empty column per custom-field name so later cells can rely on the
# columns existing. Assigning column by column avoids indexing with a nested
# list ([fields]), which is not a valid column key.
for field_name in fields:
    df[field_name] = None


# Set to a document pk to print how that document's custom fields are resolved;
# None disables tracing.
debug_pk = None

# Copy every custom-field instance of the manifest into its document's row.
for item in d:
    if item['model'] != "documents.customfieldinstance":
        continue

    instance = item['fields']
    metadata_key = fields_key[instance['field']]  # column name = custom-field name
    pk_value = instance['document']               # row label = document pk

    # `debug` is always defined before it is read, whatever the first instance is.
    debug = debug_pk is not None and pk_value == debug_pk
    if debug:
        print(item)

    # The instance keeps its value in one of several "value_*" attributes; take
    # the non-null one (if several are set, the last one iterated wins).
    metadata_value = None
    for attribute, value in instance.items():
        if not str(attribute).startswith("value_") or value is None:
            continue

        metadata_value = value
        if debug:
            print(metadata_key, attribute, value)

        if attribute == "value_select":
            # select fields store an option id; translate it to the option label
            metadata_value = fields_select_find(metadata_key, metadata_value)
            if debug:
                print(metadata_key, metadata_value)

    df.loc[pk_value, metadata_key] = metadata_value

In [None]:
# Replace every missing value with an empty string, then preview the first
# five rows.
df = df.fillna(value="")
df.head()

In [None]:
# Template CSV whose header defines the columns of the upload frame.
template = pd.read_csv("./metadata.csv")

# Empty frame (no rows) with the template's columns, plus the project-specific
# metadata columns.
df_data = pd.DataFrame(columns=template.columns).copy()
for extra_column in (
    'com.hadleyso.qsla-checksum',
    'com.hadleyso.qsla-archive_checksum',
    'com.hadleyso.qsla-tags',
    'com.hadleyso.qsla-urn',
):
    df_data[extra_column] = ""


In [None]:
# Display df_data for inspection (it has columns but no rows if the cells are
# run in order up to this point).
df_data

In [None]:
# Country lookup table used by iso_code_get(). Opened as UTF-8 explicitly so
# non-ASCII country names decode identically on every platform.
with open('./country_iso.json', encoding='utf-8') as f:
    country_iso_json = json.load(f)
country_iso = pd.DataFrame(country_iso_json)

# MOVE AND FORMAT TO IA DF
# Shared pieces, computed once as whole columns instead of row by row.
pk_text = df['pk_value'].astype(str)
from_call = df['From (Call Sign)']
to_call = df['To (Call Sign)']
urn_url = "https://qsla.hadleyso.com/urn?pk=" + pk_text

# df_data is created without rows, so this Series assignment has to come first:
# it is what gives df_data the same index as df. The scalar assignments below
# then broadcast over those rows.
df_data['file'] = file_path + "/archive/" + df['archive_filename']

df_data['identifier'] = "com.hadleyso.qsla-URN" + pk_text

df_data['external-identifier[0]'] = "https://qsla.hadleyso.com/"
df_data['external-identifier[1]'] = urn_url
df_data['external-identifier[2]'] = "urn:com:hadleyso:qsla:" + pk_text

df_data['description'] = (
    "HAM Radio QSL Card between the following callsigns: "
    + from_call + " " + to_call + " " + urn_url
)

df_data['title'] = from_call + " HAM Radio QSL Card"

df_data['creator'] = from_call

df_data['date'] = df['Date (UTC)']

# Columns copied over unchanged from the document frame.
df_data["scandate"] = df['scandate']
df_data["com.hadleyso.qsla-checksum"] = df['checksum']
df_data["com.hadleyso.qsla-archive_checksum"] = df['archive_checksum']
df_data['com.hadleyso.qsla-tags'] = df['tags']
df_data['com.hadleyso.qsla-urn'] = df['pk_value']

def iso_code_get(country):
    """Return the 'alpha-2' code of `country` from the `country_iso` table.

    Args:
        country: country name as it appears in the 'name' column of the
            module-level `country_iso` frame. None, NaN and "" mean
            "no country".

    Returns:
        The 'alpha-2' value of the first row whose 'name' equals the country,
        or "" when no country was given.

    Raises:
        ValueError: if the name is not present in `country_iso`.
    """
    # Missing values: None, or a float NaN (the only float that differs from itself).
    if country is None or (isinstance(country, float) and country != country):
        return ""

    # Coerce once so the lookup and the error message use the same value.
    name = str(country)
    if not name:
        return ""

    matches = country_iso.loc[country_iso['name'] == name, 'alpha-2']
    if matches.empty:
        raise ValueError('Missing country in iso_code_get(): ' + name)
    return matches.iloc[0]

# Translate each document's country name with iso_code_get(). Mapping the
# column directly avoids building a Series for every row and still yields a
# Series when df has no rows.
df_data['coverage[1]'] = df['Country'].map(iso_code_get)



In [None]:
# FILL IN TEMPLATE
template_raw = template[1:2]

nan_cols = [i for i in template_raw.columns if template_raw[i].isnull().any()]
col_to_copy = [item for item in template.columns.to_list() if item not in nan_cols]

for i in col_to_copy:
    df_data.loc[:, i] = template_raw[i].values[0]

In [None]:
# Store the urn column as strings.
urn_column = 'com.hadleyso.qsla-urn'
df_data[urn_column] = df_data[urn_column].astype(str)

In [None]:
# Write every column except the local 'file' path to a timestamped CSV in the
# current working directory.
csv_name = datetime.today().strftime('%Y-%m-%d-%H%M%S') + "-ia-data.csv"
not_file_column = df_data.columns != 'file'
df_data.loc[:, not_file_column].to_csv(csv_name)

In [None]:
# Position of the first row to process; rows before it are skipped.
# NOTE(review): 11730 is presumably the resume point of an earlier, interrupted
# run - set it to 0 to process every row.
upload_start_row = 11730

for index, row in df_data[upload_start_row:].iterrows():
    identifier = row['identifier']
    file_to_upload = row['file']
    print(identifier)
    print(file_to_upload)

    # Every column except the local path and the identifier is item metadata.
    # Missing values (None / NaN) are dropped so they are not passed on as
    # metadata values.
    md = {
        key: value
        for key, value in row.drop(['file', 'identifier']).items()
        if not (value is None or (isinstance(value, float) and value != value))
    }

    item = internetarchive.get_item(identifier)
    if len(item.item_metadata) == 0:
        # No metadata came back for this identifier: treat it as not uploaded yet.
        r = item.upload(file_to_upload, metadata=md, access_key=access_key, secret_key=secret_key)
        # Print the status of every response; an empty list no longer raises IndexError.
        for response in r:
            print(response.status_code)
    else:
        # The identifier already has metadata, so it is left untouched.
        print("PASS")
    