In [1]:
import os
import csv
import requests
import json
import pandas as pd
import math
import urllib.parse
from pathlib import Path
from datetime import date
from datetime import timedelta


class KnackAFT:
    """Small client for the AFT Knack application's REST API.

    Holds the connection settings (API version, key, application id, host),
    the header dicts used for GET/POST requests, and ``APP_DICT``: a mapping
    filled by :meth:`loader` of the form
    ``{object_name: {'id': object_key, 'fields': {field_name: field_key}}}``.
    """

    def __init__(self):
        # API #
        self.API_VERSION = 'v1'
        # SECURITY: credentials should come from the environment
        # (KNACK_API_KEY / KNACK_APP_ID). The literal fallbacks are kept only
        # so existing runs keep working; they should be rotated and removed.
        self.API_KEY = os.environ.get('KNACK_API_KEY', '1a210580-315e-11ea-a6a4-bb031a9e1ba1')
        self.APP_ID = os.environ.get('KNACK_APP_ID', '5e13989941e72c0e039e117f')
        self.CUSTOM_KNACK_ENDPOINT = 'knack.aft.org'

        # HTTP REQUESTS #
        self.GET_HEADERS = {
            'X-Knack-REST-API-KEY': self.API_KEY,
            'X-Knack-Application-Id': self.APP_ID,
        }
        self.POST_HEADERS = {
            'X-Knack-REST-API-KEY': self.API_KEY,
            'X-Knack-Application-Id': self.APP_ID,
            'content-type': 'application/json',
        }
        self.API_URL = f'https://api.{self.CUSTOM_KNACK_ENDPOINT}/{self.API_VERSION}/'
        self.LOADER_URL = f'https://loader.{self.CUSTOM_KNACK_ENDPOINT}/{self.API_VERSION}/applications/{self.APP_ID}'

        # INTERNAL #
        self.APP_DICT = {}

    def get_key(self, dictionary, val):
        """Return the first key of ``dictionary`` whose value equals ``val``.

        Returns the empty string when no value matches.
        """
        return next((key for key, value in dictionary.items() if val == value), '')

    def loader(self):
        """Fetch the application schema and fill ``APP_DICT``.

        Only objects whose name contains ``'Entity-'`` are recorded; that
        marker is stripped from the name used as the ``APP_DICT`` key.

        Raises:
            requests.HTTPError: if the loader endpoint answers with a 4xx/5xx
                status (previously this surfaced as a KeyError/JSON error).
        """
        res = requests.get(url=self.LOADER_URL)
        res.raise_for_status()

        for obj in res.json()['application']['objects']:
            name = obj['name']
            if 'Entity-' not in name:
                continue
            fields = {item['name']: item['key'] for item in obj['fields']}
            self.APP_DICT[name.replace('Entity-', '')] = {'id': obj['key'], 'fields': fields}

    # JSON PRINT HELPER #
    @staticmethod
    def jprint(output):
        """Pretty-print ``output`` as JSON with a 4-space indent.

        Declared static so that both ``KnackAFT.jprint(x)`` and
        ``client.jprint(x)`` work; without the decorator the instance call
        passed ``self`` as ``output`` and raised ``TypeError``.
        """
        print(json.dumps(output, indent=4))

    # GET and format json from requestURL
    def getJSON(self, url):
        """GET ``API_URL + url`` with the GET headers and return the decoded JSON body."""
        r = requests.get(url=self.API_URL + url, headers=self.GET_HEADERS)
        return r.json()

    def getObjectJSON(self, object_name):
        """Return the ``'object'`` entry of the API response for ``object_name``.

        ``object_name`` must be a key of ``APP_DICT`` (see :meth:`loader`).
        """
        object_id = self.APP_DICT[object_name]['id']
        return self.getJSON('objects/' + object_id)['object']

    def find_matches(self, object_name, field_name, match_val, multi=False):
        """Query records of ``object_name`` whose ``field_name`` "is" ``match_val``.

        Returns:
            The response's ``records`` list when exactly one record matches,
            or when several match and ``multi`` is true. Returns ``''`` when
            nothing matches, or when several match and ``multi`` is false.
        """
        field_id = self.APP_DICT[object_name]['fields'][field_name]
        object_id = self.APP_DICT[object_name]['id']

        match_filter = {
            'match': 'and',
            'rules': [{'field': field_id, 'operator': 'is', 'value': match_val}],
        }
        # The filter travels as URL-encoded JSON in the query string.
        filter_for_url = urllib.parse.quote(json.dumps(match_filter))
        res = self.getJSON("objects/" + object_id + "/records?filters=" + filter_for_url)

        total = res["total_records"]
        if total == 0:
            return ''
        if total == 1 or multi:
            return res["records"]
        return ''

    def convert_fields_ids2name(self, object_name, ids):
        """Return a copy of ``ids`` with field keys replaced by field names.

        Keys containing ``'field'`` are looked up in the object's field map;
        for keys containing ``'raw'`` the ``'_raw'`` part is removed before the
        lookup and ``'_raw'`` is appended to the resulting name. Keys with no
        known field, and keys not containing ``'field'``, are copied unchanged.
        """
        field_dict = self.APP_DICT[object_name]['fields']
        out_dict = {}

        for k, v in ids.items():
            out_key = k
            if 'field' in k:
                is_raw = 'raw' in k
                lookup = k.replace('_raw', '') if is_raw else k
                name = self.get_key(field_dict, lookup)
                if name:
                    out_key = name + '_raw' if is_raw else name
            out_dict[out_key] = v
        return out_dict

    def convert_fields_name2ids(self, object_name, names):
        """Return ``names`` re-keyed by field key.

        The keys of ``names`` are looked up as-is against the lower-cased
        field names of ``object_name``, so they must already be lower case.

        Raises:
            KeyError: if a key of ``names`` is not a (lower-cased) field name.
        """
        lowered = {
            name.lower(): field_id
            for name, field_id in self.APP_DICT[object_name]['fields'].items()
        }
        return {lowered[k]: v for k, v in names.items()}


In [None]:
# GLOBAL VARIABLES #
# CSV file that processCSV() reads with pandas.
FILE_INPUT = 'Export_Full_Results_0000_part_00_combine-3.csv'
# Number of CSV rows processCSV() slices off and uploads per batch.
BATCH_SIZE = 500

# MAIN LOGIC #
# Build the API client, then fetch the application schema; loader() performs
# an HTTP GET and fills client.APP_DICT, which everything below reads.
client = KnackAFT()
client.loader()

# CSV column names that processCSV() renames to 'entity-<name without "id">'
# before mapping them to Knack field keys.
connections = ['nationaljobclassid', 'unitid', 'affiliateid']
# Not referenced by the code visible in this notebook; processCSV() hard-codes
# the same three field keys. NOTE(review): presumably parallel to
# `connections` — confirm before relying on the ordering.
connections_fields = ['field_341', 'field_342', 'field_591']

# Field-name -> field-key map for LocalJobClass, plus a lower-cased copy.
# processCSV() builds its own local equivalent, so these module-level copies
# are not used by the code visible here.
DICT = client.APP_DICT['LocalJobClass']['fields']
runnerDICT = {k.lower(): v for k, v in DICT.items()}

#jprint(client.APP_DICT['Affiliate'])

def uploader(payload):
    """POST ``payload`` as a JSON body to the ``object_21`` records endpoint.

    Uses the module-level ``client``'s POST headers and returns the
    ``requests`` response object unchanged; the status code is not checked
    here.
    """
    endpoint = "https://api.knack.aft.org/v1/objects/object_21/records"
    post_headers = client.POST_HEADERS
    body = json.dumps(payload)
    return requests.post(url=endpoint, headers=post_headers, data=body)
    

def getIds(arg, val):
    """Look up the records matching ``val`` for the connection named ``arg``.

    ``'unitid'`` and ``'affiliateid'`` are forwarded to
    ``client.find_matches`` and its result is returned as-is.
    ``'nationaljobclassid'`` and any unrecognised ``arg`` return ``None``
    without performing a lookup.
    """
    # NOTE(review): the 'nationaljobclassid' entry deliberately mirrors the
    # original bare `return`; it looks like an unfinished stub — confirm
    # whether a NationalJobClass lookup was intended here.
    lookups = (
        ('nationaljobclassid', None),
        ('unitid', ('Unit', 'UnitId')),
        ('affiliateid', ('Affiliate', 'AffiliateID')),
    )
    for name, target in lookups:
        if arg == name:
            if target is None:
                return None
            object_name, field_name = target
            return client.find_matches(object_name, field_name, val)
    return None


def _resolve_connections(frame, column, object_name, field_name, cache):
    """Replace each non-empty value in ``frame[column]`` with a Knack record id.

    Each distinct value is looked up once through ``client.find_matches`` and
    remembered in ``cache`` (value -> record id); empty values are left
    untouched. The column is rewritten row by row, so every row receives the
    id that belongs to its own value.

    Raises:
        IndexError: if ``find_matches`` returns no usable record for a value
            (it returns ``''`` when there is no match or more than one).
    """
    def resolve(val):
        if not val:
            return val
        if val not in cache:
            matches = client.find_matches(object_name, field_name, val)
            if not matches:
                raise IndexError(
                    f'expected exactly one {object_name} record with '
                    f'{field_name} == {val!r}, found none or several'
                )
            cache[val] = matches[0]['id']
        return cache[val]

    frame[column] = [resolve(val) for val in frame[column]]


def processCSV():
    """Upload every row of ``FILE_INPUT`` as a LocalJobClass record.

    Steps:
      1. Read the CSV with the first 100 columns forced to ``str``.
      2. Rename columns to Knack field keys: columns listed in ``connections``
         become ``'entity-<name without "id">'`` first, then every column is
         mapped through the lower-cased LocalJobClass field names.
      3. In slices of ``BATCH_SIZE`` rows, replace the values of the three
         connection columns with the ids of the matching records, then send
         each row through ``uploader``.

    The first response whose status code is not 200 is reported (payload,
    response body, CSV line number) and stops the run; rows after it are not
    sent. Returns ``None``.

    Raises:
        KeyError: if a CSV column has no matching LocalJobClass field.
        IndexError: if a connection value cannot be resolved to one record.
    """
    # (field key, Knack object, lookup field) for each connection column.
    connection_lookups = (
        ('field_341', 'NationalJobClass', 'NationalJobClassId'),
        ('field_342', 'Unit', 'UnitId'),
        ('field_591', 'Affiliate', 'AffiliateID'),
    )
    # One cache per connection, shared by all batches of this run.
    caches = {column: {} for column, _, _ in connection_lookups}

    # Entity Dict: lower-cased field name -> field key
    field_ids = {
        name.lower(): key
        for name, key in client.APP_DICT['LocalJobClass']['fields'].items()
    }

    # read and format CSV file
    print('Running file:-', FILE_INPUT)

    df = pd.read_csv(FILE_INPUT, quoting=csv.QUOTE_ALL, converters={i: str for i in range(0, 100)})
    df = df.fillna('')

    entity_names = {
        col: ('entity-' + col.replace('id', '')) if col in connections else col
        for col in df.columns
    }
    unknown = [col for col, entity in entity_names.items() if entity not in field_ids]
    if unknown:
        raise KeyError(f'CSV columns with no matching LocalJobClass field: {unknown}')
    df = df.rename(columns={col: field_ids[entity] for col, entity in entity_names.items()})

    print('---INFO---')
    print("Number of lines present:-", len(df))
    print("Running in batch size:-", BATCH_SIZE)
    print('Total batches:-', math.ceil(len(df) / BATCH_SIZE))
    print('----------')

    # split into DataFrames
    for batch_number, start in enumerate(range(0, len(df), BATCH_SIZE), start=1):
        mem_df = df.iloc[start:start + BATCH_SIZE].copy()
        print('Running batch:-', batch_number)

        for column, object_name, field_name in connection_lookups:
            _resolve_connections(mem_df, column, object_name, field_name, caches[column])

        payload_list = mem_df.to_dict(orient='records')
        for count, payload in enumerate(payload_list, start=1):
            r = uploader(payload)
            if r.status_code != 200:
                print('', end='\n')
                print(payload)
                try:
                    print(json.dumps(r.json(), indent=4))
                except ValueError:
                    # Error bodies are not guaranteed to be JSON.
                    print(getattr(r, 'text', r))
                # +1 accounts for the CSV header line.
                print('Error on line: ', start + count + 1, ' of CSV input file')
                print('Upload aborted; remaining rows were not sent.')
                return
            print('Row:-', count, '/', len(payload_list), ':', r, end='\r')
        print('', end='\n')
        print('Batch Done!')
        print('----------')
    print('Input File Done!')
# Run the upload for FILE_INPUT; this sends one HTTP POST per CSV row.
processCSV()

Running file:- Export_Full_Results_0000_part_00_combine-3.csv
---INFO---
Number of lines present:- 113347
Running in batch size:- 500
Total batches:- 227
----------
Running batch:- 1


In [111]:
pip install yaml

[31mERROR: Could not find a version that satisfies the requirement yaml (from versions: none)[0m
[31mERROR: No matching distribution found for yaml[0m
You should consider upgrading via the '/usr/local/Cellar/jupyterlab/3.1.14_1/libexec/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
