# In[55]:

import requests
import json
import pandas as pd

# Tidy up shareholder names
def tidystring(item):
    """Turn a comma-separated name such as "SURNAME, Given" into "Given SURNAME".

    The comma-separated parts are emitted in reverse order, joined by single
    spaces. Each part is stripped individually and empty parts are dropped, so
    the result never carries leading, trailing or doubled separator spaces
    (which would otherwise make the same person look like two different
    names when results are de-duplicated).

    Args:
        item: The raw name string.

    Returns:
        The reordered, whitespace-normalised name. An input without commas is
        returned stripped; an empty input yields an empty string.
    """
    parts = [part.strip() for part in item.split(',')]
    return " ".join(part for part in reversed(parts) if part)


# Build a one-line address from an address record
def GetAddress(Addressblock):
    """Join the populated address lines of an address record with ", ".

    Reads the keys 'address1' .. 'address4' in order. Lines that are missing,
    None or empty are skipped, so the result never starts with a stray
    separator and a record lacking one of the keys does not raise KeyError.

    Args:
        Addressblock: Mapping holding the address lines, or a falsy value
            (None / empty mapping) when no address is available.

    Returns:
        The combined address string, or '' when nothing is populated.
    """
    if not Addressblock:
        return ''

    lines = (
        Addressblock.get(key)
        for key in ('address1', 'address2', 'address3', 'address4')
    )
    return ', '.join(line for line in lines if line)



def _fetch_json(session, url, params=None):
    """GET *url* through *session* and return the decoded JSON body.

    Raises requests.HTTPError on a non-2xx response instead of letting an
    error payload be parsed as if it were data.
    """
    # A timeout stops one stalled connection from hanging the whole run.
    response = session.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.json()


def _search_row(item):
    """Build the initial output row (name, nzbn, registration date) for one search hit."""
    return {
        'company_name': item.get('entityName'),
        'nzbn': item.get('nzbn'),
        # The literal string 'None' is the existing placeholder in the CSV output.
        'Registration Date': item.get('registrationDate', 'None'),
    }


def _active_directors(details):
    """Return the active directors' names joined by ',' (None when no roles are present)."""
    roles = details.get('roles')
    if roles is None:
        return None

    names = []
    for role in roles:
        if role.get('roleType') == 'Director' and role.get('roleStatus') == 'ACTIVE':
            person = role.get('rolePerson') or {}
            # Skip missing name parts rather than failing on None + str.
            full_name = " ".join(
                part for part in (person.get('firstName'), person.get('lastName')) if part
            )
            if full_name:
                names.append(full_name)
    return ",".join(names)


def _registered_address(details):
    """Return the formatted address taken from the second entry of the address list, or ''."""
    addresses = details.get('addresses') or {}
    address_list = addresses.get('addressList') or []
    # NOTE(review): index 1 is assumed to be the registered address - confirm
    # against the API schema; the position is not validated here.
    if len(address_list) > 1:
        return GetAddress(address_list[1])
    return ''


def _shareholder_names(details):
    """Return the distinct shareholder names joined by '; ', in first-seen order."""
    company = details.get('company-details') or {}
    shareholding = company.get('shareholding') or {}
    allocations = shareholding.get('shareAllocation') or []

    names = []
    for allocation in allocations:
        for holder in allocation.get('shareholder') or []:
            kind = holder.get('type')
            if kind == 'entity':  # some shareholders are companies
                name = (holder.get('otherShareholder') or {}).get('currentEntityName')
            elif kind in ('individual', 'director'):  # shareholders that are people
                full_name = (holder.get('individualShareholder') or {}).get('fullName')
                name = tidystring(full_name) if full_name else None
            else:
                continue
            if name:
                names.append(name)

    # dict.fromkeys de-duplicates while keeping insertion order, so the CSV
    # is reproducible between runs (a set would shuffle the names).
    return "; ".join(str(name) for name in dict.fromkeys(names))


def _industry_code(details):
    """Return the first industry classification code, or 0 when none is listed."""
    classifications = details.get('industryClassifications') or []
    if classifications:
        return classifications[0].get('classificationCode', 0)
    return 0


def get_business_details(search_term='plumb',
                         output_path=r'C:\Users\iVise User\Desktop\Ben\plumb.csv'):
    """Search the NZBN API for registered entities and export their details to CSV.

    First pages through the v3 entity search for *search_term* to collect each
    entity's name, NZBN and registration date. Then, for every NZBN, fetches
    the v4 entity record and extracts active directors, status, an address,
    shareholders and the first industry classification code. The combined
    table is written to *output_path* with a 1-based index column.

    Args:
        search_term: Text passed as the 'search-term' query parameter.
        output_path: Destination path of the CSV file.

    Returns:
        None. The result is the CSV file written as a side effect.

    Raises:
        requests.HTTPError: If any API request returns an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    search_url = 'https://api.business.govt.nz/services/v3/nzbn/entities'
    # NOTE: the bearer token is a placeholder; a real key must be supplied and
    # should ideally not live in source code.
    headers = {'Accept': 'application/json','Authorization':'Bearer ###########################'}
    search_params = {'search-term': search_term, 'entity-status': 'Registered'}

    columns = ['company_name', 'nzbn', 'directors', 'shareholders',
               'registeredAddress', 'industry code', 'Registration Date',
               'Status']

    with requests.Session() as session:
        session.headers.update(headers)

        # The first request (no 'page' parameter) also reports the paging
        # metadata needed to walk the remaining pages.
        first_page = _fetch_json(session, search_url, search_params)
        total_items = first_page['totalItems']
        page_size = first_page['pageSize']

        rows = [_search_row(item) for item in first_page['items']]

        extra_pages = total_items // page_size if page_size else 0
        for page_no in range(1, extra_pages + 1):
            page = _fetch_json(session, search_url, {**search_params, 'page': page_no})
            rows.extend(_search_row(item) for item in page['items'])

        # Build the frame from the collected rows instead of pre-allocating a
        # guessed number of empty rows; keep the 1-based index in the CSV.
        df = pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))
        df = df.dropna(subset=['nzbn'])

        # Collect per-entity values in plain lists and attach them as whole
        # columns afterwards; this avoids chained `df[col].iloc[i] = ...`
        # assignment, which may silently write to a temporary copy.
        detail_columns = {
            'directors': [],
            'shareholders': [],
            'registeredAddress': [],
            'industry code': [],
            'Status': [],
        }
        for nzbn_num in df['nzbn']:
            details = _fetch_json(
                session,
                f'https://api.business.govt.nz/services/v4/nzbn/entities/{nzbn_num}',
            )
            detail_columns['directors'].append(_active_directors(details))
            detail_columns['shareholders'].append(_shareholder_names(details))
            detail_columns['registeredAddress'].append(_registered_address(details))
            detail_columns['industry code'].append(_industry_code(details))
            detail_columns['Status'].append(details.get('entityStatusDescription', ""))

    df = df.assign(**detail_columns)
    df.to_csv(output_path)
    return

# In[56]:
# Run the export: queries the API and writes the CSV file as a side effect.
get_business_details()