In [None]:
'''
Overview

download latest acra data
combine multiple files into one
clean and reorganise columns to make it user friendly
'''

In [45]:
# imports
import pandas as pd
import numpy as np
import glob
import requests
import os
import re
from zipfile import ZipFile
from io import BytesIO
from pathlib import Path
from tqdm.notebook import tqdm_notebook

In [30]:
# define acra data download url
# https://data.gov.sg/dataset/acra-information-on-corporate-entities
acra_download_url = 'https://data.gov.sg/dataset/21d477f2-6e1b-4232-82b3-59e804dc2f6a/download'
# define folder to extract acra data files
acra_folder = 'acra-information-on-corporate-entities'
# define metadata file name
metadata_filename = 'metadata-acra-information-on-corporate-entities.txt'

In [32]:
# download data from url
acra_data_zip = requests.get(acra_download_url)
# extract files from zip into a folder
acra_data_zip = ZipFile(BytesIO(acra_data_zip.content))
acra_data_zip.extractall(acra_folder)

In [83]:
# read metadata as one string
metadata = Path(os.path.join(acra_folder, metadata_filename)).read_text()
# define regex pattern to extract last updated date
last_updated_date_pattern = 'Last Updated: \'(\d{4}-\d{2}-\d{2})'
# get first instance of 'Last Updated' date
last_updated = re.findall(last_updated_date_pattern, metadata)[0]

In [133]:
# get a list of all csv files
list_of_files = glob.glob(acra_folder+'\\*.csv')

# create empty dataframe to store data
full_df = pd.DataFrame()
# loop through each file and concat to df
for file in tqdm_notebook(list_of_files[:1]):
    # read csv
    df = pd.read_csv(file)
    # concat df to full_df
    full_df = pd.concat([full_df, df])

# replace 'na' with nan
full_df = full_df.replace('na', np.nan)

  0%|          | 0/1 [00:00<?, ?it/s]

In [134]:
# define date columns
date_columns = ['uen_issue_date', 'registration_incorporation_date', 'account_due_date', 'annual_return_date']
# convert datetime columns to date
full_df[date_columns] = full_df[date_columns].apply(pd.to_datetime)

In [135]:
# define organised list of columns
clean_columns = [
    'uen',
    'uen_issue_date',
    'registration_incorporation_date',
    'issuance_agency_id',
    'entity_name',
    'entity_status_description',
    'entity_type_description',
    'business_constitution_description',
    'company_type_description',
    'primary_ssic_code',
    'primary_ssic_description',
    'primary_user_described_activity',
    'secondary_ssic_code',
    'secondary_ssic_description',
    'secondary_user_described_activity',
    'address_type',
    'block',
    'street_name',
    'building_name',
    'level_no',
    'unit_no',
    'postal_code',
    'other_address_line1',
    'other_address_line2',
    'account_due_date',
    'annual_return_date',
    'no_of_charges',
    'no_of_officers',
    'paf_constitution_description',
    'uen_of_audit_firm1',
    'name_of_audit_firm1',
    'uen_of_audit_firm2',
    'name_of_audit_firm2',
    'uen_of_audit_firm3',
    'name_of_audit_firm3',
    'uen_of_audit_firm4',
    'name_of_audit_firm4',
    'uen_of_audit_firm5',
    'name_of_audit_firm5',
    'former_entity_name1',
    'former_entity_name2',
    'former_entity_name3',
    'former_entity_name4',
    'former_entity_name5',
    'former_entity_name6',
    'former_entity_name7',
    'former_entity_name8',
    'former_entity_name9',
    'former_entity_name10',
    'former_entity_name11',
    'former_entity_name12',
    'former_entity_name13',
    'former_entity_name14',
    'former_entity_name15',
    'paid_up_capital1_currency',
    'paid_up_capital1_ordinary',
    'paid_up_capital1_others',
    'paid_up_capital1_preference',
    'paid_up_capital2_currency',
    'paid_up_capital2_ordinary',
    'paid_up_capital2_others',
    'paid_up_capital2_preference',
    'paid_up_capital3_currency',
    'paid_up_capital3_ordinary',
    'paid_up_capital3_others',
    'paid_up_capital3_preference',
    'paid_up_capital4_currency',
    'paid_up_capital4_ordinary',
    'paid_up_capital4_others',
    'paid_up_capital4_preference',
    'paid_up_capital5_currency',
    'paid_up_capital5_ordinary',
    'paid_up_capital5_others',
    'paid_up_capital5_preference',
    'paid_up_capital6_currency',
    'paid_up_capital6_ordinary',
    'paid_up_capital6_others',
    'paid_up_capital6_preference',
    'paid_up_capital7_currency',
    'paid_up_capital7_ordinary',
    'paid_up_capital7_others',
    'paid_up_capital7_preference',
    'paid_up_capital8_currency',
    'paid_up_capital8_ordinary',
    'paid_up_capital8_others',
    'paid_up_capital8_preference',
    'paid_up_capital9_currency',
    'paid_up_capital9_ordinary',
    'paid_up_capital9_others',
    'paid_up_capital9_preference',
    'paid_up_capital10_currency',
    'paid_up_capital10_ordinary',
    'paid_up_capital10_others',
    'paid_up_capital10_preference'
    ]

In [137]:
# reorganise columns
full_df = full_df[clean_columns]

# display
display(full_df)

Unnamed: 0,uen,uen_issue_date,registration_incorporation_date,issuance_agency_id,entity_name,entity_status_description,entity_type_description,business_constitution_description,company_type_description,primary_ssic_code,...,paid_up_capital8_others,paid_up_capital8_preference,paid_up_capital9_currency,paid_up_capital9_ordinary,paid_up_capital9_others,paid_up_capital9_preference,paid_up_capital10_currency,paid_up_capital10_ordinary,paid_up_capital10_others,paid_up_capital10_preference
0,53011372D,2008-09-12,2004-01-07 14:33:42,ACRA,A & E ENGINEERING SERVICES,Ceased Registration,Business,Sole Proprietor,,95220,...,,,,,,,,,,
1,53358494M,2017-03-15,2017-03-15 12:31:24,ACRA,A & Q TRANSIT,Cancelled (Non-Renewal),Business,Sole Proprietor,,49212,...,,,,,,,,,,
2,53043572A,2008-09-13,2005-04-29 01:38:25,ACRA,A AND Y CONSULTANCY SERVICES,Cancelled,Business,Sole Proprietor,,70202,...,,,,,,,,,,
3,53065972K,2008-09-13,2006-04-10 10:36:33,ACRA,A COFFEE SHOP,Cancelled,Business,Sole Proprietor,,68104,...,,,,,,,,,,
4,53028864L,2008-09-12,2004-09-09 17:07:45,ACRA,A DIXIT & ASSOCIATES,Cancelled,Business,Sole Proprietor,,70209,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145788,53184186C,2011-01-26,2011-01-25 13:39:11,ACRA,A_TALENT,Live,Business,Sole Proprietor,,78104,...,,,,,,,,,,
145789,53172807B,2010-07-31,2010-07-30 13:31:23,ACRA,A`VAIL HAIR STUDIO,Cancelled,Business,Sole Proprietor,,96021,...,,,,,,,,,,
145790,53387283M,2018-09-12,2018-09-12 19:29:29,ACRA,A|A DESIGN,Cancelled (Non-Renewal),Business,Sole Proprietor,,74192,...,,,,,,,,,,
145791,201505959R,2015-03-12,2015-03-05 17:33:39,ACRA,A~STAR PLASTIC LIGHT METAL INDUSTRIES PTE. LTD.,Struck Off,Local Company,,PRIVATE COMPANY LIMITED BY SHARES,38309,...,,,,,,,,,,


In [131]:
# loop through df and drop columns that are all nans
for col in full_df:
    # check all rows in column are nan
    if len(full_df[col]) == len(full_df[col].loc[(full_df[col].isna() == True)]):
        # drop column if all rows are nan
        full_df = full_df.drop(col, axis=1)

In [None]:
# write to file
full_df.to_csv(acra_folder+'-'+last_updated+'.csv', index=False)