Dataset - "Issued Building Permits"

Objective:
- Prepare the dataset for visualization so that the City of Vancouver's construction-related development can be analyzed, e.g. 'What kind of construction projects are trending?' and 'What local areas are being developed the most?'. 

Notes: 
- Dataset was obtained from the City of Vancouver's Open Data Portal (https://opendata.vancouver.ca)
- Dataset is licensed under the 'Open Government Licence - Vancouver' (https://opendata.vancouver.ca/pages/licence/).

- Dataset consists of data from 2017 to the day before the dataset was published and obtained on the day specified above.
- Original dataset needs to be in JSON format.

Columns:
- 'address': Address of building project
- 'permitelapseddays': Number of days between permit number being generated and the permit being issued
- 'yearmonth': Year and month when permit was issued
- Other columns: Self-explanatory

In [None]:
import os
import math
import string
import difflib
import ast
import re
from pathlib import Path
from random import randrange
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Functions

def import_data(dataset_name, data_dir_path):
    """Interactively pick a data file for the dataset and load it.

    The matching files in ``data_dir_path`` are listed, the user is asked
    for the index of the one to load, and the file is read according to
    its suffix ('.json', or '.csv' with ';' as separator).

    Returns:
        A ``(data, file_name)`` tuple. ``data`` is ``None`` when the chosen
        file has any other suffix.
    """
    candidates = _show_data_files(dataset_name, data_dir_path)
    chosen_path = _get_file_index(candidates)

    # One reader per supported suffix; anything else yields no data.
    readers = {
        '.json': pd.read_json,
        '.csv': lambda path: pd.read_csv(path, sep=';'),
    }
    reader = readers.get(chosen_path.suffix)
    data = reader(chosen_path) if reader is not None else None

    return data, chosen_path.name


def _show_data_files(dataset_name, data_dir_path):

    data_file_paths = list(data_dir_path.iterdir())
    relevant_file_paths = [path for path in data_file_paths if dataset_name.lower() in path.name.lower()]

    for index, file_path in enumerate(relevant_file_paths):
        print(f'({index}) {file_path}')

    return relevant_file_paths 


def _get_file_index(data_file_paths):

    while True:
        try:
            data_file_index = int(input('Enter the index of the data file to be imported, e.g. 0: '))
            data_file_path = data_file_paths[data_file_index]
        except (ValueError, IndexError, TypeError):
            print('Please try again with a valid index.')
        else: 
            return data_file_path


def unpack_geom(data):
    """Expand the 'geom' column into 'geomcoordinates' and 'geomtype'.

    Each 'geom' value is expanded with ``pd.Series``; its 'coordinates' and
    'type' entries become the new columns and 'geom' itself is removed.
    List coordinates are converted to tuples so they are hashable.

    Args:
        data: DataFrame with a 'geom' column.

    Returns:
        A new DataFrame; the caller's frame is not modified.
    """
    geom_data = data['geom'].apply(pd.Series)
    geom_data = geom_data.rename(columns={'coordinates': 'geomcoordinates', 'type': 'geomtype'})

    data = data.join(geom_data).drop(columns='geom')

    # Expanding a scalar 'geom' value (e.g. NaN) produces an extra column
    # labelled 0. It is absent when every row has a mapping, so dropping
    # it must not fail in that case.
    data = data.drop(columns=[0], errors='ignore')

    data['geomcoordinates'] = data['geomcoordinates'].apply(to_tuple)

    return data


def show_random_rows(data, n):
    """Return ``n`` randomly chosen rows of ``data``.

    Rows are drawn with replacement, so the same row may appear more than
    once.

    Args:
        data: DataFrame to sample from.
        n: Number of rows to draw.

    Returns:
        A DataFrame with the drawn rows (empty when ``data`` is empty).
    """
    row_count = len(data.index)
    if row_count == 0:
        # randrange(0) would raise; there is simply nothing to show.
        return data.iloc[0:0]

    positions = [randrange(row_count) for _ in range(n)]

    # The drawn numbers are positions, not index labels, so select with
    # iloc; loc would fail or pick wrong rows on a non-default index.
    return data.iloc[positions]


def get_metro_van_areas():
    """Fetch the 'Member' names from the Metro Vancouver Wikipedia page.

    Reads the HTML tables of the page, takes the second table and returns
    the 'Member' values of its first 23 rows as an array.
    """
    page_tables = pd.read_html('https://en.wikipedia.org/wiki/Metro_Vancouver_Regional_District#Membership')
    membership_table = page_tables[1]
    first_rows = membership_table[0:23]

    return first_rows['Member'].values


def to_tuple(value):
    """Return a list as a tuple; return any other value unchanged."""
    return tuple(value) if isinstance(value, list) else value


def remove_escape_chars(value):
    """Replace line breaks in a string with a single space.

    Handles '\\r\\n' as well as a lone '\\r' or '\\n'. The previous version
    searched for the literal text '/r/n' (forward slashes), which never
    matches a real carriage return / line feed.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value

    # Replace the two-character sequence first so it yields one space only.
    return value.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')


def remove_extra_whitespace(value, multiple=False):
    """Tidy the whitespace of a string; pass other values through.

    With ``multiple=True`` every run of whitespace is collapsed to one
    space and the ends are trimmed. Otherwise only whitespace directly in
    front of ',', ':', '(' or ')' is removed.
    """
    if not isinstance(value, str):
        return value

    # Only the literal True selects the "collapse everything" mode.
    if multiple is True:
        words = value.split()
        return ' '.join(words)

    return re.sub(r'\s+(?=[,:\(\)])', '', value)


def add_space_postal_code(postal_code):
    """Separate the two halves of a trailing postal code with one space.

    The last two groups of three alphanumeric characters at the end of the
    string (optionally separated by whitespace or hyphens) are rewritten as
    'XXX XXX'; trailing whitespace is dropped. Non-string values are
    returned unchanged.
    """
    if not isinstance(postal_code, str):  # Not a postal code
        return postal_code

    trailing_code = r'([A-Za-z0-9]{3})[\s-]*([A-Za-z0-9]{3})\s*$'
    spaced_halves = r'\1 \2'

    return re.sub(trailing_code, spaced_halves, postal_code)


def extract_address_components(address_col):
    """Split a column of address strings into their components.

    Args:
        address_col: Series of address strings.

    Returns:
        DataFrame with the columns 'title', 'unit_number', 'house_number',
        'street', 'city', 'province' and 'postal_code'. Addresses that do
        not match the pattern yield NaN in every column.
    """
    # Building blocks of the pattern; each one is a capturing group.
    title = r'(#|Unit|Suite|PO\sBOX)'
    unit_num = r'([0-9]*)'
    house_num = r'([0-9]+)'
    street = r'([A-Za-z0-9][A-Za-z0-9\s\.]*[A-Za-z0-9\.])'
    city = r'([A-Za-z]*)'
    province = r'([A-Za-z]{0,3})'
    postal_code = r'([A-Za-z0-9]{3}[\s-][A-Za-z0-9]{3})'

    # Layout: [title/unit prefix] house number [street] [title/unit suffix],
    # city, province [postal code]. The unit may appear before the house
    # number or after the street, hence the two title/unit_num occurrences.
    address_pattern = (rf'({title}?\s?{unit_num}[,\s–-]+)?{house_num}\s?'
                       rf'({street}(?=\sUnit)|{street}(?!\sUnit))?([,\s–-]+'
                       rf'{title}?\s?{unit_num})?,?\s'
                       rf'{city},\s?{province}\s?{postal_code}?\s*$')

    # str.extract labels the columns 0..12 in the order the groups open.
    address_components = address_col.str.extract(address_pattern, flags=re.IGNORECASE)
    # Column 0 is the wrapper group around the leading title/unit prefix.
    address_components.drop(columns=[0], inplace=True)
    # Column 4 is the wrapper around the two street alternatives; 8 and 9
    # are the title/unit captured after the street.
    address_components.rename(columns={1:'title', 2:'unit_number', 3:'house_number', 4:'street', 
                                       8: 'title2',9:'unit_number2', 10:'city', 11:'province', 12:'postal_code'}, inplace=True)
    # Use the trailing title/unit when no leading one was found.
    address_components = address_components.apply(copy_unit_address_data, axis=1)
    # 5 and 6 are the individual street alternatives and 7 the wrapper of
    # the trailing unit part; none of them is needed any more.
    address_components.drop(columns=[5, 6, 7, 'title2', 'unit_number2'], inplace=True)
    
    return address_components


def address_df_check(address_df):
    """Print how many rows are entirely NaN and return those rows."""
    all_missing = address_df.isna().all(axis=1)
    missing = address_df[all_missing]

    print(f'Number of "NaN" rows: {missing.shape[0]}')

    return missing


def expand_direction(street):
    """Spell out stand-alone N/E/W/S letters as North/East/West/South.

    Matching is case-insensitive and only applies to letters that form a
    whole word. Non-string values are returned unchanged.
    """
    if not isinstance(street, str):
        return street

    full_names = {'N': 'North', 'E': 'East', 'W': 'West', 'S': 'South'}

    expanded = street
    for abbreviation, full_name in full_names.items():
        expanded = re.sub(rf'\b{abbreviation}\b', full_name, expanded, flags=re.IGNORECASE)

    return expanded


def expand_road_type(street):
    """Spell out a trailing Ave/St/Rd as Avenue/Street/Road.

    Only an abbreviation at the very end of the string (ignoring trailing
    whitespace) is expanded; matching is case-insensitive. Non-string
    values are returned unchanged.
    """
    if not isinstance(street, str):
        return street

    full_names = {'Ave': 'Avenue', 'St': 'Street', 'Rd': 'Road'}

    expanded = street
    for abbreviation, full_name in full_names.items():
        expanded = re.sub(rf'\b{abbreviation}\b\s*$', full_name, expanded, flags=re.IGNORECASE)

    return expanded


def copy_unit_address_data(row):
    """Fall back to 'title2'/'unit_number2' when the primary pair is empty.

    The row is changed in place and returned. Nothing is copied unless
    both 'title' and 'unit_number' are missing.
    """
    has_primary_value = not pd.isna(row['title']) or not pd.isna(row['unit_number'])
    if has_primary_value:
        return row

    row['title'] = row['title2']
    row['unit_number'] = row['unit_number2']

    return row


def clean_address_df(address_df):
    """Normalize the components produced by ``extract_address_components``.

    The frame is modified in place and also returned. Steps:
    empty strings become NaN, city names are matched against
    ``metro_van_cities``, missing or '#' titles become 'Unit', the number
    of a PO box is moved from 'house_number' to 'unit_number', and street
    names get consistent capitalization with directions and road types
    spelled out.

    Args:
        address_df: DataFrame with at least the columns 'title',
            'unit_number', 'house_number', 'street' and 'city'.

    Returns:
        The same DataFrame object, cleaned.
    """

    def _is_po_box(title):
        # Titles are extracted case-insensitively, so 'PO Box' or 'po box'
        # must be recognized too. NaN / non-string titles never match.
        return isinstance(title, str) and ' '.join(title.split()).upper() == 'PO BOX'

    address_df.replace('', float('nan'), inplace=True)

    # Correct misspellings in 'city' column
    address_df['city'] = address_df['city'].apply(predict_city, possibilities=metro_van_cities)

    # Clean 'title' column
    address_df.loc[(address_df['title'].isna()) & ~(address_df['unit_number'].isna()), 'title'] = 'Unit'
    address_df.loc[address_df['title'] == '#', 'title'] = 'Unit'

    # Correct 'unit_number' and 'house_number' for PO Boxes: the box number
    # is captured as the house number, so the two columns are swapped.
    po_box_titles = address_df['title'].apply(_is_po_box).astype(bool)
    mask = po_box_titles & address_df['unit_number'].isna()
    address_df.loc[mask, ['unit_number', 'house_number']] = address_df.loc[mask, ['house_number', 'unit_number']].values

    # Improve consistency of 'street' column
    address_df['street'] = address_df['street'].apply(fix_capitalization)
    address_df['street'] = address_df['street'].apply(expand_direction)
    address_df['street'] = address_df['street'].apply(expand_road_type)

    return address_df


def predict_city(city, possibilities):
    """Return the known city name that ``city`` most likely refers to.

    The comparison ignores case on both sides. Previously only ``city``
    was lower-cased, so every capital letter in a candidate counted as a
    mismatch and e.g. 'LIONS BAY' was not recognized as 'Lions Bay'.

    Args:
        city: Value to correct. Non-string values are returned unchanged.
        possibilities: Iterable of correctly spelled city names.

    Returns:
        The best candidate with a similarity of at least 0.8, spelled as it
        appears in ``possibilities``; otherwise ``city`` itself.
    """
    if not isinstance(city, str):
        return city

    # Map the lower-cased spelling back to the canonical one (first wins).
    canonical_by_lower = {}
    for candidate in possibilities:
        canonical_by_lower.setdefault(str(candidate).lower(), candidate)

    close_matches = difflib.get_close_matches(
        city.lower(), possibilities=list(canonical_by_lower), n=1, cutoff=0.8
    )

    if close_matches:
        return canonical_by_lower[close_matches[0]]

    return city


def fix_capitalization(value):
    """Capitalize each word of a string; return other values unchanged."""
    return string.capwords(value) if isinstance(value, str) else value


def export_data(data, dataset_name, data_file_name, filetype='csv'):
    """Write the processed data to ``DATA_DIR_PATH``.

    The output name is the '<YYYY-MM-DD-HHMM>_<dataset_name>' prefix of the
    imported file name followed by '_(processed_<UTC timestamp>).<filetype>'.

    Args:
        data: DataFrame to export.
        dataset_name: Dataset name expected in ``data_file_name``.
        data_file_name: Name of the file the data was imported from.
        filetype: 'csv' (';'-separated, the default) or 'json'.

    Raises:
        ValueError: If ``filetype`` is not supported or ``data_file_name``
            does not start with the expected prefix.
    """
    from datetime import timezone  # local: only needed for the UTC stamp

    # Previously the data was always written as CSV, whatever extension
    # was requested.
    if filetype not in ('csv', 'json'):
        raise ValueError(f"Unsupported filetype '{filetype}'; use 'csv' or 'json'.")

    # datetime.utcnow() is deprecated; the formatted result is the same.
    date_time_now = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H%M')

    # Escape the dataset name so it is matched literally.
    name_match = re.match(rf'\d{{4}}-\d{{2}}-\d{{2}}-\d{{4}}_{re.escape(dataset_name)}', data_file_name)
    if name_match is None:
        raise ValueError(
            f"'{data_file_name}' does not start with '<YYYY-MM-DD-HHMM>_{dataset_name}'."
        )

    new_name = f'{name_match.group()}_(processed_{date_time_now}).{filetype}'
    full_path = DATA_DIR_PATH / new_name

    if filetype == 'csv':
        data.to_csv(full_path, sep=';')
    else:
        data.to_json(full_path)

    print(f'{new_name} has been successfully exported to {DATA_DIR_PATH}')

In [None]:
# CONSTANTS

DATASET_NAME = 'issued-building-permits'
DATA_DIR_PATH = Path.cwd().joinpath('drive', 'MyDrive', 'Vancouver Datasets')

# Other Useful Data

# Data saved from Wikipedia
metro_van_members = [
    'Anmore',
    'Belcarra',
    'Bowen Island',
    'Burnaby',
    'Coquitlam',
    'Delta',
    'City of Langley',
    'Township of Langley',
    'Lions Bay',
    'Maple Ridge',
    'Metro Vancouver A',
    'New Westminster',
    'City of North Vancouver',
    'District of North Vancouver',
    'Pitt Meadows',
    'Port Coquitlam',
    'Port Moody',
    'Richmond',
    'Surrey',
    'Tsawwassen',
    'Vancouver',
    'West Vancouver',
    'White Rock',
]

# Member names plus a few short forms, sorted; used as the candidates
# when correcting city names.
metro_van_cities = np.sort(
    np.concatenate([metro_van_members, ['Langley', 'Fort Langley', 'North Vancouver']])
)

Import Data - Load Data

In [None]:
# Interactive: lists the matching files in DATA_DIR_PATH and asks which one to load.
# DATA_FILE_NAME is the name of the chosen file (passed to export_data in the export cell).
original_data, DATA_FILE_NAME = import_data(DATASET_NAME, DATA_DIR_PATH)
original_data.head()

Import Data - Unpack Data

In [None]:
# Unpack 'fields' column for actual data on issued building permits
# (each 'fields' value is expanded into one column per entry;
#  presumably every value is a dict-like record — TODO confirm)
data = original_data['fields'].apply(pd.Series)

In [None]:
# Drop 'geom' column (Column data is redundant and coordinate data is incorrect.)
data.drop(columns='geom', inplace=True)

# Change 'geo_point_2d' values from lists to hashable data type so that df.nunique() will work
data['geo_point_2d'] = data['geo_point_2d'].apply(to_tuple)

# Convert 'issuedate' from str to datetime.datetime or pd.Timestamp
# (each value is parsed on its own by pd.to_datetime)
data['issuedate'] = data['issuedate'].apply(pd.to_datetime)

In [None]:
# CHECK: first five rows after unpacking and type conversion
data.head(5)

Analyze Data - Preliminary

In [None]:
# Overview: column labels and total row count
print(f'Column names: {data.columns}')
print(f'Number of rows: {len(data.index)}')

In [None]:
# Columns holding address text; they are cleaned together further below
address_cols = ['address', 'applicantaddress', 'buildingcontractoraddress']

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
# Number of distinct values per column (requires hashable cell values)
data.nunique()

In [None]:
# Number of missing values per column
data.isna().sum()

In [None]:
# CHECK: spot-check five randomly chosen rows
show_random_rows(data, 5)

Clean Data

Clean Data - Remove Escape Characters and Adjust Whitespace

In [None]:
# Remove escape characters, i.e. '\r\n', in text
# (only the applicant and building-contractor address columns are processed here)
data['applicantaddress'] = data['applicantaddress'].apply(remove_escape_chars)
data['buildingcontractoraddress'] = data['buildingcontractoraddress'].apply(remove_escape_chars)

In [None]:
# Remove unnecessary amount of whitespace: collapse every run of whitespace
# in string cells to a single space
# NOTE(review): DataFrame.applymap is deprecated in newer pandas releases in
# favour of DataFrame.map — confirm against the installed version
data = data.applymap(remove_extra_whitespace, multiple=True)

# Remove whitespace that directly precedes ',', ':', '(' or ')' in the address columns
data[address_cols] = data[address_cols].applymap(remove_extra_whitespace, multiple=False)

# Add any missing whitespace in postal codes
data[address_cols] = data[address_cols].applymap(add_space_postal_code)

Clean Data - Extract Address Components from 'Address' Columns

In [None]:
# Building / Project Address: split into components, then normalize them
building_address_df = extract_address_components(data['address'])
building_address_df = clean_address_df(building_address_df)

In [None]:
# Spot-check five randomly chosen parsed building addresses
show_random_rows(building_address_df, 5)

In [None]:
# Building Contractor Address: split into components, then normalize them
buildingcon_address_df = extract_address_components(data['buildingcontractoraddress'])
buildingcon_address_df = clean_address_df(buildingcon_address_df)

In [None]:
# Spot-check five randomly chosen parsed contractor addresses
show_random_rows(buildingcon_address_df, 5)

In [None]:
# Applicant Address: split into components, then normalize them
applicant_address_df = extract_address_components(data['applicantaddress'])
applicant_address_df = clean_address_df(applicant_address_df)

In [None]:
# Spot-check five randomly chosen parsed applicant addresses
show_random_rows(applicant_address_df, 5)

Clean Data - Fill in Missing Data

In [None]:
# Mark missing categories explicitly as 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment, which is deprecated and, with
# pandas Copy-on-Write, leaves 'data' unchanged.
data['specificusecategory'] = data['specificusecategory'].fillna('Unknown')
data['propertyuse'] = data['propertyuse'].fillna('Unknown')

Clean Data - Remove Redundant Data

Removed:
- 'issueyear': Column contains only information already present in 'issuedate' and does not offer any additional value. 
- 'yearmonth': Same reason as 'issueyear'.

In [None]:
# Remove redundant data ('issuedate' already carries the year and month)
data.drop(columns=['issueyear', 'yearmonth'], inplace=True)

Clean Data - Rename and Reorder Columns

In [None]:
# Rename columns
data.rename(columns={'issuedate':'permitissuedate'}, inplace=True)

# Reorder columns (alphabetically by column name)
data = data.reindex(sorted(data.columns), axis=1)

Clean Data - Checks

In [None]:
# Final spot-check of five randomly chosen cleaned rows
show_random_rows(data, 5)

Export Data

In [None]:
# Export is disabled by default; uncomment the call below to write the
# processed data to DATA_DIR_PATH.
# export_data(data, DATASET_NAME, DATA_FILE_NAME)