In [1]:
import numpy as np
import pandas as pd
from cleaning_functions.clean_csv import *
from analyze.enums.csv_fetcher import CSV, read_csv
import inflection
from data_dictionaries.nhdb import data_dictionary
from data_dictionaries.ndhb_snake import snake_data_dictionary



In [2]:
# Set Pandas Display Options
pd.options.display.max_seq_items = 2000
pd.options.display.float_format = '{:.0f}'.format

In [3]:
# Load the Data
nhdb_dictionary =convert_dictionary(data_dictionary)
df = pd.read_csv("../data/raw/all_properties.csv")
df

  df = pd.read_csv("../data/raw/all_properties.csv")


Unnamed: 0,NHPDPropertyID,PropertyName,PropertyAddress,City,State,Zip,CBSACode,CBSAType,County,CountyCode,...,NumberActiveMR,NumberInconclusiveMR,NumberInactiveMR,Mr_1_Status,Mr_1_ProgramName,Mr_1_AssistedUnits,Mr_2_Status,Mr_2_ProgramName,Mr_2_AssistedUnits,OldNHPDPropertyID
0,1000072,SUBSIDIZED HOUSING CORPORATION 65,1356 Ashport St,Pomona,CA,91768-2801,31080,Metropolitan Statistical Area,Los Angeles,6037,...,0,0,0,,,,,,,
1,1001300,CHRISTOPHER HOMES INC,17 Christopher Cir,Searcy,AR,72143-4769,42620,Micropolitan Statistical Area,White,5145,...,0,0,0,,,,,,,
2,1044315,AMBER WOODS COOPERATIVE V,10202 John Jay Dr,Indianapolis,IN,46235-2327,26900,Metropolitan Statistical Area,Marion,18097,...,0,0,0,,,,,,,
3,1000049,SUBSIDIZED HOUSING CORPORATION 4,232 S Avenue 56,Los Angeles,CA,90042-4610,31080,Metropolitan Statistical Area,Los Angeles,6037,...,0,0,0,,,,,,,
4,1052703,ELDRIDGE BARSTOW,3920 Hallowing Point Rd,Prince Frederick,MD,20678-3443,47900,Metropolitan Statistical Area,Calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 Buena Vista St,San Antonio,TX,78207-3853,41700,Metropolitan Statistical Area,Bexar,48029,...,1,0,0,Active,Mod Rehab,50,,,,
117325,1154092,,184 Elmwood Ave,Burlington,VT,05401-4265,15540,Metropolitan Statistical Area,Chittenden,50007,...,1,0,0,Active,Mod Rehab,16,,,,
117326,1154094,,229 W 2nd Ave,Spokane,WA,99201-3636,44060,Metropolitan Statistical Area,Spokane,53063,...,1,0,0,Active,Mod Rehab,21,,,,
117327,1154095,Seaton Taylor Apartments,402 7TH AVE,HUNTINGTON,WV,25701,26580,,Cabell,54011,...,1,0,0,Active,Mod Rehab,12,,,,


In [4]:
# Clean the column headers
df = clean_headers(df, [remove_special_for_words, truncate, inflection.underscore, snake_case, lower_case])
df.rename(columns = {"percentof_eli_households":"percent_of_eli_households"}, inplace=True)
df

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,SUBSIDIZED HOUSING CORPORATION 65,1356 Ashport St,Pomona,CA,91768-2801,31080,Metropolitan Statistical Area,Los Angeles,6037,...,0,0,0,,,,,,,
1,1001300,CHRISTOPHER HOMES INC,17 Christopher Cir,Searcy,AR,72143-4769,42620,Micropolitan Statistical Area,White,5145,...,0,0,0,,,,,,,
2,1044315,AMBER WOODS COOPERATIVE V,10202 John Jay Dr,Indianapolis,IN,46235-2327,26900,Metropolitan Statistical Area,Marion,18097,...,0,0,0,,,,,,,
3,1000049,SUBSIDIZED HOUSING CORPORATION 4,232 S Avenue 56,Los Angeles,CA,90042-4610,31080,Metropolitan Statistical Area,Los Angeles,6037,...,0,0,0,,,,,,,
4,1052703,ELDRIDGE BARSTOW,3920 Hallowing Point Rd,Prince Frederick,MD,20678-3443,47900,Metropolitan Statistical Area,Calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 Buena Vista St,San Antonio,TX,78207-3853,41700,Metropolitan Statistical Area,Bexar,48029,...,1,0,0,Active,Mod Rehab,50,,,,
117325,1154092,,184 Elmwood Ave,Burlington,VT,05401-4265,15540,Metropolitan Statistical Area,Chittenden,50007,...,1,0,0,Active,Mod Rehab,16,,,,
117326,1154094,,229 W 2nd Ave,Spokane,WA,99201-3636,44060,Metropolitan Statistical Area,Spokane,53063,...,1,0,0,Active,Mod Rehab,21,,,,
117327,1154095,Seaton Taylor Apartments,402 7TH AVE,HUNTINGTON,WV,25701,26580,,Cabell,54011,...,1,0,0,Active,Mod Rehab,12,,,,


In [5]:
#sort the string and int cols into categories based on processing needs

#string cols
id_cols = []
no_special_descriptive_cols = ['property_name', 'property_address', 'city', 'state', 'cbsa_type', 'county', 'property_status', 'owner', 'owner_type','manager_name', 'manager_type', 'target_tenant_type']
special_protected_descriptive_cols = []
int_cols = []

# sort the string cols by correction needed
for col in df.columns:
    name = col.split("_")
    if name[-1] in ['status', 'desc', 'type']:
        no_special_descriptive_cols.append(col)
    elif name[-1]  == 'name':
        special_protected_descriptive_cols.append(col)
    elif name[-1] == 'id':
        id_cols.append(col)
    elif name[-1] in ["subsidies", "units"] or name[0] == "number":
        int_cols.append(col)
    else:
        continue

In [6]:
# Clean the string columns that need special characters removed
df = clean_columns(df, no_special_descriptive_cols, [remove_special_for_words, truncate, lower_case])
df

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620,micropolitan statistical area,white,5145,...,0,0,0,,,,,,,
2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900,metropolitan statistical area,marion,18097,...,0,0,0,,,,,,,
3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900,metropolitan statistical area,calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700,metropolitan statistical area,bexar,48029,...,1,0,0,active,Mod Rehab,50,,,,
117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540,metropolitan statistical area,chittenden,50007,...,1,0,0,active,Mod Rehab,16,,,,
117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060,metropolitan statistical area,spokane,53063,...,1,0,0,active,Mod Rehab,21,,,,
117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580,,cabell,54011,...,1,0,0,active,Mod Rehab,12,,,,


In [7]:
# Clean the string columns that need special characters to remain
df = clean_columns(df, special_protected_descriptive_cols, [truncate, lower_case])
df

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620,micropolitan statistical area,white,5145,...,0,0,0,,,,,,,
2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900,metropolitan statistical area,marion,18097,...,0,0,0,,,,,,,
3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900,metropolitan statistical area,calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700,metropolitan statistical area,bexar,48029,...,1,0,0,active,mod rehab,50,,,,
117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540,metropolitan statistical area,chittenden,50007,...,1,0,0,active,mod rehab,16,,,,
117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060,metropolitan statistical area,spokane,53063,...,1,0,0,active,mod rehab,21,,,,
117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580,,cabell,54011,...,1,0,0,active,mod rehab,12,,,,


In [10]:
# fix int columns, removing trailing zeros
def convert_to_int_with_nones(entry):
    try: 
        return int(entry)
    except TypeError:
        return entry
    except ValueError:
        entry = str(entry)
        if "." in entry:
            split_entry = str(entry).split(".")
            if int(split_entry[-1]) == 0:
                return int(split_entry[0])
            return int(entry.split(".")[0])
        else:
            return entry
clean_columns(df, int_cols, [convert_to_int_with_nones])

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620,micropolitan statistical area,white,5145,...,0,0,0,,,,,,,
2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900,metropolitan statistical area,marion,18097,...,0,0,0,,,,,,,
3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900,metropolitan statistical area,calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700,metropolitan statistical area,bexar,48029,...,1,0,0,active,mod rehab,50,,,,
117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540,metropolitan statistical area,chittenden,50007,...,1,0,0,active,mod rehab,16,,,,
117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060,metropolitan statistical area,spokane,53063,...,1,0,0,active,mod rehab,21,,,,
117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580,,cabell,54011,...,1,0,0,active,mod rehab,12,,,,


In [11]:
# fix date columns
# Note that there is a funky date that gets replaced with missing. 
# "OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 8/1/6439, at position 0"
def fix_date(entry):
    return pd.to_datetime(entry, errors='coerce')

    
date_columns = snake_data_dictionary["datetime64[ns]"]
clean_columns(df, date_columns, [fix_date])
df

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620,micropolitan statistical area,white,5145,...,0,0,0,,,,,,,
2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900,metropolitan statistical area,marion,18097,...,0,0,0,,,,,,,
3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080,metropolitan statistical area,los angeles,6037,...,0,0,0,,,,,,,
4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900,metropolitan statistical area,calvert,24009,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700,metropolitan statistical area,bexar,48029,...,1,0,0,active,mod rehab,50,,,,
117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540,metropolitan statistical area,chittenden,50007,...,1,0,0,active,mod rehab,16,,,,
117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060,metropolitan statistical area,spokane,53063,...,1,0,0,active,mod rehab,21,,,,
117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580,,cabell,54011,...,1,0,0,active,mod rehab,12,,,,


In [12]:
# replace nan strings with none
def replace_nan_with_none(entry):
    entry = str(entry).lower()
    if entry in ['nan', 'none', 'nat']:
        return None
    else:
        return entry

df = clean_columns(df, df.columns, [replace_nan_with_none])
df

Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,county_code,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080.0,metropolitan statistical area,los angeles,6037.0,...,0,0,0,,,,,,,
1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620.0,micropolitan statistical area,white,5145.0,...,0,0,0,,,,,,,
2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900.0,metropolitan statistical area,marion,18097.0,...,0,0,0,,,,,,,
3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080.0,metropolitan statistical area,los angeles,6037.0,...,0,0,0,,,,,,,
4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900.0,metropolitan statistical area,calvert,24009.0,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700.0,metropolitan statistical area,bexar,48029.0,...,1,0,0,active,mod rehab,50,,,,
117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540.0,metropolitan statistical area,chittenden,50007.0,...,1,0,0,active,mod rehab,16,,,,
117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060.0,metropolitan statistical area,spokane,53063.0,...,1,0,0,active,mod rehab,21,,,,
117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580.0,,cabell,54011.0,...,1,0,0,active,mod rehab,12,,,,


In [13]:
# split data into separate dataframes by program?

In [22]:
# Cast the data
df = df.astype(dtype=convert_dictionary(snake_data_dictionary))
types = pd.DataFrame(df.dtypes)
types[0].loc[types[0] == "object"]

Series([], Name: 0, dtype: object)

In [21]:
# Try to Fix Object Columns
# After all that prep, two variables still remain in "object" format: "total_inactive_subsities" and "average_months_of_tenancy". Let's investigate and try to fix it before declaring the data clean
# df["average_months_of_tenancy"].astype(Fl)
df['total_inactive_subsidies'] = df['total_inactive_subsidies'].astype(int)
df['average_months_of_tenancy'] = df['average_months_of_tenancy'].astype('float64')

dtype('float64')

In [25]:
# write out data
outfile_path = '../data/processed/nhdb.csv'
df.to_csv(outfile_path)

In [26]:
# test the data by reading it back in
df = pd.read_csv(outfile_path)
df

  pd.read_csv(outfile_path)


Unnamed: 0.1,Unnamed: 0,nhpd_property_id,property_name,property_address,city,state,zip,cbsa_code,cbsa_type,county,...,number_active_mr,number_inconclusive_mr,number_inactive_mr,mr_1_status,mr_1_program_name,mr_1_assisted_units,mr_2_status,mr_2_program_name,mr_2_assisted_units,old_nhpd_property_id
0,0,1000072,subsidized housing corporation 65,1356 ashport st,pomona,ca,91768-2801,31080,metropolitan statistical area,los angeles,...,0,0,0,,,,,,,
1,1,1001300,christopher homes inc,17 christopher cir,searcy,ar,72143-4769,42620,micropolitan statistical area,white,...,0,0,0,,,,,,,
2,2,1044315,amber woods cooperative v,10202 john jay dr,indianapolis,in,46235-2327,26900,metropolitan statistical area,marion,...,0,0,0,,,,,,,
3,3,1000049,subsidized housing corporation 4,232 s avenue 56,los angeles,ca,90042-4610,31080,metropolitan statistical area,los angeles,...,0,0,0,,,,,,,
4,4,1052703,eldridge barstow,3920 hallowing point rd,prince frederick,md,20678-3443,47900,metropolitan statistical area,calvert,...,0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117324,117324,1154091,,1601 buena vista st,san antonio,tx,78207-3853,41700,metropolitan statistical area,bexar,...,1,0,0,active,mod rehab,50,,,,
117325,117325,1154092,,184 elmwood ave,burlington,vt,05401-4265,15540,metropolitan statistical area,chittenden,...,1,0,0,active,mod rehab,16,,,,
117326,117326,1154094,,229 w 2nd ave,spokane,wa,99201-3636,44060,metropolitan statistical area,spokane,...,1,0,0,active,mod rehab,21,,,,
117327,117327,1154095,seaton taylor apartments,402 7th ave,huntington,wv,25701,26580,,cabell,...,1,0,0,active,mod rehab,12,,,,
