In [26]:
import pandas as pd
from utils import *
from typing import Any

<h1>Cleaning a CSV</h1>

Read in a CSV

In [27]:
# Load the raw CSV into a dataframe.
# The path is relative, so it resolves against the notebook's working directory.
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s6_2022_ihe.csv"
df = pd.read_csv(infile_path)
# Bare last expression: renders the freshly loaded frame for inspection
df

Unnamed: 0,Owner,Location,PILOT Parcel No,Underlying Parcel Number,Begin Date,End Date,Assmt Taxes,PILOT Contract Amt,PILOT Debt Service Amt,Difference
0,HILLCREST MEMPHIS LP,"4155 EAST WIND DR., #1",IHE0000A000000,0790040000001C,10/01/2008,09/30/2028,"$53,193","$16,412",$0,"-$36,781"
1,ALCO PERSHING PARTNERS LP,0 STEELE,IHE0000B000000,"07101400000950, 07101400000960",08/01/2008,12/31/2028,"$40,315","$14,754",$0,"-$25,562"
2,CROCKETT PARK (AGNES PLACE APARTMENTS),2645 DAVEY DRIVE,IHE0001B000000,"0710560A00016C, 0710560A00046Z",05/01/2005,05/01/2025,"$60,985",$187,$0,"-$60,797"
3,"UPTOWN VILLAGE APARTMENTS, LP",0 THOMAS,IHE0001L000000,"0010960A000960, 0010960A000970",03/05/2004,03/04/2024,"$48,181","$1,984",$0,"-$46,197"
4,"UPTOWN SENIOR HOUSING DEVELOPMENT, LP",669 THIRD,IHE0001R000000,"0010560000002C, 00105600000070",04/19/2006,04/18/2026,"$41,349","$2,175",$0,"-$39,174"
...,...,...,...,...,...,...,...,...,...,...
102,SHELBY POINTE TOWNHOMES LLC,1500 EAST SHELBY DRIVE,IHE0007K000000,0790500A00025,04/14/2022,04/13/2042,"$46,330","$16,628",$0,"-$29,702"
103,SC BARTLETT INVESTORS LLC,2171 SYCAMORE VIEW RD,IHE0007L000000,088041000017C,03/01/2022,02/28/2042,"$64,513","$27,042",$0,"-$37,471"
104,"APP BREEZY PARTNERS, L.L.L.P",1500 OBERLE AVE,IHE0007N000000,070037000041C,09/01/2022,08/31/2042,"$55,026","$9,196",$0,"-$45,830"
105,SOUTH CITY V,457 BRIGHTSIDE,IHE0007O000000,702000000290,09/01/2022,08/31/2042,"$1,832",$765,$0,"-$1,067"


In [28]:
# Normalize the column headers by handing clean_headers the list of
# header-cleaning functions to use (helpers presumably come from `utils`).
df = clean_headers(
    df,
    [remove_special_for_words, truncate, snake_case, lower_case],
)
# Show the resulting header names
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

In [29]:
# Clean the free-text columns (owner name and street location)
string_columns = ["owner", "location"]
df = clean_columns(
    df,
    string_columns,
    [remove_special_for_words, truncate, lower_case],
)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,hillcrest memphis lp,4155 east wind dr 1,IHE0000A000000,0790040000001C,10/01/2008,09/30/2028,"$53,193","$16,412",$0,"-$36,781"
1,alco pershing partners lp,0 steele,IHE0000B000000,"07101400000950, 07101400000960",08/01/2008,12/31/2028,"$40,315","$14,754",$0,"-$25,562"
2,crockett park agnes place apartments,2645 davey drive,IHE0001B000000,"0710560A00016C, 0710560A00046Z",05/01/2005,05/01/2025,"$60,985",$187,$0,"-$60,797"
3,uptown village apartments lp,0 thomas,IHE0001L000000,"0010960A000960, 0010960A000970",03/05/2004,03/04/2024,"$48,181","$1,984",$0,"-$46,197"
4,uptown senior housing development lp,669 third,IHE0001R000000,"0010560000002C, 00105600000070",04/19/2006,04/18/2026,"$41,349","$2,175",$0,"-$39,174"
...,...,...,...,...,...,...,...,...,...,...
102,shelby pointe townhomes llc,1500 east shelby drive,IHE0007K000000,0790500A00025,04/14/2022,04/13/2042,"$46,330","$16,628",$0,"-$29,702"
103,sc bartlett investors llc,2171 sycamore view rd,IHE0007L000000,088041000017C,03/01/2022,02/28/2042,"$64,513","$27,042",$0,"-$37,471"
104,app breezy partners l l l p,1500 oberle ave,IHE0007N000000,070037000041C,09/01/2022,08/31/2042,"$55,026","$9,196",$0,"-$45,830"
105,south city v,457 brightside,IHE0007O000000,702000000290,09/01/2022,08/31/2042,"$1,832",$765,$0,"-$1,067"


In [30]:
# List the current column names (handy reference when picking the columns cleaned in later cells)
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

In [31]:
# Clean the dollar-amount columns so they can later be cast to numbers
numeric_columns = [
    'assmt_taxes',
    'pilot_contract_amt',
    'pilot_debt_service_amt',
    'difference',
]
df = clean_columns(
    df,
    numeric_columns,
    [remove_special_for_numbers, convert_only_dash_to_missing],
)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,hillcrest memphis lp,4155 east wind dr 1,IHE0000A000000,0790040000001C,10/01/2008,09/30/2028,53193,16412,0,-36781
1,alco pershing partners lp,0 steele,IHE0000B000000,"07101400000950, 07101400000960",08/01/2008,12/31/2028,40315,14754,0,-25562
2,crockett park agnes place apartments,2645 davey drive,IHE0001B000000,"0710560A00016C, 0710560A00046Z",05/01/2005,05/01/2025,60985,187,0,-60797
3,uptown village apartments lp,0 thomas,IHE0001L000000,"0010960A000960, 0010960A000970",03/05/2004,03/04/2024,48181,1984,0,-46197
4,uptown senior housing development lp,669 third,IHE0001R000000,"0010560000002C, 00105600000070",04/19/2006,04/18/2026,41349,2175,0,-39174
...,...,...,...,...,...,...,...,...,...,...
102,shelby pointe townhomes llc,1500 east shelby drive,IHE0007K000000,0790500A00025,04/14/2022,04/13/2042,46330,16628,0,-29702
103,sc bartlett investors llc,2171 sycamore view rd,IHE0007L000000,088041000017C,03/01/2022,02/28/2042,64513,27042,0,-37471
104,app breezy partners l l l p,1500 oberle ave,IHE0007N000000,070037000041C,09/01/2022,08/31/2042,55026,9196,0,-45830
105,south city v,457 brightside,IHE0007O000000,702000000290,09/01/2022,08/31/2042,1832,765,0,-1067


In [32]:
# Clean the date columns

# Because this data has an idiosyncratic "end date" value of "End of Loan", 
# we must define a dataset-specific function
def cast_end_date_to_datetime(entry: str | Number):
    try:
        return pd.to_datetime(str(entry))
    except ValueError:
        return entry
# begin_date is converted as a whole column in one call
df["begin_date"] = pd.to_datetime(df["begin_date"])
# end_date is converted entry by entry with the helper, which returns
# entries it cannot parse unchanged
df["end_date"] = df["end_date"].apply(cast_end_date_to_datetime)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,hillcrest memphis lp,4155 east wind dr 1,IHE0000A000000,0790040000001C,2008-10-01,2028-09-30,53193,16412,0,-36781
1,alco pershing partners lp,0 steele,IHE0000B000000,"07101400000950, 07101400000960",2008-08-01,2028-12-31,40315,14754,0,-25562
2,crockett park agnes place apartments,2645 davey drive,IHE0001B000000,"0710560A00016C, 0710560A00046Z",2005-05-01,2025-05-01,60985,187,0,-60797
3,uptown village apartments lp,0 thomas,IHE0001L000000,"0010960A000960, 0010960A000970",2004-03-05,2024-03-04,48181,1984,0,-46197
4,uptown senior housing development lp,669 third,IHE0001R000000,"0010560000002C, 00105600000070",2006-04-19,2026-04-18,41349,2175,0,-39174
...,...,...,...,...,...,...,...,...,...,...
102,shelby pointe townhomes llc,1500 east shelby drive,IHE0007K000000,0790500A00025,2022-04-14,2042-04-13,46330,16628,0,-29702
103,sc bartlett investors llc,2171 sycamore view rd,IHE0007L000000,088041000017C,2022-03-01,2042-02-28,64513,27042,0,-37471
104,app breezy partners l l l p,1500 oberle ave,IHE0007N000000,070037000041C,2022-09-01,2042-08-31,55026,9196,0,-45830
105,south city v,457 brightside,IHE0007O000000,702000000290,2022-09-01,2042-08-31,1832,765,0,-1067


In [33]:
# Clean the list columns

# Pandas always reads lists in as strings, so these columns will ultimately be
# converted to a string type. Putting the values in lists first means the list
# brackets are included in the output string, which lets us convert them back
# to lists more quickly the next time we read the file in.

list_columns = ["pilot_parcel_no", "underlying_parcel_number"]
for col in list_columns:
    # Split on commas and strip each piece. Multi-valued cells in the raw data
    # are separated by ", " (comma + space), so a bare split(",") would leave a
    # leading space on every element after the first.
    df[col] = df[col].apply(
        lambda x: [part.strip() for part in str(x).split(",")]
    )
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,hillcrest memphis lp,4155 east wind dr 1,[IHE0000A000000],[0790040000001C],2008-10-01,2028-09-30,53193,16412,0,-36781
1,alco pershing partners lp,0 steele,[IHE0000B000000],"[07101400000950, 07101400000960]",2008-08-01,2028-12-31,40315,14754,0,-25562
2,crockett park agnes place apartments,2645 davey drive,[IHE0001B000000],"[0710560A00016C, 0710560A00046Z]",2005-05-01,2025-05-01,60985,187,0,-60797
3,uptown village apartments lp,0 thomas,[IHE0001L000000],"[0010960A000960, 0010960A000970]",2004-03-05,2024-03-04,48181,1984,0,-46197
4,uptown senior housing development lp,669 third,[IHE0001R000000],"[0010560000002C, 00105600000070]",2006-04-19,2026-04-18,41349,2175,0,-39174
...,...,...,...,...,...,...,...,...,...,...
102,shelby pointe townhomes llc,1500 east shelby drive,[IHE0007K000000],[0790500A00025],2022-04-14,2042-04-13,46330,16628,0,-29702
103,sc bartlett investors llc,2171 sycamore view rd,[IHE0007L000000],[088041000017C],2022-03-01,2042-02-28,64513,27042,0,-37471
104,app breezy partners l l l p,1500 oberle ave,[IHE0007N000000],[070037000041C],2022-09-01,2042-08-31,55026,9196,0,-45830
105,south city v,457 brightside,[IHE0007O000000],[702000000290],2022-09-01,2042-08-31,1832,765,0,-1067


In [34]:
# Build a unique index column from the values in the identifying column(s).
# Note: per the original author, the ids come from a deterministic hash, so they
# are reproducible across users and runs as long as the string in the
# identifying column is the same.

identifying_columns = ["owner"]
index_name = "owner_id"
df = set_unique_index(
    df=df,
    columns_to_hash=identifying_columns,
    index_length_limit=12,
    index_name=index_name,
)
df

Unnamed: 0_level_0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
owner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7a1798f03c29,hillcrest memphis lp,4155 east wind dr 1,[IHE0000A000000],[0790040000001C],2008-10-01,2028-09-30,53193,16412,0,-36781
ef79e221c568,alco pershing partners lp,0 steele,[IHE0000B000000],"[07101400000950, 07101400000960]",2008-08-01,2028-12-31,40315,14754,0,-25562
27185d035674,crockett park agnes place apartments,2645 davey drive,[IHE0001B000000],"[0710560A00016C, 0710560A00046Z]",2005-05-01,2025-05-01,60985,187,0,-60797
3405072585e4,uptown village apartments lp,0 thomas,[IHE0001L000000],"[0010960A000960, 0010960A000970]",2004-03-05,2024-03-04,48181,1984,0,-46197
0dedd00443cc,uptown senior housing development lp,669 third,[IHE0001R000000],"[0010560000002C, 00105600000070]",2006-04-19,2026-04-18,41349,2175,0,-39174
...,...,...,...,...,...,...,...,...,...,...
956d2720b6be,shelby pointe townhomes llc,1500 east shelby drive,[IHE0007K000000],[0790500A00025],2022-04-14,2042-04-13,46330,16628,0,-29702
f5eb91724f5a,sc bartlett investors llc,2171 sycamore view rd,[IHE0007L000000],[088041000017C],2022-03-01,2042-02-28,64513,27042,0,-37471
6490986a08d8,app breezy partners l l l p,1500 oberle ave,[IHE0007N000000],[070037000041C],2022-09-01,2042-08-31,55026,9196,0,-45830
437bd4e14aac,south city v,457 brightside,[IHE0007O000000],[702000000290],2022-09-01,2042-08-31,1832,765,0,-1067


In [35]:
# Check the data types of each variable before casting
# (the output below shows the cleaned amount columns are still `object` at this point)
df.dtypes

owner                               object
location                            object
pilot_parcel_no                     object
underlying_parcel_number            object
begin_date                  datetime64[ns]
end_date                    datetime64[ns]
assmt_taxes                         object
pilot_contract_amt                  object
pilot_debt_service_amt              object
difference                          object
dtype: object

In [36]:
# Cast every column to its intended data type.
# end_date stays 'object' here rather than being forced to a datetime type.

names_to_types = {
    'owner': 'string',
    'location': 'string',
    'pilot_parcel_no': 'string',
    'underlying_parcel_number': 'string',
    'begin_date': 'datetime64[ns]',
    'end_date': 'object',
    'assmt_taxes': 'float64',
    'pilot_contract_amt': 'float64',
    'pilot_debt_service_amt': 'float64',
    'difference': 'float64',
}
df = cast_data_types(df, names_to_types)
# Confirm the casts took effect
df.dtypes

owner                       string[python]
location                    string[python]
pilot_parcel_no             string[python]
underlying_parcel_number    string[python]
begin_date                  datetime64[ns]
end_date                            object
assmt_taxes                        float64
pilot_contract_amt                 float64
pilot_debt_service_amt             float64
difference                         float64
dtype: object

In [37]:
# Save the now-clean csv and convert it to utf-8 format
outfile_path = "../data/processed/s6_2022_ihe.csv"
# to_csv is called with its defaults, so the dataframe index is written as the first column
df.to_csv(outfile_path)
# Source and destination are the same path, so the saved file is converted in place.
# NOTE(review): convert_utf8 is not defined in this notebook (presumably from utils) — confirm it is safe with identical paths
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)