In [19]:
import pandas as pd
from utils import *
from typing import Any

<h1>Cleaning a CSV</h1>

Read in a CSV

In [20]:
# Read the raw PILOT extension fund contracts export into a DataFrame
infile_path = (
    "../data/raw/"
    "silver_shelby_county_properties_municipality_breakdown - s14_pilot_extension_fund_contracts.csv"
)
df = pd.read_csv(infile_path)
df

Unnamed: 0,Project Name,Owner,PILOT Parcel No,Underlying Parcel Number,Begin Date,End Date,2022 Extended PILOT Amount Billed,2022 Extended Pilot Amount Paid to Bank trustee
0,Cotton Exchange,"Cotton Exchange Building, A Limited Partnership",ICC0027L000000,204200000290,12/01/1983,12/01/2024,"$49,536","$48,545"
1,Three Sisters Building,"Three Sisters, Ltd.",ICC0029L000000,"00203900000140, 00203900000150, 00203900000160",12/23/1983,12/30/2025,27615,27063
2,Morgan Keegan Tower,Raymond James Tower,ICC0041L000000,200600000240,12/27/1984,12/27/2024,102367,100320
3,Winchester Building,"SUNA Winchester, LLC",ICC0047L000000,"00206000000010, 0020600000001Z",12/28/1984,12/28/2024,11611,-
4,Brinkley Plaza,Olymbec USA LLC,ICC0048L000000,"0020380000008C, 0020380000011C",12/31/1984,12/31/2024,76714,-
5,William Len Building,"Main Monroe Hospitality 2018, LLC",ICC0045L000000,205700000090,12/31/1984,12/31/2024,82342,80695
6,Lowenstein Garage,"Memphis 99 Parking Garage, LP",ICC0050L000000,200700000080,07/25/1985,07/25/2025,12981,12721
7,Autozone Headquarters,"AutoZone, Inc.",ICC0105L000000,0020440000004C,09/01/1993,09/01/2033,320531,314120
8,Peabody Place Gold,"Peabody Place-Gold, LP",ICC0243L000000,"00204300000010, 00204300000020, 00204300000030...",08/18/1994,08/18/2034,150901,147883
9,Peabody Place Haverty's,"Peabody Place-Hav, LP",ICC0244L000000,0020430000006C,08/18/1994,08/18/2034,44031,43150


In [21]:
# Normalise the column headers by running them through the header cleaners, in this order
header_cleaners = [remove_special_for_words, truncate, snake_case, lower_case]
df = clean_headers(df, header_cleaners)
df.columns

Index(['project_name', 'owner', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', '2022_extended_pilot_amount_billed',
       '2022_extended_pilot_amount_paid_to_bank_trustee'],
      dtype='object')

In [22]:
# Normalise the free-text columns with the word-oriented cleaners
string_columns = ["owner", "project_name"]
text_cleaners = [remove_special_for_words, truncate, lower_case]
df = clean_columns(df, string_columns, text_cleaners)
df

Unnamed: 0,project_name,owner,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,2022_extended_pilot_amount_billed,2022_extended_pilot_amount_paid_to_bank_trustee
0,cotton exchange,cotton exchange building a limited partnership,ICC0027L000000,204200000290,12/01/1983,12/01/2024,"$49,536","$48,545"
1,three sisters building,three sisters ltd,ICC0029L000000,"00203900000140, 00203900000150, 00203900000160",12/23/1983,12/30/2025,27615,27063
2,morgan keegan tower,raymond james tower,ICC0041L000000,200600000240,12/27/1984,12/27/2024,102367,100320
3,winchester building,suna winchester llc,ICC0047L000000,"00206000000010, 0020600000001Z",12/28/1984,12/28/2024,11611,-
4,brinkley plaza,olymbec usa llc,ICC0048L000000,"0020380000008C, 0020380000011C",12/31/1984,12/31/2024,76714,-
5,william len building,main monroe hospitality 2018 llc,ICC0045L000000,205700000090,12/31/1984,12/31/2024,82342,80695
6,lowenstein garage,memphis 99 parking garage lp,ICC0050L000000,200700000080,07/25/1985,07/25/2025,12981,12721
7,autozone headquarters,autozone inc,ICC0105L000000,0020440000004C,09/01/1993,09/01/2033,320531,314120
8,peabody place gold,peabody place gold lp,ICC0243L000000,"00204300000010, 00204300000020, 00204300000030...",08/18/1994,08/18/2034,150901,147883
9,peabody place haverty s,peabody place hav lp,ICC0244L000000,0020430000006C,08/18/1994,08/18/2034,44031,43150


In [23]:
# Re-display the column names (handy for picking out the numeric columns in the next step)
df.columns

Index(['project_name', 'owner', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', '2022_extended_pilot_amount_billed',
       '2022_extended_pilot_amount_paid_to_bank_trustee'],
      dtype='object')

In [24]:
# Clean the dollar-amount columns with the number-oriented cleaners
numeric_columns = [
    '2022_extended_pilot_amount_billed',
    '2022_extended_pilot_amount_paid_to_bank_trustee',
]
number_cleaners = [remove_special_for_numbers, convert_only_dash_to_missing]
df = clean_columns(df, numeric_columns, number_cleaners)

df

Unnamed: 0,project_name,owner,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,2022_extended_pilot_amount_billed,2022_extended_pilot_amount_paid_to_bank_trustee
0,cotton exchange,cotton exchange building a limited partnership,ICC0027L000000,204200000290,12/01/1983,12/01/2024,49536,48545.0
1,three sisters building,three sisters ltd,ICC0029L000000,"00203900000140, 00203900000150, 00203900000160",12/23/1983,12/30/2025,27615,27063.0
2,morgan keegan tower,raymond james tower,ICC0041L000000,200600000240,12/27/1984,12/27/2024,102367,100320.0
3,winchester building,suna winchester llc,ICC0047L000000,"00206000000010, 0020600000001Z",12/28/1984,12/28/2024,11611,
4,brinkley plaza,olymbec usa llc,ICC0048L000000,"0020380000008C, 0020380000011C",12/31/1984,12/31/2024,76714,
5,william len building,main monroe hospitality 2018 llc,ICC0045L000000,205700000090,12/31/1984,12/31/2024,82342,80695.0
6,lowenstein garage,memphis 99 parking garage lp,ICC0050L000000,200700000080,07/25/1985,07/25/2025,12981,12721.0
7,autozone headquarters,autozone inc,ICC0105L000000,0020440000004C,09/01/1993,09/01/2033,320531,314120.0
8,peabody place gold,peabody place gold lp,ICC0243L000000,"00204300000010, 00204300000020, 00204300000030...",08/18/1994,08/18/2034,150901,147883.0
9,peabody place haverty s,peabody place hav lp,ICC0244L000000,0020430000006C,08/18/1994,08/18/2034,44031,43150.0


In [25]:
# Clean the date columns

# Because this data has an idiosyncratic "end date" value of "End of Loan", 
# we must define a dataset-specific function
def cast_end_date_to_datetime(entry: str | Number):
    try:
        return pd.to_datetime(str(entry))
    except ValueError:
        return entry
# begin_date is parsed as a whole column in one call
begin_dates = df["begin_date"]
df["begin_date"] = pd.to_datetime(begin_dates)

# end_date is parsed entry by entry so unparseable values are kept as they are
end_dates = df["end_date"]
df["end_date"] = end_dates.apply(cast_end_date_to_datetime)
df

Unnamed: 0,project_name,owner,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,2022_extended_pilot_amount_billed,2022_extended_pilot_amount_paid_to_bank_trustee
0,cotton exchange,cotton exchange building a limited partnership,ICC0027L000000,204200000290,1983-12-01,2024-12-01,49536,48545.0
1,three sisters building,three sisters ltd,ICC0029L000000,"00203900000140, 00203900000150, 00203900000160",1983-12-23,2025-12-30,27615,27063.0
2,morgan keegan tower,raymond james tower,ICC0041L000000,200600000240,1984-12-27,2024-12-27,102367,100320.0
3,winchester building,suna winchester llc,ICC0047L000000,"00206000000010, 0020600000001Z",1984-12-28,2024-12-28,11611,
4,brinkley plaza,olymbec usa llc,ICC0048L000000,"0020380000008C, 0020380000011C",1984-12-31,2024-12-31,76714,
5,william len building,main monroe hospitality 2018 llc,ICC0045L000000,205700000090,1984-12-31,2024-12-31,82342,80695.0
6,lowenstein garage,memphis 99 parking garage lp,ICC0050L000000,200700000080,1985-07-25,2025-07-25,12981,12721.0
7,autozone headquarters,autozone inc,ICC0105L000000,0020440000004C,1993-09-01,2033-09-01,320531,314120.0
8,peabody place gold,peabody place gold lp,ICC0243L000000,"00204300000010, 00204300000020, 00204300000030...",1994-08-18,2034-08-18,150901,147883.0
9,peabody place haverty s,peabody place hav lp,ICC0244L000000,0020430000006C,1994-08-18,2034-08-18,44031,43150.0


In [26]:
# Clean the list columns

# pandas reads list-valued cells back in as plain strings, so these columns are
# ultimately stored as strings. Splitting them into real lists first means the
# saved string includes the list brackets, which makes it quicker to convert
# the values back into lists the next time the file is read.

list_columns = ["pilot_parcel_no", "underlying_parcel_number"]
for col in list_columns:
    # The raw values are separated by ", ", so strip every piece: a bare
    # split(",") would leave a leading space on each item after the first.
    # NOTE: str(x) turns a missing value into the literal text "nan".
    df[col] = df[col].apply(
        lambda x: [item.strip() for item in str(x).split(",")]
    )
df

Unnamed: 0,project_name,owner,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,2022_extended_pilot_amount_billed,2022_extended_pilot_amount_paid_to_bank_trustee
0,cotton exchange,cotton exchange building a limited partnership,[ICC0027L000000],[204200000290],1983-12-01,2024-12-01,49536,48545.0
1,three sisters building,three sisters ltd,[ICC0029L000000],"[00203900000140, 00203900000150, 00203900000...",1983-12-23,2025-12-30,27615,27063.0
2,morgan keegan tower,raymond james tower,[ICC0041L000000],[200600000240],1984-12-27,2024-12-27,102367,100320.0
3,winchester building,suna winchester llc,[ICC0047L000000],"[00206000000010, 0020600000001Z]",1984-12-28,2024-12-28,11611,
4,brinkley plaza,olymbec usa llc,[ICC0048L000000],"[0020380000008C, 0020380000011C]",1984-12-31,2024-12-31,76714,
5,william len building,main monroe hospitality 2018 llc,[ICC0045L000000],[205700000090],1984-12-31,2024-12-31,82342,80695.0
6,lowenstein garage,memphis 99 parking garage lp,[ICC0050L000000],[200700000080],1985-07-25,2025-07-25,12981,12721.0
7,autozone headquarters,autozone inc,[ICC0105L000000],[0020440000004C],1993-09-01,2033-09-01,320531,314120.0
8,peabody place gold,peabody place gold lp,[ICC0243L000000],"[00204300000010, 00204300000020, 00204300000...",1994-08-18,2034-08-18,150901,147883.0
9,peabody place haverty s,peabody place hav lp,[ICC0244L000000],[0020430000006C],1994-08-18,2034-08-18,44031,43150.0


In [27]:
# Replace the index with an id derived from the values in the identifying column(s).
# Note: the ids are based on a deterministic hash, so they are reproducible
# across users and runs as long as the string in the identifying column is unchanged.

index_name = "owner_id"
identifying_columns = ["owner"]
df = set_unique_index(
    df=df,
    columns_to_hash=identifying_columns,
    index_name=index_name,
    index_length_limit=12,
)
df

Unnamed: 0_level_0,project_name,owner,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,2022_extended_pilot_amount_billed,2022_extended_pilot_amount_paid_to_bank_trustee
owner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8774152c7d52,cotton exchange,cotton exchange building a limited partnership,[ICC0027L000000],[204200000290],1983-12-01,2024-12-01,49536,48545.0
6ddf11772c54,three sisters building,three sisters ltd,[ICC0029L000000],"[00203900000140, 00203900000150, 00203900000...",1983-12-23,2025-12-30,27615,27063.0
1a5489fb092f,morgan keegan tower,raymond james tower,[ICC0041L000000],[200600000240],1984-12-27,2024-12-27,102367,100320.0
0cac10103518,winchester building,suna winchester llc,[ICC0047L000000],"[00206000000010, 0020600000001Z]",1984-12-28,2024-12-28,11611,
567dcc9200e4,brinkley plaza,olymbec usa llc,[ICC0048L000000],"[0020380000008C, 0020380000011C]",1984-12-31,2024-12-31,76714,
08fbc058accd,william len building,main monroe hospitality 2018 llc,[ICC0045L000000],[205700000090],1984-12-31,2024-12-31,82342,80695.0
372de50563c7,lowenstein garage,memphis 99 parking garage lp,[ICC0050L000000],[200700000080],1985-07-25,2025-07-25,12981,12721.0
896f604494cb,autozone headquarters,autozone inc,[ICC0105L000000],[0020440000004C],1993-09-01,2033-09-01,320531,314120.0
bb45f8c57623,peabody place gold,peabody place gold lp,[ICC0243L000000],"[00204300000010, 00204300000020, 00204300000...",1994-08-18,2034-08-18,150901,147883.0
7bfab3f25967,peabody place haverty s,peabody place hav lp,[ICC0244L000000],[0020430000006C],1994-08-18,2034-08-18,44031,43150.0


In [28]:
# Check the data types of each variable
# (shows the dtype pandas currently holds for every column, before any explicit casting)
df.dtypes

project_name                                               object
owner                                                      object
pilot_parcel_no                                            object
underlying_parcel_number                                   object
begin_date                                         datetime64[ns]
end_date                                           datetime64[ns]
2022_extended_pilot_amount_billed                          object
2022_extended_pilot_amount_paid_to_bank_trustee            object
dtype: object

In [29]:
# Fix the data type of each variable

# Build the column -> dtype mapping by grouping columns that share a target dtype;
# the resulting key order is the same as the column order of the frame.
names_to_types = {
    **dict.fromkeys(
        ['project_name', 'owner', 'pilot_parcel_no', 'underlying_parcel_number'],
        'string',
    ),
    'begin_date': 'datetime64[ns]',
    'end_date': 'object',
    **dict.fromkeys(
        ['2022_extended_pilot_amount_billed',
         '2022_extended_pilot_amount_paid_to_bank_trustee'],
        'float64',
    ),
}
df = cast_data_types(df, names_to_types)
df.dtypes

project_name                                       string[python]
owner                                              string[python]
pilot_parcel_no                                    string[python]
underlying_parcel_number                           string[python]
begin_date                                         datetime64[ns]
end_date                                                   object
2022_extended_pilot_amount_billed                         float64
2022_extended_pilot_amount_paid_to_bank_trustee           float64
dtype: object

In [30]:
# Write the cleaned frame to the processed-data folder, then run the UTF-8
# conversion on that file, using the same path for both input and output
outfile_path = "../data/processed/s14_pilot_extension_fund_contracts.csv"
df.to_csv(outfile_path)
convert_utf8(new_file_path=outfile_path, original_file_path=outfile_path)