In [14]:
import pandas as pd
from utils import *
from typing import Any

<h1>Cleaning a CSV</h1>

Read in a CSV

In [15]:
# Relative location of the raw contracts export (resolved against the
# kernel's current working directory).
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s8_2022_imc_contracts.csv"

# Parse the CSV with pandas' default options and display the resulting frame.
df = pd.read_csv(filepath_or_buffer=infile_path)
df

Unnamed: 0,Owner,Location,PILOT Parcel No,Underlying Parcel Number,Begin Date,End Date,Assmt Taxes,PILOT Contract Amt,PILOT Debt Service Amt,Difference
0,"NATIONAL CHURCH RESIDENCES OF MEMPHIS, TN",5819 NEWBERRY,IMC00000000010,0934000000224C,07/17/2009,End of Loan,"$48,557","$2,940",$0,"-$45,617"
1,BEARMAN-GOLDEN GARDENS INC,4755 NEELY,IMC00000000020,7608900000760,11/20/2009,End of Loan,"$41,933","$3,300",$0,"-$38,633"
2,MEMPHIS VOLUNTEERS OF AMERICA ELDERLY HOUSING INC,3815 AUSTIN PEAY,IMC00000000030,8601500000770,04/21/2010,End of Loan,"$62,055","$2,700",$0,"-$59,355"
3,C E WARE TOWERS INC,3571 HWY 61 S,IMC00000000050,7507400000750,10/01/2008,End of Loan,"$20,937","$3,300",$0,"-$17,637"
4,CALDWELL HOUSING CORP,2310 ARDMORE,IMC00000000060,7209400000750,04/23/2011,End of Loan,"$11,108",$840,$0,"-$10,268"
5,WILLOW CREEK HOUSING INC,3840 COVINGTON PIKE,IMC00000000070,0870780000009C,09/06/2011,End of Loan,"$99,252","$13,920",$0,"-$85,332"
6,CHARIS ACRES,2131 WILSON,IMC00000000080,0790870000058C,12/18/2012,End of Loan,"$27,192","$1,740",$0,"-$25,452"
7,CAAP PLACE OF HOPE INC,1347 FERGUSON,IMC00000000090,0601620000005C,09/03/2010,End of Loan,"$11,153",$900,$0,"-$10,253"
8,ALPHA OMEGA VETERANS SERVICES INC,1465 COURT,IMC00000000100,0170220000007C,08/10/2012,End of Loan,"$21,776","$1,920",$0,"-$19,856"
9,SHIELD INC,2450 KETCHUM,IMC00000000040,6013700002070,11/18/2008,End of Loan,"$12,950","$3,900",$0,"-$9,050"


In [16]:
# Normalise the column headers with the project's cleaning helpers.
# For this file the result is lower-case snake_case names (see the output
# below, e.g. "PILOT Parcel No" -> "pilot_parcel_no").
# NOTE(review): presumably the cleaners are applied in list order - confirm in utils.
header_cleaners = [remove_special_for_words, truncate, snake_case, lower_case]
df = clean_headers(df, header_cleaners)
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

In [17]:
# Free-text columns: run the word-oriented cleaners over their values.
# In the output below the owner/location values come back lower-cased with
# punctuation such as commas and hyphens removed.
string_columns = ["owner", "location"]
text_cleaners = [remove_special_for_words, truncate, lower_case]
df = clean_columns(df, string_columns, text_cleaners)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,national church residences of memphis tn,5819 newberry,IMC00000000010,0934000000224C,07/17/2009,End of Loan,"$48,557","$2,940",$0,"-$45,617"
1,bearman golden gardens inc,4755 neely,IMC00000000020,7608900000760,11/20/2009,End of Loan,"$41,933","$3,300",$0,"-$38,633"
2,memphis volunteers of america elderly housing inc,3815 austin peay,IMC00000000030,8601500000770,04/21/2010,End of Loan,"$62,055","$2,700",$0,"-$59,355"
3,c e ware towers inc,3571 hwy 61 s,IMC00000000050,7507400000750,10/01/2008,End of Loan,"$20,937","$3,300",$0,"-$17,637"
4,caldwell housing corp,2310 ardmore,IMC00000000060,7209400000750,04/23/2011,End of Loan,"$11,108",$840,$0,"-$10,268"
5,willow creek housing inc,3840 covington pike,IMC00000000070,0870780000009C,09/06/2011,End of Loan,"$99,252","$13,920",$0,"-$85,332"
6,charis acres,2131 wilson,IMC00000000080,0790870000058C,12/18/2012,End of Loan,"$27,192","$1,740",$0,"-$25,452"
7,caap place of hope inc,1347 ferguson,IMC00000000090,0601620000005C,09/03/2010,End of Loan,"$11,153",$900,$0,"-$10,253"
8,alpha omega veterans services inc,1465 court,IMC00000000100,0170220000007C,08/10/2012,End of Loan,"$21,776","$1,920",$0,"-$19,856"
9,shield inc,2450 ketchum,IMC00000000040,6013700002070,11/18/2008,End of Loan,"$12,950","$3,900",$0,"-$9,050"


In [18]:
# Show the current column names for reference before choosing which
# columns to treat as numeric, date, or list-valued.
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

In [19]:
# Currency columns: run the number-oriented cleaners over their values.
# In the output below "$48,557" becomes 48557 and "-$45,617" becomes -45617.
# The values are not cast to a numeric dtype here; that happens in the
# dtype-fixing cell further down.
numeric_columns = [
    "assmt_taxes",
    "pilot_contract_amt",
    "pilot_debt_service_amt",
    "difference",
]
number_cleaners = [remove_special_for_numbers, convert_only_dash_to_missing]
df = clean_columns(df, numeric_columns, number_cleaners)

df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,national church residences of memphis tn,5819 newberry,IMC00000000010,0934000000224C,07/17/2009,End of Loan,48557,2940,0,-45617
1,bearman golden gardens inc,4755 neely,IMC00000000020,7608900000760,11/20/2009,End of Loan,41933,3300,0,-38633
2,memphis volunteers of america elderly housing inc,3815 austin peay,IMC00000000030,8601500000770,04/21/2010,End of Loan,62055,2700,0,-59355
3,c e ware towers inc,3571 hwy 61 s,IMC00000000050,7507400000750,10/01/2008,End of Loan,20937,3300,0,-17637
4,caldwell housing corp,2310 ardmore,IMC00000000060,7209400000750,04/23/2011,End of Loan,11108,840,0,-10268
5,willow creek housing inc,3840 covington pike,IMC00000000070,0870780000009C,09/06/2011,End of Loan,99252,13920,0,-85332
6,charis acres,2131 wilson,IMC00000000080,0790870000058C,12/18/2012,End of Loan,27192,1740,0,-25452
7,caap place of hope inc,1347 ferguson,IMC00000000090,0601620000005C,09/03/2010,End of Loan,11153,900,0,-10253
8,alpha omega veterans services inc,1465 court,IMC00000000100,0170220000007C,08/10/2012,End of Loan,21776,1920,0,-19856
9,shield inc,2450 ketchum,IMC00000000040,6013700002070,11/18/2008,End of Loan,12950,3900,0,-9050


In [20]:
# Clean the date columns

# Because this data has an idiosyncratic "end date" value of "End of Loan", 
# we must define a dataset-specific function
def cast_end_date_to_datetime(entry: str | Number):
    try:
        return pd.to_datetime(str(entry))
    except ValueError:
        return entry
# Every begin_date in this file is written MM/DD/YYYY. Passing the format
# explicitly stops pandas from guessing (ambiguous values such as 10/01/2008
# must be read month-first) and makes a malformed row fail loudly instead of
# being parsed some other way.
df["begin_date"] = pd.to_datetime(df["begin_date"], format="%m/%d/%Y")

# end_date mixes real dates with the text "End of Loan", so it is converted
# value by value; entries that cannot be parsed are kept unchanged.
df["end_date"] = df["end_date"].apply(cast_end_date_to_datetime)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,national church residences of memphis tn,5819 newberry,IMC00000000010,0934000000224C,2009-07-17,End of Loan,48557,2940,0,-45617
1,bearman golden gardens inc,4755 neely,IMC00000000020,7608900000760,2009-11-20,End of Loan,41933,3300,0,-38633
2,memphis volunteers of america elderly housing inc,3815 austin peay,IMC00000000030,8601500000770,2010-04-21,End of Loan,62055,2700,0,-59355
3,c e ware towers inc,3571 hwy 61 s,IMC00000000050,7507400000750,2008-10-01,End of Loan,20937,3300,0,-17637
4,caldwell housing corp,2310 ardmore,IMC00000000060,7209400000750,2011-04-23,End of Loan,11108,840,0,-10268
5,willow creek housing inc,3840 covington pike,IMC00000000070,0870780000009C,2011-09-06,End of Loan,99252,13920,0,-85332
6,charis acres,2131 wilson,IMC00000000080,0790870000058C,2012-12-18,End of Loan,27192,1740,0,-25452
7,caap place of hope inc,1347 ferguson,IMC00000000090,0601620000005C,2010-09-03,End of Loan,11153,900,0,-10253
8,alpha omega veterans services inc,1465 court,IMC00000000100,0170220000007C,2012-08-10,End of Loan,21776,1920,0,-19856
9,shield inc,2450 ketchum,IMC00000000040,6013700002070,2008-11-18,End of Loan,12950,3900,0,-9050


In [21]:
# Clean the list columns

# A CSV cannot store real lists, so these columns are cast to strings further
# down before saving. Wrapping each value in a list first means the saved
# string includes the list brackets, which makes it quicker to turn the
# values back into lists the next time the file is read in.

def _split_on_commas(value) -> list[str]:
    """Return ``str(value)`` split on commas as a list of strings."""
    return str(value).split(",")

list_columns = ["pilot_parcel_no", "underlying_parcel_number"]
for col in list_columns:
    df[col] = df[col].apply(_split_on_commas)
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,national church residences of memphis tn,5819 newberry,[IMC00000000010],[0934000000224C],2009-07-17,End of Loan,48557,2940,0,-45617
1,bearman golden gardens inc,4755 neely,[IMC00000000020],[7608900000760],2009-11-20,End of Loan,41933,3300,0,-38633
2,memphis volunteers of america elderly housing inc,3815 austin peay,[IMC00000000030],[8601500000770],2010-04-21,End of Loan,62055,2700,0,-59355
3,c e ware towers inc,3571 hwy 61 s,[IMC00000000050],[7507400000750],2008-10-01,End of Loan,20937,3300,0,-17637
4,caldwell housing corp,2310 ardmore,[IMC00000000060],[7209400000750],2011-04-23,End of Loan,11108,840,0,-10268
5,willow creek housing inc,3840 covington pike,[IMC00000000070],[0870780000009C],2011-09-06,End of Loan,99252,13920,0,-85332
6,charis acres,2131 wilson,[IMC00000000080],[0790870000058C],2012-12-18,End of Loan,27192,1740,0,-25452
7,caap place of hope inc,1347 ferguson,[IMC00000000090],[0601620000005C],2010-09-03,End of Loan,11153,900,0,-10253
8,alpha omega veterans services inc,1465 court,[IMC00000000100],[0170220000007C],2012-08-10,End of Loan,21776,1920,0,-19856
9,shield inc,2450 ketchum,[IMC00000000040],[6013700002070],2008-11-18,End of Loan,12950,3900,0,-9050


In [22]:
# Replace the default index with an id derived from the identifying column(s).
# Note: the ids are based on a deterministic hash, so they are reproducible
# regardless of who runs the function or when, provided the string in the
# identifying column is the same.

identifying_columns = ["owner"]
index_name = "owner_id"
df = set_unique_index(
    df=df,
    columns_to_hash=identifying_columns,
    index_length_limit=12,
    index_name=index_name,
)
df

Unnamed: 0_level_0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
owner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3c0bb6c74137,national church residences of memphis tn,5819 newberry,[IMC00000000010],[0934000000224C],2009-07-17,End of Loan,48557,2940,0,-45617
74241be1ef2f,bearman golden gardens inc,4755 neely,[IMC00000000020],[7608900000760],2009-11-20,End of Loan,41933,3300,0,-38633
563b8ec3571d,memphis volunteers of america elderly housing inc,3815 austin peay,[IMC00000000030],[8601500000770],2010-04-21,End of Loan,62055,2700,0,-59355
d69383c7a940,c e ware towers inc,3571 hwy 61 s,[IMC00000000050],[7507400000750],2008-10-01,End of Loan,20937,3300,0,-17637
d468d3cc1805,caldwell housing corp,2310 ardmore,[IMC00000000060],[7209400000750],2011-04-23,End of Loan,11108,840,0,-10268
c7c0dea9d02c,willow creek housing inc,3840 covington pike,[IMC00000000070],[0870780000009C],2011-09-06,End of Loan,99252,13920,0,-85332
b77917c2b467,charis acres,2131 wilson,[IMC00000000080],[0790870000058C],2012-12-18,End of Loan,27192,1740,0,-25452
06c969999775,caap place of hope inc,1347 ferguson,[IMC00000000090],[0601620000005C],2010-09-03,End of Loan,11153,900,0,-10253
65793c10abc9,alpha omega veterans services inc,1465 court,[IMC00000000100],[0170220000007C],2012-08-10,End of Loan,21776,1920,0,-19856
0ea022870c29,shield inc,2450 ketchum,[IMC00000000040],[6013700002070],2008-11-18,End of Loan,12950,3900,0,-9050


In [23]:
# Check the data types of each variable.
# In the saved output only begin_date has a specific dtype (datetime64[ns]);
# every other column is still a generic object column and is cast explicitly
# in the next cell.
df.dtypes

owner                               object
location                            object
pilot_parcel_no                     object
underlying_parcel_number            object
begin_date                  datetime64[ns]
end_date                            object
assmt_taxes                         object
pilot_contract_amt                  object
pilot_debt_service_amt              object
difference                          object
dtype: object

In [24]:
# Cast every column to an explicit dtype.

names_to_types = {
    "owner": "string",
    "location": "string",
    # The parcel columns hold Python lists at this point; casting them to
    # "string" is what puts the list brackets into the saved text.
    "pilot_parcel_no": "string",
    "underlying_parcel_number": "string",
    "begin_date": "datetime64[ns]",
    # end_date stays a generic object column because it still contains the
    # text "End of Loan".
    "end_date": "object",
    "assmt_taxes": "float64",
    "pilot_contract_amt": "float64",
    "pilot_debt_service_amt": "float64",
    "difference": "float64",
}
df = cast_data_types(df, names_to_types)
df.dtypes

owner                       string[python]
location                    string[python]
pilot_parcel_no             string[python]
underlying_parcel_number    string[python]
begin_date                  datetime64[ns]
end_date                            object
assmt_taxes                        float64
pilot_contract_amt                 float64
pilot_debt_service_amt             float64
difference                         float64
dtype: object

In [25]:
# Save the now-clean csv and convert it to utf-8 format
# to_csv writes the frame's index as the first column by default, so the
# owner_id index set earlier is kept in the output file.
outfile_path = "../data/processed/s8_2022_imc_contracts.csv"
df.to_csv(outfile_path)
# NOTE(review): convert_utf8 comes from utils and receives the same path as
# both source and destination, so it presumably rewrites the file in place -
# confirm it reads the whole file before opening it for writing.
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)