In [1]:
from typing import Any, Optional

import pandas as pd
from utils import *



<h1>Cleaning a CSV</h1>

Read in a CSV

In [2]:
# Load the raw delinquent-notices CSV into a dataframe.
# The path is relative, so it resolves against the notebook's working directory.
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s_15_in_lieu_delinquent_notices.csv"
df = pd.read_csv(infile_path)
df

Unnamed: 0,Board,Owner,Delinquent Years,Amount Due,Parcel No,PILOT Terminated
0,BARLETT INDUSTRIAL DEVELOPMENT BOARD,,,,,
1,COLLIERVILLE INDUSTRIAL DEVELOPMENT BOARD,"SHELBY GROUP INTERNATIONAL, INC",2022,1842.58,IDB0000C00015A,NO
2,GERMANTOWN INDUSTRIAL DEVELOPMENT BOARD,,,,,
3,EDGE BOARD OF MEMPHIS AND SHELBY COUNTY,AESTHETIC MANAGEMENT PARTNERS INC,2022,10558.52,IDB18370000000,NO
4,EDGE BOARD OF MEMPHIS AND SHELBY COUNTY,"AGILENT TECHNOLOGIES, INC",2022,3757.41,IDB1775A000000,NO
...,...,...,...,...,...,...
68,"HEALTH, EDUCATIONAL, AND HOUSING FACILITIES BO...",LEVI LIMITED PARTNERSHIP,2012-2014,24359.23,IHE0001M000000,YES
69,"HEALTH, EDUCATIONAL, AND HOUSING FACILITIES BO...",LYONS RIDGE APARTMENTS LP,2012-2016,"2 13,137.51",IHE0003B000000,YES
70,"HEALTH, EDUCATIONAL, AND HOUSING FACILITIES BO...",VILLAGE PARKWAY LP,2012-2016,539119.49,IHE0002Z000000,YES
71,"EALTH, EDUCATIONAL, AND HOUSING FACILITIES BOA...",,,,,


In [3]:
# Clean the column headers.
# NOTE(review): clean_headers and the transform functions passed to it are not
# defined in this notebook (presumably they come from `utils`) — confirm their
# exact behavior and application order there.
df = clean_headers(df, [remove_special_for_words, truncate, snake_case, lower_case])
df.columns

Index(['board', 'owner', 'delinquent_years', 'amount_due', 'parcel_no',
       'pilot_terminated'],
      dtype='object')

In [4]:
# Clean the string columns.
# Only the free-text columns are listed here; the remaining columns are handled
# by the dedicated cells that follow.
string_columns = ["board", 
                  "owner"]
df = clean_columns(df, string_columns, [remove_special_for_words, truncate, lower_case])
df

Unnamed: 0,board,owner,delinquent_years,amount_due,parcel_no,pilot_terminated
0,barlett industrial development board,,,,,
1,collierville industrial development board,shelby group international inc,2022,1842.58,IDB0000C00015A,NO
2,germantown industrial development board,,,,,
3,edge board of memphis and shelby county,aesthetic management partners inc,2022,10558.52,IDB18370000000,NO
4,edge board of memphis and shelby county,agilent technologies inc,2022,3757.41,IDB1775A000000,NO
...,...,...,...,...,...,...
68,health educational and housing facilities boar...,levi limited partnership,2012-2014,24359.23,IHE0001M000000,YES
69,health educational and housing facilities boar...,lyons ridge apartments lp,2012-2016,"2 13,137.51",IHE0003B000000,YES
70,health educational and housing facilities boar...,village parkway lp,2012-2016,539119.49,IHE0002Z000000,YES
71,ealth educational and housing facilities board...,,,,,


In [5]:
# Re-display the column names after cleaning the string columns
df.columns

Index(['board', 'owner', 'delinquent_years', 'amount_due', 'parcel_no',
       'pilot_terminated'],
      dtype='object')

In [6]:
# Clean the numeric columns.
# NOTE(review): remove_special_for_numbers and convert_only_dash_to_missing are
# defined outside this notebook — confirm their exact rules there.
# The column is not converted to a numeric dtype here; that happens in the
# data-type cell further down.
numeric_columns = ['amount_due']
df = clean_columns(df, numeric_columns, [remove_special_for_numbers, convert_only_dash_to_missing])

df

Unnamed: 0,board,owner,delinquent_years,amount_due,parcel_no,pilot_terminated
0,barlett industrial development board,,,,,
1,collierville industrial development board,shelby group international inc,2022,1842.58,IDB0000C00015A,NO
2,germantown industrial development board,,,,,
3,edge board of memphis and shelby county,aesthetic management partners inc,2022,10558.52,IDB18370000000,NO
4,edge board of memphis and shelby county,agilent technologies inc,2022,3757.41,IDB1775A000000,NO
...,...,...,...,...,...,...
68,health educational and housing facilities boar...,levi limited partnership,2012-2014,24359.23,IHE0001M000000,YES
69,health educational and housing facilities boar...,lyons ridge apartments lp,2012-2016,213137.51,IHE0003B000000,YES
70,health educational and housing facilities boar...,village parkway lp,2012-2016,539119.49,IHE0002Z000000,YES
71,ealth educational and housing facilities board...,,,,,


In [7]:
# Clean the indicator columns
def pilot_terminated_to_indicator(entry: Any) -> Optional[int]:
    """Convert a "YES"/"NO" PILOT-terminated flag into a 0/1 indicator.

    Args:
        entry: Raw cell value. It is stringified first, so non-string values
            (for example NaN from an empty cell) are accepted.

    Returns:
        1 for "YES" and 0 for "NO" (ignoring letter case and surrounding
        whitespace), or None for anything else, including missing values.
    """
    # Normalize so that variants such as " yes" or "No" are still recognized.
    normalized = str(entry).strip().upper()
    if normalized == "NO":
        return 0
    if normalized == "YES":
        return 1
    return None

# Map every flag to its indicator; the function is passed directly since it
# already takes a single cell value.
df["pilot_terminated"] = df["pilot_terminated"].apply(pilot_terminated_to_indicator)
df

Unnamed: 0,board,owner,delinquent_years,amount_due,parcel_no,pilot_terminated
0,barlett industrial development board,,,,,
1,collierville industrial development board,shelby group international inc,2022,1842.58,IDB0000C00015A,0.0
2,germantown industrial development board,,,,,
3,edge board of memphis and shelby county,aesthetic management partners inc,2022,10558.52,IDB18370000000,0.0
4,edge board of memphis and shelby county,agilent technologies inc,2022,3757.41,IDB1775A000000,0.0
...,...,...,...,...,...,...
68,health educational and housing facilities boar...,levi limited partnership,2012-2014,24359.23,IHE0001M000000,1.0
69,health educational and housing facilities boar...,lyons ridge apartments lp,2012-2016,213137.51,IHE0003B000000,1.0
70,health educational and housing facilities boar...,village parkway lp,2012-2016,539119.49,IHE0002Z000000,1.0
71,ealth educational and housing facilities board...,,,,,


In [8]:
# Clean the "delinquent_years" column

# Entries are either a single year ("2022") or an inclusive range of years
# ("2012-2016"), so we define a dataset-specific function to expand them
def delinquent_years_to_list(entry):
    """Expand a delinquent-years entry into an explicit list of years.

    Args:
        entry: Raw cell value: a single year ("2022" or 2022.0), an inclusive
            range written "START-END" (e.g. "2012-2016"), or a missing value.

    Returns:
        A list of ints covering every year from START to END inclusive (a
        one-element list for a single year), or None when the entry is
        missing or blank.

    Raises:
        ValueError: If a part of the entry is not a whole-number year.
    """
    text = str(entry).strip()
    # str() renders missing values (NaN, None, pd.NA, NaT) as these tokens.
    if text.lower() in {"", "nan", "none", "<na>", "nat"}:
        return None

    def _to_year(token):
        # Parse via float so float-typed years such as "2022.0" are accepted,
        # while still rejecting fractional values.
        value = float(token)
        if not value.is_integer():
            raise ValueError(f"invalid year: {token!r}")
        return int(value)

    parts = text.split("-")
    first_year = _to_year(parts[0])
    if len(parts) == 1:
        return [first_year]
    # Only the first two parts are used, as a START-END pair; +1 makes the
    # end year inclusive.
    return list(range(first_year, _to_year(parts[1]) + 1))

# Expand every entry in the column; the function is passed directly since it
# already takes a single cell value.
df["delinquent_years"] = df["delinquent_years"].apply(delinquent_years_to_list)
df
        

Unnamed: 0,board,owner,delinquent_years,amount_due,parcel_no,pilot_terminated
0,barlett industrial development board,,,,,
1,collierville industrial development board,shelby group international inc,[2022],1842.58,IDB0000C00015A,0.0
2,germantown industrial development board,,,,,
3,edge board of memphis and shelby county,aesthetic management partners inc,[2022],10558.52,IDB18370000000,0.0
4,edge board of memphis and shelby county,agilent technologies inc,[2022],3757.41,IDB1775A000000,0.0
...,...,...,...,...,...,...
68,health educational and housing facilities boar...,levi limited partnership,"[2012, 2013, 2014]",24359.23,IHE0001M000000,1.0
69,health educational and housing facilities boar...,lyons ridge apartments lp,"[2012, 2013, 2014, 2015, 2016]",213137.51,IHE0003B000000,1.0
70,health educational and housing facilities boar...,village parkway lp,"[2012, 2013, 2014, 2015, 2016]",539119.49,IHE0002Z000000,1.0
71,ealth educational and housing facilities board...,,,,,


In [21]:
# Create identifying columns

def set_unique_index(df: pd.DataFrame,
                     columns_to_hash: list[str],
                     index_name="id",
                     index_length_limit: Optional[int] = None):
    """Add an id column derived from the values of the given columns.

    For each row, the values of ``columns_to_hash`` are stringified and
    concatenated (no separator), passed to ``deterministic_uuid`` (defined
    outside this cell), and the result is sliced to ``index_length_limit``.

    Args:
        df: Frame to annotate. It is modified in place.
        columns_to_hash: Columns whose row values feed the id.
        index_name: Name of the column that receives the ids.
        index_length_limit: Upper bound of the slice applied to each id;
            None keeps the full value.

    Returns:
        The same ``df`` object, with the ``index_name`` column set.
    """
    row_ids = []
    for row_values in df[columns_to_hash].values:
        # Missing values are stringified like any other value, so rows that
        # share the same inputs (including all-missing) share the same id.
        hash_input = ''.join(str(value) for value in row_values)
        row_ids.append(deterministic_uuid(hash_input)[0:index_length_limit])
    df[index_name] = row_ids
    return df

# Derive 12-character ids for boards and owners from their cleaned names.
# Rows with a missing owner all receive the same owner_id, because the missing
# value is stringified before hashing.
df = set_unique_index(df=df, columns_to_hash=["board"], index_length_limit=12, index_name="board_id")
df = set_unique_index(df=df, columns_to_hash=["owner"], index_length_limit=12, index_name="owner_id")
# Reassign rather than using inplace=True so the cell reads as a plain
# transformation. board_id is NOT unique per row: one board can have many owners.
df = df.set_index("board_id")
df


Unnamed: 0_level_0,board,owner,delinquent_years,amount_due,parcel_no,pilot_terminated,owner_id
board_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9929db2d3885,barlett industrial development board,,,,,,43b08d3d1b7d
b8e456900053,collierville industrial development board,shelby group international inc,[2022],1842.58,IDB0000C00015A,0.0,8a9611150e92
bb500062bbfe,germantown industrial development board,,,,,,43b08d3d1b7d
a2a30e152a39,edge board of memphis and shelby county,aesthetic management partners inc,[2022],10558.52,IDB18370000000,0.0,56c594d36bc2
a2a30e152a39,edge board of memphis and shelby county,agilent technologies inc,[2022],3757.41,IDB1775A000000,0.0,5b622a63ecad
...,...,...,...,...,...,...,...
53c1afb3aebe,health educational and housing facilities boar...,levi limited partnership,"[2012, 2013, 2014]",24359.23,IHE0001M000000,1.0,9ad2883c08e5
53c1afb3aebe,health educational and housing facilities boar...,lyons ridge apartments lp,"[2012, 2013, 2014, 2015, 2016]",213137.51,IHE0003B000000,1.0,0cd638e42d40
53c1afb3aebe,health educational and housing facilities boar...,village parkway lp,"[2012, 2013, 2014, 2015, 2016]",539119.49,IHE0002Z000000,1.0,0d7c83819ea5
efdc8a56f149,ealth educational and housing facilities board...,,,,,,43b08d3d1b7d


In [22]:
# Check the data types of each variable
# Inspect the current dtypes before they are cast explicitly in the next step
df.dtypes

board                object
owner                object
delinquent_years     object
amount_due           object
parcel_no            object
pilot_terminated    float64
owner_id             object
dtype: object

In [27]:
# Fix the data type of each variable

# Map each column to its target dtype. "Int64" (capital I) is pandas' nullable
# integer dtype, used for pilot_terminated because that column has missing values.
# NOTE(review): delinquent_years holds Python lists at this point; casting it to
# "string" presumably stores each list's text representation — confirm against
# cast_data_types, which is defined outside this notebook.
names_to_types = {'board': "string",
                  'owner': "string",
                  'delinquent_years': "string",
                  'amount_due': "float64",
                  'parcel_no': "string",
                  'pilot_terminated': "Int64",
                  'owner_id': "string"
                  }

df = cast_data_types(df, names_to_types)
df.dtypes

board               string[python]
owner               string[python]
delinquent_years    string[python]
amount_due                 float64
parcel_no           string[python]
pilot_terminated             Int64
owner_id            string[python]
dtype: object

In [28]:
# Save the now-clean csv and convert it to utf-8 format
outfile_path = "../data/processed/s_15_in_lieu_delinquent_notices.csv"
# The dataframe index is written as the first CSV column (to_csv's default).
df.to_csv(outfile_path)
# NOTE(review): DataFrame.to_csv already encodes as UTF-8 by default, so this
# conversion (which reads and writes the same path) may be redundant — confirm
# what convert_utf8 does before removing it.
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)