In [2]:
import pandas as pd
from pandas._libs.internals import defaultdict
from unidecode import unidecode
from collections.abc import Callable
import re
from typing import Optional, Any
import uuid
import random
from numbers import Number

<h1>LOAD CSV</h1>

In [3]:
# load a dataframe
infile_path = "../data/raw/shelby_county_properties_municipality_breakdown - s2_in_lieu_contracts_by_owner.csv"
df = pd.read_csv(infile_path)
df

Unnamed: 0,Owner . .,Location,PILOT (Parcel No),Underlying Parcel Number,Begin Date,End Date,Assmt Taxes,PILOT Contract Amt,PILOT Debt Service Amt,Difference
0,1 SOUTH MAIN LLC - WILMONT HOTEL LIMITED PARTN...,79 MADISON,ICC03900000000,"00203800000020, 00203800000030, 00203800000060...",08/02/2000,08/02/2024,"$240,731","$180,548",$0,"-$60,183"
1,100 SOUTH MAIN PARTNERS,100 MAIN,ICC07260000000,"0020530B000010, 0020530B000030",12/23/2008,12/23/2023,"$112,054","$3,070","$27,246","-$81,738"
2,1030 POPLAR LLC,1030 POPLAR,ICC07900000000,0200900000007C,06/15/2021,06/15/2034,"$6,722","$6,722",$0,$0
3,109 SOUTH SECOND PARTNERSHIP,109 SECOND,ICC07380000000,205300000130,04/07/2011,04/07/2024,"$79,863","$3,486","$19,094","-$57,283"
4,115 UNION AVE LLC,115 UNION,ICC07330000000,205300000020,03/29/2011,03/29/2021,"$29,104","$29,104",$0,$0
...,...,...,...,...,...,...,...,...,...,...
522,"WRIGHT MEDICAL TECHNOLOGY, INC",1023 CHERRY,IDB1590A000000,0015900000000D,12/31/2016,12/31/2031,"$5,522",$100,"$1,381","-$4,042"
523,"WRIGHT MEDICAL TECHNOLOGY, INC",0 MEMPHIS-ARLINGTON,IDB0000A000140,A0141L0A000040,12/31/2014,12/31/2024,"$62,901","$62,901",$0,$0
524,"WRIGHT MEDICAL TECHNOLOGY, INC",11576 MEMPHIS-ARLINGTON,IDB0000A00015A,3000150000000D,12/31/2014,12/31/2024,"$12,884","$12,884",$0,$0
525,"WRIGHT MEDICAL TECHNOLOGY, INC",11576 MEMPHIS ARLINGTON,IDB0000A00020A,3000160000000D,12/31/2015,12/31/2025,"$1,240",$100,$310,-$830


<h1>HEADERS</h3>

In [4]:
# Check the headers
df.columns

Index(['    Owner  . . ', '    Location    ', '    PILOT (Parcel No)    ',
       '    Underlying Parcel Number    ', '    Begin Date',
       '    End Date    ', '    Assmt Taxes    ', '    PILOT Contract Amt    ',
       '    PILOT Debt Service Amt    ', '    Difference'],
      dtype='object')

In [5]:
#Define cleaning functions for Headers

# Remove special characters when dealing with words
# transforms removed characters into spaces, removes . and -
def remove_special_for_words(entry: str | Number):
    return re.sub(r"[^a-zA-Z0-9]+", ' ', str(entry))

# Remove leading and trailing spaces
def truncate(entry: str | Number):
    return str(entry).strip()

# Replace spaces with underscores
def snake_case(entry: str | Number):
    return str(entry).replace(" ", "_")

# Make all letters lowercase
def lower_case(entry: str | Number):
    return str(entry).lower()

In [6]:
#Define applicaiton function for headers

# takes in a list of functions and applies those functions to all headers of a pandas dataframe
def clean_headers(df: pd.DataFrame, cleaning_functions_list:list[Callable]) -> pd.DataFrame:
    column_name_map = {item: item for item in df.columns}
    for item in column_name_map:
        for function in cleaning_functions_list:
            column_name_map[item] = function(column_name_map[item])
    df.rename(columns=column_name_map, inplace=True)
    return df

In [7]:
# Apply functions to the columns headers

df = clean_headers(df, [remove_special_for_words, truncate, snake_case, lower_case])
# Check the new column names
print(df.columns)
# View the new dataframe
df


Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')


Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 SOUTH MAIN LLC - WILMONT HOTEL LIMITED PARTN...,79 MADISON,ICC03900000000,"00203800000020, 00203800000030, 00203800000060...",08/02/2000,08/02/2024,"$240,731","$180,548",$0,"-$60,183"
1,100 SOUTH MAIN PARTNERS,100 MAIN,ICC07260000000,"0020530B000010, 0020530B000030",12/23/2008,12/23/2023,"$112,054","$3,070","$27,246","-$81,738"
2,1030 POPLAR LLC,1030 POPLAR,ICC07900000000,0200900000007C,06/15/2021,06/15/2034,"$6,722","$6,722",$0,$0
3,109 SOUTH SECOND PARTNERSHIP,109 SECOND,ICC07380000000,205300000130,04/07/2011,04/07/2024,"$79,863","$3,486","$19,094","-$57,283"
4,115 UNION AVE LLC,115 UNION,ICC07330000000,205300000020,03/29/2011,03/29/2021,"$29,104","$29,104",$0,$0
...,...,...,...,...,...,...,...,...,...,...
522,"WRIGHT MEDICAL TECHNOLOGY, INC",1023 CHERRY,IDB1590A000000,0015900000000D,12/31/2016,12/31/2031,"$5,522",$100,"$1,381","-$4,042"
523,"WRIGHT MEDICAL TECHNOLOGY, INC",0 MEMPHIS-ARLINGTON,IDB0000A000140,A0141L0A000040,12/31/2014,12/31/2024,"$62,901","$62,901",$0,$0
524,"WRIGHT MEDICAL TECHNOLOGY, INC",11576 MEMPHIS-ARLINGTON,IDB0000A00015A,3000150000000D,12/31/2014,12/31/2024,"$12,884","$12,884",$0,$0
525,"WRIGHT MEDICAL TECHNOLOGY, INC",11576 MEMPHIS ARLINGTON,IDB0000A00020A,3000160000000D,12/31/2015,12/31/2025,"$1,240",$100,$310,-$830


<h1>COLUMNS</h3>

In [8]:
# Check the data types
df.dtypes

owner                       object
location                    object
pilot_parcel_no             object
underlying_parcel_number    object
begin_date                  object
end_date                    object
assmt_taxes                 object
pilot_contract_amt          object
pilot_debt_service_amt      object
difference                  object
dtype: object

In [9]:
# Define an application function for columns
def clean_columns(df: pd.DataFrame, 
                  selected_columns: list[str], 
                  cleaning_functions_list:list[Callable]) -> pd.DataFrame:
    for col in selected_columns:
        for function in cleaning_functions_list:
            df[col] = df[col].apply(lambda x: function(x))
    return df

<h1>STRINGS</h1>

In [10]:
# Apply functions
string_columns = ["owner", "location"]
df = clean_columns(df, string_columns, [remove_special_for_words, truncate, lower_case])
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 south main llc wilmont hotel limited partner...,79 madison,ICC03900000000,"00203800000020, 00203800000030, 00203800000060...",08/02/2000,08/02/2024,"$240,731","$180,548",$0,"-$60,183"
1,100 south main partners,100 main,ICC07260000000,"0020530B000010, 0020530B000030",12/23/2008,12/23/2023,"$112,054","$3,070","$27,246","-$81,738"
2,1030 poplar llc,1030 poplar,ICC07900000000,0200900000007C,06/15/2021,06/15/2034,"$6,722","$6,722",$0,$0
3,109 south second partnership,109 second,ICC07380000000,205300000130,04/07/2011,04/07/2024,"$79,863","$3,486","$19,094","-$57,283"
4,115 union ave llc,115 union,ICC07330000000,205300000020,03/29/2011,03/29/2021,"$29,104","$29,104",$0,$0
...,...,...,...,...,...,...,...,...,...,...
522,wright medical technology inc,1023 cherry,IDB1590A000000,0015900000000D,12/31/2016,12/31/2031,"$5,522",$100,"$1,381","-$4,042"
523,wright medical technology inc,0 memphis arlington,IDB0000A000140,A0141L0A000040,12/31/2014,12/31/2024,"$62,901","$62,901",$0,$0
524,wright medical technology inc,11576 memphis arlington,IDB0000A00015A,3000150000000D,12/31/2014,12/31/2024,"$12,884","$12,884",$0,$0
525,wright medical technology inc,11576 memphis arlington,IDB0000A00020A,3000160000000D,12/31/2015,12/31/2025,"$1,240",$100,$310,-$830


In [11]:
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

<h2>Clean the Numeric Columns</h2>
<p>The numeric columns in this dataframe contain some special characters which we want to keep (periods and negative signs) and some which we must remove (dollar signs and commas). We need a new function to remove only the special characters we want to remove.</p>

<h4>Define an Entry-Cleaning Function to Remove Only Non-Number-Related Special Characters</h4>

In [12]:
# Remove special characters when dealing with numbers
# No spaces, does not remove - and .
def remove_special_for_numbers(my_string: str | Number):
    return re.sub(r"[^a-zA-Z0-9-.]+", '', str(my_string))

<h4>Apply the New Function to the Numeric Columns</h4>

In [13]:
# Clean the numeric columns
numeric_columns = ['assmt_taxes', 
                   'pilot_contract_amt',
                   'pilot_debt_service_amt',
                   'difference']
df = clean_columns(df, numeric_columns, [remove_special_for_numbers])
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 south main llc wilmont hotel limited partner...,79 madison,ICC03900000000,"00203800000020, 00203800000030, 00203800000060...",08/02/2000,08/02/2024,240731,180548,0,-60183
1,100 south main partners,100 main,ICC07260000000,"0020530B000010, 0020530B000030",12/23/2008,12/23/2023,112054,3070,27246,-81738
2,1030 poplar llc,1030 poplar,ICC07900000000,0200900000007C,06/15/2021,06/15/2034,6722,6722,0,0
3,109 south second partnership,109 second,ICC07380000000,205300000130,04/07/2011,04/07/2024,79863,3486,19094,-57283
4,115 union ave llc,115 union,ICC07330000000,205300000020,03/29/2011,03/29/2021,29104,29104,0,0
...,...,...,...,...,...,...,...,...,...,...
522,wright medical technology inc,1023 cherry,IDB1590A000000,0015900000000D,12/31/2016,12/31/2031,5522,100,1381,-4042
523,wright medical technology inc,0 memphis arlington,IDB0000A000140,A0141L0A000040,12/31/2014,12/31/2024,62901,62901,0,0
524,wright medical technology inc,11576 memphis arlington,IDB0000A00015A,3000150000000D,12/31/2014,12/31/2024,12884,12884,0,0
525,wright medical technology inc,11576 memphis arlington,IDB0000A00020A,3000160000000D,12/31/2015,12/31/2025,1240,100,310,-830


<h3>Clean the List Columns</h3>
<p>Pandas stores lists as strings by default, but it still may be wrothwhile to cast them to lists before writing the file out. This ensures that when the file is read in a gain, no mistake can be made about the nature of the column because the leading and trailing brackets will be visible.</p>

<h4>Define An Entry Cleaning Function to Convert to a List, Using Commas as Delimiters</h4>


In [14]:
# Note that the "if" statement here is not necessary. It just protects us as notebook-users from accidentally applying the function twice and winding up with nested lists
def convert_to_list(entry: Any) -> list:
    if isinstance(entry, list):
        return entry
    else:
        return str(entry).split(",")

<h4>Apply The List-Casting Function to All List Columns</h4>

In [15]:
list_columns = ['pilot_parcel_no',
                'underlying_parcel_number']
df = clean_columns(df, list_columns, [convert_to_list])
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 south main llc wilmont hotel limited partner...,79 madison,[ICC03900000000],"[00203800000020, 00203800000030, 00203800000...",08/02/2000,08/02/2024,240731,180548,0,-60183
1,100 south main partners,100 main,[ICC07260000000],"[0020530B000010, 0020530B000030]",12/23/2008,12/23/2023,112054,3070,27246,-81738
2,1030 poplar llc,1030 poplar,[ICC07900000000],[0200900000007C],06/15/2021,06/15/2034,6722,6722,0,0
3,109 south second partnership,109 second,[ICC07380000000],[205300000130],04/07/2011,04/07/2024,79863,3486,19094,-57283
4,115 union ave llc,115 union,[ICC07330000000],[205300000020],03/29/2011,03/29/2021,29104,29104,0,0
...,...,...,...,...,...,...,...,...,...,...
522,wright medical technology inc,1023 cherry,[IDB1590A000000],[0015900000000D],12/31/2016,12/31/2031,5522,100,1381,-4042
523,wright medical technology inc,0 memphis arlington,[IDB0000A000140],[A0141L0A000040],12/31/2014,12/31/2024,62901,62901,0,0
524,wright medical technology inc,11576 memphis arlington,[IDB0000A00015A],[3000150000000D],12/31/2014,12/31/2024,12884,12884,0,0
525,wright medical technology inc,11576 memphis arlington,[IDB0000A00020A],[3000160000000D],12/31/2015,12/31/2025,1240,100,310,-830


<h3>Clean the Date Columns</h3>
<p>The dates in this dataframe are being stored as strings, which is not ideal. We want to store dates as timestamps, as these can be added and subtracted among other operations, unlike strings. There are two date columns: begin_date and end_date. Something is odd about the end_date column however: some of the values are not dates at all. We will return to this, but for now lets case "begin_date" to a timestamp.

<h4>Define a Function For Converting an Entry to a Timestamp</h4>
<p>Dates come in a variety of formats, so you will often need to create a custom function to convert dates for your specific data set. Be sure to check the documentation for the pd.to_datetime function before writing one. 

In [16]:
def cast_date_to_datetime(entry: str | Number):
    return pd.to_datetime(str(entry))

<h4>Apply the Function to our Date Columns</h4>

In [17]:
date_columns = ["begin_date"]
df = clean_columns(df, date_columns, [cast_date_to_datetime])
df

Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 south main llc wilmont hotel limited partner...,79 madison,[ICC03900000000],"[00203800000020, 00203800000030, 00203800000...",2000-08-02,08/02/2024,240731,180548,0,-60183
1,100 south main partners,100 main,[ICC07260000000],"[0020530B000010, 0020530B000030]",2008-12-23,12/23/2023,112054,3070,27246,-81738
2,1030 poplar llc,1030 poplar,[ICC07900000000],[0200900000007C],2021-06-15,06/15/2034,6722,6722,0,0
3,109 south second partnership,109 second,[ICC07380000000],[205300000130],2011-04-07,04/07/2024,79863,3486,19094,-57283
4,115 union ave llc,115 union,[ICC07330000000],[205300000020],2011-03-29,03/29/2021,29104,29104,0,0
...,...,...,...,...,...,...,...,...,...,...
522,wright medical technology inc,1023 cherry,[IDB1590A000000],[0015900000000D],2016-12-31,12/31/2031,5522,100,1381,-4042
523,wright medical technology inc,0 memphis arlington,[IDB0000A000140],[A0141L0A000040],2014-12-31,12/31/2024,62901,62901,0,0
524,wright medical technology inc,11576 memphis arlington,[IDB0000A00015A],[3000150000000D],2014-12-31,12/31/2024,12884,12884,0,0
525,wright medical technology inc,11576 memphis arlington,[IDB0000A00020A],[3000160000000D],2015-12-31,12/31/2025,1240,100,310,-830


<h3>Clean the Object Columns</h3>
<p>Sometimes, we just can't cast an entire column to a single data type. The "end_date" column is an exmple of this. Click on the arrow next to "end_date" in the dataframe above to sort the values from greatest to smallest. You will notice that the "greatest" value is the string "End of Loan." This value is substantively important for our analysis, so we don't want to replace with it a missing value, nor with a date. We must convert the rest of the dates to the correct format while allowing this "End of Loan" value to persist. This means that this column will remain ans n "object" type.


<h4>Define a Function to Deal with Mixed Date Data</h4>
<p>This function features a "try/except" clause in order to avoid breaking when it hits the "End of Loan" value, which is not compatible with date castintg.</p>

In [18]:
def cast_end_date_to_datetime(entry: str | Number):
    try:
        return pd.to_datetime(str(entry))
    except ValueError:
        return entry

<h4>Apply our Special Mixed-Date-Sting Object Casting Function</h4>

In [19]:
# Note: because we are defining a function we are only going to use once in this case, it would be faster and easier to use a lambda function instead.
# For consistency, we will continue using the method we have already been using, but note that the following line of commented-out code do the same thing!
#df["end_date"] = df["end_date"].apply(lambda x: cast_end_date_to_datetime(x))

date_columns = ['begin_date']
df = clean_columns(df, date_columns, [cast_end_date_to_datetime])
df



Unnamed: 0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
0,1 south main llc wilmont hotel limited partner...,79 madison,[ICC03900000000],"[00203800000020, 00203800000030, 00203800000...",2000-08-02,08/02/2024,240731,180548,0,-60183
1,100 south main partners,100 main,[ICC07260000000],"[0020530B000010, 0020530B000030]",2008-12-23,12/23/2023,112054,3070,27246,-81738
2,1030 poplar llc,1030 poplar,[ICC07900000000],[0200900000007C],2021-06-15,06/15/2034,6722,6722,0,0
3,109 south second partnership,109 second,[ICC07380000000],[205300000130],2011-04-07,04/07/2024,79863,3486,19094,-57283
4,115 union ave llc,115 union,[ICC07330000000],[205300000020],2011-03-29,03/29/2021,29104,29104,0,0
...,...,...,...,...,...,...,...,...,...,...
522,wright medical technology inc,1023 cherry,[IDB1590A000000],[0015900000000D],2016-12-31,12/31/2031,5522,100,1381,-4042
523,wright medical technology inc,0 memphis arlington,[IDB0000A000140],[A0141L0A000040],2014-12-31,12/31/2024,62901,62901,0,0
524,wright medical technology inc,11576 memphis arlington,[IDB0000A00015A],[3000150000000D],2014-12-31,12/31/2024,12884,12884,0,0
525,wright medical technology inc,11576 memphis arlington,[IDB0000A00020A],[3000160000000D],2015-12-31,12/31/2025,1240,100,310,-830


<h1>Section 4: Casting Data Types</h1>
<p>Pandas variables are stored with data types internal to the pandas library. By default, if pandas is uncertain what the data type of a column is, that column is stored as an "object" data type. This type is the hardest to manipulate. We want to avoid storing data as objects unless we have no choice. Casting our data forces us to create handy schema that we can use for reading the data back in later immediately as the correct data type without having to do these operations again. </p>
    

<h3>Check the Current Data Types</h3>

In [20]:
# Check the current data types of the columns
df.dtypes

owner                               object
location                            object
pilot_parcel_no                     object
underlying_parcel_number            object
begin_date                  datetime64[ns]
end_date                            object
assmt_taxes                         object
pilot_contract_amt                  object
pilot_debt_service_amt              object
difference                          object
dtype: object

<h3>Define a Function for Casting Data Types</h3>

In [21]:
# Define a function to cast the data types of all DataFrame Variables using a dictionary of variable names to data types, as strings.
def cast_data_types(df: pd.DataFrame, names_to_types: dict[str,str]) -> pd.DataFrame:
    return df.astype(names_to_types)



<h3>Apply the Function and Cast the Data</h3>

In [22]:
df.columns

Index(['owner', 'location', 'pilot_parcel_no', 'underlying_parcel_number',
       'begin_date', 'end_date', 'assmt_taxes', 'pilot_contract_amt',
       'pilot_debt_service_amt', 'difference'],
      dtype='object')

In [23]:
# Apply our function to our data using a dictionary column names to desired data types
names_to_types = {'owner': 'string',
                  'location': 'string',
                  'pilot_parcel_no': 'string', 
                  'underlying_parcel_number': 'string',
                  'begin_date': 'datetime64[ns]', 
                  'end_date': 'object',
                  'assmt_taxes': 'float64',
                  'pilot_contract_amt': 'float64',
       'pilot_debt_service_amt': 'float64', 

                  }
df = cast_data_types(df, names_to_types)

# Check the new data types of the columns
df.dtypes

owner                       string[python]
location                    string[python]
pilot_parcel_no             string[python]
underlying_parcel_number    string[python]
begin_date                  datetime64[ns]
end_date                            object
assmt_taxes                        float64
pilot_contract_amt                 float64
pilot_debt_service_amt             float64
difference                          object
dtype: object

<h2>Section 5: Creating a Unique ID Column</h2>

<h3>Defining a Deterministic Hash Function (Advanced)</h3>
<p>Sometimes, data does not come with a unique identifier string column. Pandas automatically creates an index column which ennumerates our data (0, 1, 2, 3 ...) but this is not the recommended way to identify unique observations (entries) in a dataframe. When we perform data joins, we want our identifying column to always contain the same unique value when associated with the same object, and for our identifiers to never overlap. Using consecutive numbers prevents overlap, but it can create confusion. Is this really the identifying column, or was it just generated by default ages ago, and everyone forgot about it? In a small dataset where few joins are performed it may not matter, but in a large one it definitely does. </p>

<p>Lets create our own unique identifier based on the values in the "owner" column. We want to be sure that no matter when we run this function, we will always get the same unique identification string when we input the same "owner" value. This way, we can create anonymous unique identifiers using different datasets with different owners, and if any owners happen to be the same, they will still be associated with the same identifier.</p>

In [24]:
# Create a unique alphanumeric string by taking a deterministic hash of an input value
def deterministic_uuid(data:str):
    random.seed(data)
    id = uuid.UUID(bytes=bytes(random.getrandbits(8) for _ in range(16)), version=4)
    id = str(id).replace("-", "")
    return id

# Apply the function above to a DataFrame, using any number of selected columns as identifiers
# Doing it this way allows us to use more than one column to identify unique data points (like if we wanted to use a persons first name AND last name, each in different columns, for instance).
def set_unique_index(df: pd.DataFrame, columns_to_hash: list[str], index_name="id", index_length_limit: Optional[int] = None):
    df[index_name] = list(map(lambda x: deterministic_uuid(''.join([str(col_value) for col_value in x]))[0:index_length_limit], df[columns_to_hash].values))
    df.set_index(index_name, inplace=True)
    return df

<h3>Apply the Functions to Create The ID Column</h3>
<p> Now we will simply apply the functions we defined above, using a list of uniquely-identifying columns (in this case, just "owner") to distinguish one observation from another.

In [25]:
# Create a unique index column based on the values in the identifying column
# Note: Ids are based on a deterministic hash, which means they are 
# reproducible even if the function is run by a different user at a different time,
# as long as the string in the identifying column is the same. 
identifying_columns = ["owner"]

# The argument "index length limit" shortens the ID value generated to something more reasonable for this size dataset. By default, the id string is much longer.
df = set_unique_index(df=df, columns_to_hash = identifying_columns, index_length_limit=12)
df

Unnamed: 0_level_0,owner,location,pilot_parcel_no,underlying_parcel_number,begin_date,end_date,assmt_taxes,pilot_contract_amt,pilot_debt_service_amt,difference
owner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4d492ee75e30,1 south main llc wilmont hotel limited partner...,79 madison,['ICC03900000000'],"['00203800000020', ' 00203800000030', ' 002038...",2000-08-02,08/02/2024,240731.0,180548.0,0.0,-60183
468725c9628c,100 south main partners,100 main,['ICC07260000000'],"['0020530B000010', ' 0020530B000030']",2008-12-23,12/23/2023,112054.0,3070.0,27246.0,-81738
b39884512460,1030 poplar llc,1030 poplar,['ICC07900000000'],['0200900000007C'],2021-06-15,06/15/2034,6722.0,6722.0,0.0,0
f26dc5a380c1,109 south second partnership,109 second,['ICC07380000000'],['205300000130'],2011-04-07,04/07/2024,79863.0,3486.0,19094.0,-57283
129ab94e175b,115 union ave llc,115 union,['ICC07330000000'],['205300000020'],2011-03-29,03/29/2021,29104.0,29104.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...
d436d1c9a0ff,wright medical technology inc,1023 cherry,['IDB1590A000000'],['0015900000000D'],2016-12-31,12/31/2031,5522.0,100.0,1381.0,-4042
d436d1c9a0ff,wright medical technology inc,0 memphis arlington,['IDB0000A000140'],['A0141L0A000040'],2014-12-31,12/31/2024,62901.0,62901.0,0.0,0
d436d1c9a0ff,wright medical technology inc,11576 memphis arlington,['IDB0000A00015A'],['3000150000000D'],2014-12-31,12/31/2024,12884.0,12884.0,0.0,0
d436d1c9a0ff,wright medical technology inc,11576 memphis arlington,['IDB0000A00020A'],['3000160000000D'],2015-12-31,12/31/2025,1240.0,100.0,310.0,-830


<h1>Section 6: Writing Out the Clean Data</h1>
<p>Now that the data is clean, we want to write out a clean copy of the data to a new destination. We also may optionally want to convert the outfile to utf-8 format, as this format is the most versitile and can be read by the most sources.

<h3>Write Out the Data</h3>

In [26]:
# Save the now-clean csv
outfile_path = "../data/processed/s2_2019.csv"
df.to_csv(outfile_path, sep=",")

<h3>Define a Function to Convert a File to UTF-8 Format</h3>

In [27]:
# convert a csv into utf8 format
# Note that this function will replace any remaining non-utf-8-compatible special characters in the data with its nearest utf-8 equivalent.

def convert_utf8(original_file_path: str, new_file_path: str):
    df = pd.read_csv(original_file_path, converters=defaultdict(lambda i: str))
    for column in df.columns:
        df[column] = df[column].apply(lambda x: unidecode(str(x)))
    df.to_csv(new_file_path, encoding='utf-8')

<h3>Convert the Outfile to UTF8 Format</h3>

In [28]:
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)

<h1>Section 7: Next Steps</h1>
<p>Now that you have the basic tools to clean a dataset, you'll need some functions to manipulate that data easily for presenting summary statistics and charts. In a future notebook, I will present functions for gaining a quick snapshot of a dataset.