In [1]:
import os

import pandas as pd

from utils import *



<h1>Cleaning a CSV</h1>

Read in a CSV

In [2]:
# Read the raw municipality-breakdown CSV into a DataFrame and display it
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s1_2021.csv"
df = pd.read_csv(filepath_or_buffer=infile_path)
df

Unnamed: 0,Board,Number of Contracts,Amount Billed Pilot (INL),Amount Billed Debt Service,Assessment,Tax Based on Assessment at 2021 Tax Rate,Difference,Percentage Total PILOT Billing
0,Downtown Memphis Commission,127,"$1,702,925.70","$1,354,076.55","$406,461,240","$14,022,912.78",-10965910.53,18.92%
1,EDGE - Shelby,226,"$4,078,933.42","$5,183,964.81","$730,912,470","$25,216,480.22",-15953581.99,57.32%
2,Industrial Development of Arlington,11,"$109,409.96","$57,885.31","$9,546,730","$329,362.19",-162066.92,1.04%
3,Industrial Development of Bartlett,9,"$22,387.74","$137,868.24","$10,152,780","$350,270.91",-190014.93,0.99%
4,Industrial Development of Collierville,10,"$673,347.93","$816,687.94","$99,353,000","$3,427,678.50",-1937642.63,9.22%
5,Industrial Development of Germantown,4,"$315,681.29","$215,876.06","$38,602,690","$1,331,792.81",-800235.46,3.29%
6,Industrial Development of Millington,1,"$8,757.14","$92,733.24","$10,751,680","$370,932.96",-269442.58,0.63%
7,Health & Education Board - Memphis,101,"$1,263,358.70",$0.00,"$235,359,315","$8,119,896.37",-6856537.67,7.82%
8,Health & Education Board - Shelby,6,"$89,436.98",$0.00,"$30,543,560","$1,053,752.82",-964315.84,0.55%
9,Miscellaneous Contracts,8,"$29,640.00",$0.00,"$9,504,040","$327,889.38",-298249.38,0.18%


In [3]:
# Normalise the column headers by handing clean_headers the list of cleaner
# functions; the resulting names are shown via df.columns.
header_cleaners = [
    remove_special_for_words,
    truncate,
    snake_case,
    lower_case,
]
df = clean_headers(df, header_cleaners)
df.columns

Index(['board', 'number_of_contracts', 'amount_billed_pilot_inl',
       'amount_billed_debt_service', 'assessment',
       'tax_based_on_assessment_at_2021_tax_rate', 'difference',
       'percentage_total_pilot_billing'],
      dtype='object')

In [4]:
# Tidy the free-text columns (only "board" here) with the word-oriented cleaners
string_columns = ["board"]
string_cleaners = [
    remove_special_for_words,
    truncate,
    lower_case,
]
df = clean_columns(df, string_columns, string_cleaners)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,127,"$1,702,925.70","$1,354,076.55","$406,461,240","$14,022,912.78",-10965910.53,18.92%
1,edge shelby,226,"$4,078,933.42","$5,183,964.81","$730,912,470","$25,216,480.22",-15953581.99,57.32%
2,industrial development of arlington,11,"$109,409.96","$57,885.31","$9,546,730","$329,362.19",-162066.92,1.04%
3,industrial development of bartlett,9,"$22,387.74","$137,868.24","$10,152,780","$350,270.91",-190014.93,0.99%
4,industrial development of collierville,10,"$673,347.93","$816,687.94","$99,353,000","$3,427,678.50",-1937642.63,9.22%
5,industrial development of germantown,4,"$315,681.29","$215,876.06","$38,602,690","$1,331,792.81",-800235.46,3.29%
6,industrial development of millington,1,"$8,757.14","$92,733.24","$10,751,680","$370,932.96",-269442.58,0.63%
7,health education board memphis,101,"$1,263,358.70",$0.00,"$235,359,315","$8,119,896.37",-6856537.67,7.82%
8,health education board shelby,6,"$89,436.98",$0.00,"$30,543,560","$1,053,752.82",-964315.84,0.55%
9,miscellaneous contracts,8,"$29,640.00",$0.00,"$9,504,040","$327,889.38",-298249.38,0.18%


In [5]:
# Strip formatting from the money / percentage columns so they can be cast later.
# number_of_contracts is left out of this list.
numeric_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]
numeric_cleaners = [remove_special_for_numbers]
df = clean_columns(df, numeric_columns, numeric_cleaners)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,127,1702925.7,1354076.55,406461240,14022912.78,-10965910.53,18.92
1,edge shelby,226,4078933.42,5183964.81,730912470,25216480.22,-15953581.99,57.32
2,industrial development of arlington,11,109409.96,57885.31,9546730,329362.19,-162066.92,1.04
3,industrial development of bartlett,9,22387.74,137868.24,10152780,350270.91,-190014.93,0.99
4,industrial development of collierville,10,673347.93,816687.94,99353000,3427678.5,-1937642.63,9.22
5,industrial development of germantown,4,315681.29,215876.06,38602690,1331792.81,-800235.46,3.29
6,industrial development of millington,1,8757.14,92733.24,10751680,370932.96,-269442.58,0.63
7,health education board memphis,101,1263358.7,0.0,235359315,8119896.37,-6856537.67,7.82
8,health education board shelby,6,89436.98,0.0,30543560,1053752.82,-964315.84,0.55
9,miscellaneous contracts,8,29640.0,0.0,9504040,327889.38,-298249.38,0.18


In [6]:
# Check the data types of each variable.
# The cleaned monetary and percentage columns are still reported as `object`
# in the output of this cell, which is why they are cast explicitly in the next step.
df.dtypes

board                                       object
number_of_contracts                          int64
amount_billed_pilot_inl                     object
amount_billed_debt_service                  object
assessment                                  object
tax_based_on_assessment_at_2021_tax_rate    object
difference                                  object
percentage_total_pilot_billing              object
dtype: object

In [7]:
# Cast every column to its intended type: the label stays a string, the
# contract count an int, and all monetary / percentage columns become floats.
float_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]
names_to_types = {
    "board": str,
    "number_of_contracts": int,
    **dict.fromkeys(float_columns, float),
}
df = cast_data_types(df, names_to_types)
df.dtypes

board                                        object
number_of_contracts                           int64
amount_billed_pilot_inl                     float64
amount_billed_debt_service                  float64
assessment                                  float64
tax_based_on_assessment_at_2021_tax_rate    float64
difference                                  float64
percentage_total_pilot_billing              float64
dtype: object

In [8]:
# Replace the default index with an id derived from the identifying column(s).
# Note: the ids come from a deterministic hash, so they are reproducible across
# users and runs as long as the string in the identifying column is unchanged.
# index_length_limit=12 matches the 12-character ids shown in the output.

identifying_columns = ["board"]
index_name = "board_id"
df = set_unique_index(
    df=df,
    columns_to_hash=identifying_columns,
    index_length_limit=12,
    index_name=index_name,
)
df

Unnamed: 0_level_0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
board_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5b1f4c0c65a3,downtown memphis commission,127,1702925.7,1354076.55,406461240.0,14022912.78,-10965910.53,18.92
d5fe3a85ec4e,edge shelby,226,4078933.42,5183964.81,730912470.0,25216480.22,-15953581.99,57.32
abb34861ca91,industrial development of arlington,11,109409.96,57885.31,9546730.0,329362.19,-162066.92,1.04
aada4183f85c,industrial development of bartlett,9,22387.74,137868.24,10152780.0,350270.91,-190014.93,0.99
91e543f3085f,industrial development of collierville,10,673347.93,816687.94,99353000.0,3427678.5,-1937642.63,9.22
3f1cf0abc31c,industrial development of germantown,4,315681.29,215876.06,38602690.0,1331792.81,-800235.46,3.29
757a7152194c,industrial development of millington,1,8757.14,92733.24,10751680.0,370932.96,-269442.58,0.63
3bcfcbd3a12d,health education board memphis,101,1263358.7,0.0,235359315.0,8119896.37,-6856537.67,7.82
64c9e33b120e,health education board shelby,6,89436.98,0.0,30543560.0,1053752.82,-964315.84,0.55
df3a7c5fc143,miscellaneous contracts,8,29640.0,0.0,9504040.0,327889.38,-298249.38,0.18


In [9]:
# Save the now-clean CSV and convert it to UTF-8 format.
outfile_path = "../data/processed/s1_2021.csv"
# Make sure the destination folder exists first: DataFrame.to_csv does not create
# missing directories and raises OSError, which breaks a run on a fresh checkout.
os.makedirs(os.path.dirname(outfile_path), exist_ok=True)
df.to_csv(outfile_path)
# convert_utf8 receives the same path as source and destination, so presumably it
# rewrites the file in place — TODO confirm against the implementation in utils.
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)