In [1]:
import pandas as pd
from utils import *



<h1>Cleaning a CSV</h1>

Read in a CSV

In [2]:
# Read the raw municipality-breakdown CSV into a DataFrame and display it.
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s1_2022.csv"
df = pd.read_csv(filepath_or_buffer=infile_path)
df

Unnamed: 0,Board,Number of Contracts,Amount Billed Pilot (INL),Amount Billed Debt Service,Assessment,Tax Based on Assessment at 2021 Tax Rate,Difference,Percentage Total PILOT Billing
0,Downtown Memphis Commission,124,"$2,265,277.82","$1,378,186.61","$403,472,360","$13,677,713.00",-10034248.57,20.05%
1,EDGE - Shelby,241,"$4,357,705.55","$5,922,590.25","$845,013,990","$28,645,974.26",-18365678.46,56.57%
2,Industrial Development of Arlington,10,"$116,203.75","$49,372.12","$8,906,110","$301,917.13",-136341.26,0.91%
3,Industrial Development of Bartlett,6,"$21,998.39","$47,646.73","$7,340,910","$248,856.85",-179211.73,0.38%
4,Industrial Development of Collierville,12,"$690,322.00","$914,883.78","$103,923,710","$3,523,013.77",-1917807.99,8.83%
5,Industrial Development of Germantown,4,"$310,191.18","$211,034.77","$34,051,040","$1,154,330.26",-633104.31,2.87%
6,Industrial Development of Millington,1,"$50,530.09","$80,385.75","$10,751,680","$364,481.95",-233566.11,0.72%
7,Health & Education Board - Memphis,107,"$1,627,887.89",$0.00,"$245,059,120","$8,307,504.17",-6679616.28,8.96%
8,Health & Education Board - Shelby,6,"$88,595.41",$0.00,"$30,543,560","$1,035,426.68",-946831.27,0.49%
9,Miscellaneous Contracts,10,"$35,460.00",$0.00,"$10,528,400","$356,912.76",-321452.76,0.20%


In [3]:
# Normalise the column headers. Per the output below, a header such as
# "Amount Billed Pilot (INL)" becomes "amount_billed_pilot_inl".
header_cleaning_steps = [remove_special_for_words, truncate, snake_case, lower_case]
df = clean_headers(df, header_cleaning_steps)
df.columns

Index(['board', 'number_of_contracts', 'amount_billed_pilot_inl',
       'amount_billed_debt_service', 'assessment',
       'tax_based_on_assessment_at_2021_tax_rate', 'difference',
       'percentage_total_pilot_billing'],
      dtype='object')

In [4]:
# Normalise the free-text columns. Per the output below, "EDGE - Shelby"
# becomes "edge shelby": punctuation is dropped and the text is lower-cased.
string_columns = ["board"]
string_cleaning_steps = [remove_special_for_words, truncate, lower_case]
df = clean_columns(df, string_columns, string_cleaning_steps)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,124,"$2,265,277.82","$1,378,186.61","$403,472,360","$13,677,713.00",-10034248.57,20.05%
1,edge shelby,241,"$4,357,705.55","$5,922,590.25","$845,013,990","$28,645,974.26",-18365678.46,56.57%
2,industrial development of arlington,10,"$116,203.75","$49,372.12","$8,906,110","$301,917.13",-136341.26,0.91%
3,industrial development of bartlett,6,"$21,998.39","$47,646.73","$7,340,910","$248,856.85",-179211.73,0.38%
4,industrial development of collierville,12,"$690,322.00","$914,883.78","$103,923,710","$3,523,013.77",-1917807.99,8.83%
5,industrial development of germantown,4,"$310,191.18","$211,034.77","$34,051,040","$1,154,330.26",-633104.31,2.87%
6,industrial development of millington,1,"$50,530.09","$80,385.75","$10,751,680","$364,481.95",-233566.11,0.72%
7,health education board memphis,107,"$1,627,887.89",$0.00,"$245,059,120","$8,307,504.17",-6679616.28,8.96%
8,health education board shelby,6,"$88,595.41",$0.00,"$30,543,560","$1,035,426.68",-946831.27,0.49%
9,miscellaneous contracts,10,"$35,460.00",$0.00,"$10,528,400","$356,912.76",-321452.76,0.20%


In [5]:
# Strip formatting characters from the numeric columns. Per the output below,
# "$2,265,277.82" becomes 2265277.82 and "20.05%" becomes 20.05.
# The columns are still `object` dtype afterwards (see the dtype check that
# follows); the actual cast to float happens in a later cell.
numeric_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]
numeric_cleaning_steps = [remove_special_for_numbers]
df = clean_columns(df, numeric_columns, numeric_cleaning_steps)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,124,2265277.82,1378186.61,403472360,13677713.0,-10034248.57,20.05
1,edge shelby,241,4357705.55,5922590.25,845013990,28645974.26,-18365678.46,56.57
2,industrial development of arlington,10,116203.75,49372.12,8906110,301917.13,-136341.26,0.91
3,industrial development of bartlett,6,21998.39,47646.73,7340910,248856.85,-179211.73,0.38
4,industrial development of collierville,12,690322.0,914883.78,103923710,3523013.77,-1917807.99,8.83
5,industrial development of germantown,4,310191.18,211034.77,34051040,1154330.26,-633104.31,2.87
6,industrial development of millington,1,50530.09,80385.75,10751680,364481.95,-233566.11,0.72
7,health education board memphis,107,1627887.89,0.0,245059120,8307504.17,-6679616.28,8.96
8,health education board shelby,6,88595.41,0.0,30543560,1035426.68,-946831.27,0.49
9,miscellaneous contracts,10,35460.0,0.0,10528400,356912.76,-321452.76,0.2


In [6]:
# Check the data types of each variable.
# The cleaned money/percentage columns still show up as `object` in the
# output below, which is why the next cell casts them explicitly.
df.dtypes

board                                       object
number_of_contracts                          int64
amount_billed_pilot_inl                     object
amount_billed_debt_service                  object
assessment                                  object
tax_based_on_assessment_at_2021_tax_rate    object
difference                                  object
percentage_total_pilot_billing              object
dtype: object

In [7]:
# Cast every column to its intended type. All cleaned money and percentage
# columns become floats; `board` stays a string and the contract count an int.
float_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]
names_to_types = {
    "board": str,
    "number_of_contracts": int,
    # Same keys, in the same order, as spelling each float column out by hand.
    **{column: float for column in float_columns},
}
df = cast_data_types(df, names_to_types)
df.dtypes

board                                        object
number_of_contracts                           int64
amount_billed_pilot_inl                     float64
amount_billed_debt_service                  float64
assessment                                  float64
tax_based_on_assessment_at_2021_tax_rate    float64
difference                                  float64
percentage_total_pilot_billing              float64
dtype: object

In [8]:
# Create a unique index column based on the values in the identifying column.
# Note: ids are based on a deterministic hash, which means they are
# reproducible even if the function is run by a different user at a different
# time, as long as the string in the identifying column is the same.
identifying_columns = ["board"]
index_name = "board_id"
df = set_unique_index(
    df=df,
    columns_to_hash=identifying_columns,
    index_length_limit=12,
    index_name=index_name,
)

# Because the hash is deterministic, identical `board` strings produce the
# same id, and the ids are limited to 12 characters. Verify that the resulting
# index really is unique before the frame is saved, instead of letting a
# duplicate key slip silently into the processed file.
duplicated_ids = df.index[df.index.duplicated()].tolist()
assert df.index.is_unique, f"{index_name} contains duplicate ids: {duplicated_ids}"
df

Unnamed: 0_level_0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
board_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5b1f4c0c65a3,downtown memphis commission,124,2265277.82,1378186.61,403472360.0,13677713.0,-10034248.57,20.05
d5fe3a85ec4e,edge shelby,241,4357705.55,5922590.25,845013990.0,28645974.26,-18365678.46,56.57
abb34861ca91,industrial development of arlington,10,116203.75,49372.12,8906110.0,301917.13,-136341.26,0.91
aada4183f85c,industrial development of bartlett,6,21998.39,47646.73,7340910.0,248856.85,-179211.73,0.38
91e543f3085f,industrial development of collierville,12,690322.0,914883.78,103923710.0,3523013.77,-1917807.99,8.83
3f1cf0abc31c,industrial development of germantown,4,310191.18,211034.77,34051040.0,1154330.26,-633104.31,2.87
757a7152194c,industrial development of millington,1,50530.09,80385.75,10751680.0,364481.95,-233566.11,0.72
3bcfcbd3a12d,health education board memphis,107,1627887.89,0.0,245059120.0,8307504.17,-6679616.28,8.96
64c9e33b120e,health education board shelby,6,88595.41,0.0,30543560.0,1035426.68,-946831.27,0.49
df3a7c5fc143,miscellaneous contracts,10,35460.0,0.0,10528400.0,356912.76,-321452.76,0.2


In [9]:
# Write the cleaned frame (including its index) to the processed-data folder,
# then convert the written file to UTF-8 in place: the same path is passed as
# both the source and the destination of the conversion.
outfile_path = "../data/processed/s1_2022.csv"
df.to_csv(path_or_buf=outfile_path)
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)