In [1]:
from pathlib import Path

import pandas as pd
from utils import *



<h1>Cleaning a CSV</h1>

Read the raw CSV file into a pandas DataFrame.

In [2]:
# Load the raw CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
infile_path = "../data/raw/silver_shelby_county_properties_municipality_breakdown - s1_2020.csv"
df = pd.read_csv(infile_path)
# Display the frame to inspect the raw values before cleaning (the money columns
# arrive as text such as "$1,810,641.75" and the percentage column as "18.61%").
df

Unnamed: 0,Board,Number of Contracts,Amount Billed Pilot (INL),Amount Billed Debt Service,Assessment,Tax Based on Assessment at 2021 Tax Rate,Difference,Percentage Total PILOT Billing
0,Downtown Memphis Commission,124,"$1,810,641.75","$1,253,932","$362,716,620.00",14690023.11,-11625449.56,18.61%
1,EDGE - Shelby,244,"$3,571,519.04","$5,759,342","$597,033,940.00",24179874.57,-14849013.54,56.66%
2,Industrial Development of Arlington,12,"$99,227.89","$103,320","$12,374,870.00",501182.24,-298634.1,1.23%
3,Industrial Development of Bartlett,11,"$31,642.62","$103,452","$10,632,240.00",430605.72,-295510.76,0.82%
4,Industrial Development of Collierville,11,"$768,997.18","$918,418","$95,793,370.00",3879631.49,-2192216.19,10.25%
5,Industrial Development of Germantown,5,"$347,401.82","$233,408","$34,110,510.00",1381475.66,-800666.29,3.53%
6,Industrial Development of Millington,1,"$10,280.12","$79,383","$7,840,320.00",317532.96,-227869.6,0.54%
7,Health & Education Board - Memphis,80,"$1,227,981.40",$0,"$156,089,720.00",6321633.66,-5093652.26,7.46%
8,Health & Education Board - Shelby,7,"$108,060.40",$0,"$28,279,320.00",1145312.46,-1037252.06,0.66%
9,Miscellaneous Contracts,10,"$35,460.00",$0,"$9,033,520.00",365857.56,-330397.56,0.22%


In [3]:
# Normalise the column headers.
# The cleaner functions come from `utils` and are handed to clean_headers as a
# list; the resulting headers (displayed below) are lower-case snake_case with
# punctuation such as parentheses removed.
header_cleaners = [remove_special_for_words, truncate, snake_case, lower_case]
df = clean_headers(df, header_cleaners)
df.columns

Index(['board', 'number_of_contracts', 'amount_billed_pilot_inl',
       'amount_billed_debt_service', 'assessment',
       'tax_based_on_assessment_at_2021_tax_rate', 'difference',
       'percentage_total_pilot_billing'],
      dtype='object')

In [4]:
# Normalise the free-text columns.
string_columns = ["board"]

# Word-oriented cleaners from `utils`; snake_case is deliberately not included
# here, so the cleaned values keep their spaces (e.g. "edge shelby").
string_cleaners = [remove_special_for_words, truncate, lower_case]
df = clean_columns(df, string_columns, string_cleaners)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,124,"$1,810,641.75","$1,253,932","$362,716,620.00",14690023.11,-11625449.56,18.61%
1,edge shelby,244,"$3,571,519.04","$5,759,342","$597,033,940.00",24179874.57,-14849013.54,56.66%
2,industrial development of arlington,12,"$99,227.89","$103,320","$12,374,870.00",501182.24,-298634.1,1.23%
3,industrial development of bartlett,11,"$31,642.62","$103,452","$10,632,240.00",430605.72,-295510.76,0.82%
4,industrial development of collierville,11,"$768,997.18","$918,418","$95,793,370.00",3879631.49,-2192216.19,10.25%
5,industrial development of germantown,5,"$347,401.82","$233,408","$34,110,510.00",1381475.66,-800666.29,3.53%
6,industrial development of millington,1,"$10,280.12","$79,383","$7,840,320.00",317532.96,-227869.6,0.54%
7,health education board memphis,80,"$1,227,981.40",$0,"$156,089,720.00",6321633.66,-5093652.26,7.46%
8,health education board shelby,7,"$108,060.40",$0,"$28,279,320.00",1145312.46,-1037252.06,0.66%
9,miscellaneous contracts,10,"$35,460.00",$0,"$9,033,520.00",365857.56,-330397.56,0.22%


In [5]:
# Strip formatting characters from the columns that hold numbers.
# number_of_contracts is not listed: it already loads as int64.
numeric_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]

# As the displayed result shows, "$", "," and "%" are removed while the minus
# sign and decimal point are kept. The values are not converted to a numeric
# dtype here; that happens in the casting step further down.
numeric_cleaners = [remove_special_for_numbers]
df = clean_columns(df, numeric_columns, numeric_cleaners)
df

Unnamed: 0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
0,downtown memphis commission,124,1810641.75,1253932,362716620.0,14690023.11,-11625449.56,18.61
1,edge shelby,244,3571519.04,5759342,597033940.0,24179874.57,-14849013.54,56.66
2,industrial development of arlington,12,99227.89,103320,12374870.0,501182.24,-298634.1,1.23
3,industrial development of bartlett,11,31642.62,103452,10632240.0,430605.72,-295510.76,0.82
4,industrial development of collierville,11,768997.18,918418,95793370.0,3879631.49,-2192216.19,10.25
5,industrial development of germantown,5,347401.82,233408,34110510.0,1381475.66,-800666.29,3.53
6,industrial development of millington,1,10280.12,79383,7840320.0,317532.96,-227869.6,0.54
7,health education board memphis,80,1227981.4,0,156089720.0,6321633.66,-5093652.26,7.46
8,health education board shelby,7,108060.4,0,28279320.0,1145312.46,-1037252.06,0.66
9,miscellaneous contracts,10,35460.0,0,9033520.0,365857.56,-330397.56,0.22


In [6]:
# Check the data type of each column before casting.
# In the recorded output only number_of_contracts is numeric (int64); every
# other column is still dtype `object` (presumably strings) at this point.
df.dtypes

board                                       object
number_of_contracts                          int64
amount_billed_pilot_inl                     object
amount_billed_debt_service                  object
assessment                                  object
tax_based_on_assessment_at_2021_tax_rate    object
difference                                  object
percentage_total_pilot_billing              object
dtype: object

In [7]:
# Cast every column to its intended type.
# Columns holding monetary amounts or percentages become floats.
float_columns = [
    "amount_billed_pilot_inl",
    "amount_billed_debt_service",
    "assessment",
    "tax_based_on_assessment_at_2021_tax_rate",
    "difference",
    "percentage_total_pilot_billing",
]

# Column name -> Python type, in the same order as the columns of `df`.
names_to_types = {"board": str, "number_of_contracts": int}
names_to_types.update(dict.fromkeys(float_columns, float))

df = cast_data_types(df, names_to_types)
# Confirm the cast: the recorded output shows object / int64 / float64.
df.dtypes

board                                        object
number_of_contracts                           int64
amount_billed_pilot_inl                     float64
amount_billed_debt_service                  float64
assessment                                  float64
tax_based_on_assessment_at_2021_tax_rate    float64
difference                                  float64
percentage_total_pilot_billing              float64
dtype: object

In [8]:
# Create a unique index column based on the values in the identifying column(s).
# Note: ids are based on a deterministic hash, which means they are
# reproducible even if the function is run by a different user at a different
# time, as long as the string in the identifying column is the same.
# NOTE(review): index_length_limit=12 yields the 12-character ids shown in the
# output; presumably these are truncated hashes -- confirm that
# set_unique_index guards against collisions after truncation.

identifying_columns = ["board"]
index_name = "board_id"
df = set_unique_index(df=df, columns_to_hash = identifying_columns, index_length_limit=12, index_name=index_name)
df

Unnamed: 0_level_0,board,number_of_contracts,amount_billed_pilot_inl,amount_billed_debt_service,assessment,tax_based_on_assessment_at_2021_tax_rate,difference,percentage_total_pilot_billing
board_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5b1f4c0c65a3,downtown memphis commission,124,1810641.75,1253932.0,362716620.0,14690023.11,-11625449.56,18.61
d5fe3a85ec4e,edge shelby,244,3571519.04,5759342.0,597033940.0,24179874.57,-14849013.54,56.66
abb34861ca91,industrial development of arlington,12,99227.89,103320.0,12374870.0,501182.24,-298634.1,1.23
aada4183f85c,industrial development of bartlett,11,31642.62,103452.0,10632240.0,430605.72,-295510.76,0.82
91e543f3085f,industrial development of collierville,11,768997.18,918418.0,95793370.0,3879631.49,-2192216.19,10.25
3f1cf0abc31c,industrial development of germantown,5,347401.82,233408.0,34110510.0,1381475.66,-800666.29,3.53
757a7152194c,industrial development of millington,1,10280.12,79383.0,7840320.0,317532.96,-227869.6,0.54
3bcfcbd3a12d,health education board memphis,80,1227981.4,0.0,156089720.0,6321633.66,-5093652.26,7.46
64c9e33b120e,health education board shelby,7,108060.4,0.0,28279320.0,1145312.46,-1037252.06,0.66
df3a7c5fc143,miscellaneous contracts,10,35460.0,0.0,9033520.0,365857.56,-330397.56,0.22


In [9]:
# Save the now-clean frame as a CSV in the processed-data folder.
# The index is written as the first column (pandas' default for to_csv).
outfile_path = "../data/processed/s1_2020.csv"

# to_csv raises OSError when the target directory does not exist (for example
# on a fresh checkout), so create it first. This is a no-op if it already exists.
Path(outfile_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(outfile_path)

# Convert the saved file to UTF-8, overwriting it (input and output paths are
# the same file).
# NOTE(review): pandas already writes UTF-8 by default, so this step may be
# redundant -- confirm what convert_utf8 does, and that it is safe to read and
# write the same path, before removing or changing it.
convert_utf8(original_file_path=outfile_path, new_file_path=outfile_path)