# Process Data from Original Source (Fulton Files)

In [1]:
import os
import csv
import pandas as pd
import geopandas as gpd

# Display settings: show up to 150 columns and render floats with two decimals
pd.set_option('display.max_columns', 150)
pd.options.display.float_format = '{:.2f}'.format

# Directory holding the raw Fulton County files read by the cells below
FULTON_DIR = './data/raw_fulton/'
# os.listdir returns entries in arbitrary order; sort so files are always
# read and concatenated in the same order on every run and machine
fulton_files = sorted(os.listdir(FULTON_DIR))

### Helper Functions

In [2]:
def contains_keyword(filename: str, keywords: list[str]) -> bool:
    """Return True if at least one of `keywords` is a substring of `filename`.

    The match is case-sensitive. An empty `keywords` list yields False.
    """
    return any(keyword in filename for keyword in keywords)

def clean_and_cast_column(column: pd.Series, var_map: dict) -> pd.Series:
    """Fill missing values in `column` and cast it to the dtype named in `var_map`.

    Args:
        column: Series to clean. Its `name` is the key looked up in `var_map`.
        var_map: Mapping of column name -> target dtype, one of
            "int", "float" or "string".

    Returns:
        A new Series cast to the target dtype. Nulls are replaced by 0 for
        numeric targets and by "" for string targets.

    Raises:
        KeyError: if `column.name` is not a key of `var_map`.
        ValueError: if the mapped dtype is not "int", "float" or "string",
            or if a text value cannot be parsed as a number.

    Side effects:
        Prints the number of nulls found in the column before they are filled.
    """
    to_dtype = var_map[column.name]
    is_numeric_target = to_dtype in ("int", "float")

    if is_numeric_target:
        fill_val = 0
    elif to_dtype == "string":
        fill_val = ""
    else:
        raise ValueError(f"{to_dtype} is not a valid data type!")

    is_text_column = (
        pd.api.types.is_object_dtype(column.dtype)
        or pd.api.types.is_string_dtype(column.dtype)
    )

    if is_numeric_target and is_text_column:
        # astype("str") turns missing values into literal text such as "None"
        # or "<NA>", which cannot be parsed as float. Remember where the nulls
        # were and restore them after the thousands separators are removed.
        was_null = column.isna()
        without_commas = column.astype("str").str.replace(",", "", regex=False)
        column = without_commas.mask(was_null).astype("float")

    # Record number of filled nulls
    print(f"Number of nulls in column {column.name}: {column.isna().sum()}")

    column = column.fillna(fill_val)
    return column.astype(to_dtype)

## Create a single file with all LUC == 101 Fulton County Digest (Parcel) Data for all Years

In [3]:
# Filename fragments that identify the digest (parcel) workbooks
keywords = ["DIGEST", "NF", "SF"]

# Columns to load from each digest workbook, mapped to the dtype that
# clean_and_cast_column applies later. Every column is listed exactly once
# (the earlier literal repeated "Calcacres" and "D Effyr").
digest_cols = {
    "Taxyr": "int",
    "Parid": "string",
    "Situs Adrno": "int",
    "Situs Adrdir": "string",
    "Situs Adrstr": "string",
    "Situs Adrsuf": "string",
    "Cityname": "string",
    "Luc": "string",
    "Calcacres": "float",
    "Own1": "string",
    "Own2": "string",
    "Owner Adrno": "int",
    "Owner Adradd": "string",
    "Owner Adrdir": "string",
    "Owner Adrstr": "string",
    "Owner Adrsuf": "string",
    "Cityname.1": "string",
    "Statecode": "string",
    "Zip1": "string",
    "Aprtot": "float",
    "D Yrblt": "int",
    "D Effyr": "int",
    "D Yrremod": "int",
    "Sfla": "float",
    "Rmbed": "int",
    "Fixbath": "int",
    "Heat": "int",
    "Extwall": "int",
    "Style": "string",
    "Rmtot": "int",
    "D Grade": "string",
    "Bsmt": "int"
}

# Build a real list rather than a one-shot filter() iterator so the selection
# can be inspected or reused after the files have been read
desired_files = [file for file in fulton_files if contains_keyword(file, keywords)]

# Read desired files and only parse desired cols
# Need to ensure LUC is read in as a str so we can filter appropriately
desired_files_dfs = [
    pd.read_excel(
        os.path.join(FULTON_DIR, file),
        usecols=list(digest_cols),  # pass the column names, not the dict itself
        dtype={"Luc": "str"}
    ) for file in desired_files
]

In [4]:
# Concat selected digest files into a single frame.
# ignore_index=True gives the result a fresh, unique index instead of
# repeating each file's own 0..n-1 row labels.
digest_full = pd.concat(desired_files_dfs, ignore_index=True)
# Luc is compared as text below, so make sure it is str
digest_full['Luc'] = digest_full['Luc'].astype('str')
# Select for LUC = 101 (SFH)
digest_full = digest_full[digest_full['Luc'] == '101']

In [5]:
# With only LUC 101 rows left, de-duplicate, drop incomplete records,
# cast every column to its target dtype and finally rename.

# Final column names applied once cleaning is done
rename = {
    "Taxyr": "TAXYR",
    "Parid": "PARID",
    "Cityname.1": "own_cityname",
    "Zip1": "own_zip",
    "Sfla": "sqft_living",
    "D Yrblt": "yr_built",
    "Rmbed": "beds",
    "Fixbath": "baths",
    "Calcacres": "acres",
    "Heat": "heat",
}

# A record is dropped if any of these columns is null
drop_nulls_cols = [
    "Taxyr", "Parid", "Situs Adrno", "Situs Adrstr", "Luc", "D Yrblt",
    "Sfla", "Rmbed", "Fixbath", "Calcacres", "Heat", "Extwall",
    "Style", "Rmtot", "D Grade", "Bsmt", "Owner Adrstr"
]

# Step 1: remove rows that are exact copies of another row
init_len = len(digest_full)
digest_full = digest_full.drop_duplicates()
print(f"Init len: {init_len}")
print(f"Number of dropped complete duplicates: {init_len - len(digest_full)}")
print(f"Final len: {len(digest_full)}")

# Step 2: remove rows with a null in any important column
init_len = len(digest_full)
digest_full = digest_full.dropna(subset=drop_nulls_cols)
print(f"Number of dropped nulls from important columns: {init_len - len(digest_full)}")

# Step 3: report remaining nulls per column, fill them and set the dtypes
digest_full = digest_full.assign(**{
    column: clean_and_cast_column(digest_full[column], digest_cols)
    for column in digest_full.columns
})

# Step 4: switch to the final column names
digest_full = digest_full.rename(columns=rename)

Init len: 3681749
Number of dropped complete duplicates: 896087
Final len: 2785662
Number of dropped nulls from important columns: 19690
Number of nulls in column Taxyr: 0
Number of nulls in column Parid: 0
Number of nulls in column Situs Adrno: 0
Number of nulls in column Situs Adrdir: 2763914
Number of nulls in column Situs Adrstr: 0
Number of nulls in column Situs Adrsuf: 188543
Number of nulls in column Cityname: 4745
Number of nulls in column Luc: 0
Number of nulls in column Calcacres: 0
Number of nulls in column Own1: 0
Number of nulls in column Own2: 2315696
Number of nulls in column Owner Adrno: 63862
Number of nulls in column Owner Adradd: 2761024
Number of nulls in column Owner Adrdir: 2704437
Number of nulls in column Owner Adrstr: 0
Number of nulls in column Owner Adrsuf: 209443
Number of nulls in column Cityname.1: 12
Number of nulls in column Statecode: 256
Number of nulls in column Zip1: 841
Number of nulls in column Aprtot: 0
Number of nulls in column Extwall: 0
Number 

In [6]:
# Sanity check: the number of parcel records per tax year should move
# smoothly from one year to the next; a sudden jump hints at a data issue
parcels_per_year = digest_full.groupby("TAXYR")["PARID"].count()
parcels_per_year

TAXYR
2010    207544
2011    207578
2012    207757
2013    208987
2014    209878
2015    211153
2016    201392
2017    214320
2018    215890
2019    217756
2020    219842
2021    221480
2022    222395
Name: PARID, dtype: int64

Note: Roughly 10K parcels drop out between 2015 and 2016 (211,153 → 201,392) before the count recovers in 2017. This is potentially a flaw in the Fulton source data rather than in our own processing.

## Create a File with all Fulton County Sales for all Years

In [7]:
# Filename fragment that identifies the sales exports
keywords = ["STANDARDS SALES"]

# Columns to load from each sales file, mapped to the dtype that
# clean_and_cast_column applies later
sale_cols = {
    "Taxyr": "int",
    "Parid": "string",
    "Saledt": "string",
    "Luc": "string",
    "SALES PRICE": "float",
    "FAIR MARKET VALUE": "float",
    "DEED TYPE": "string",
    "Saleval": "string",
    "Costval": "string",
    "GRANTOR": "string",
    "GRANTEE": "string",
}

# Build a real list rather than a one-shot filter() iterator so the selection
# can be inspected or reused after the files have been read
desired_files = [file for file in fulton_files if contains_keyword(file, keywords)]

# The sales files are tab-separated latin-1 text; the last line of each file
# is skipped (skipfooter=1) and quote characters are treated as plain text.
desired_files_dfs = [
    pd.read_csv(
        os.path.join(FULTON_DIR, file),
        sep='\t',
        encoding='latin-1',
        usecols=list(sale_cols),  # pass the column names, not the dict itself
        quoting=csv.QUOTE_NONE,
        skipfooter=1,
        # skipfooter is not supported by the default C parser; naming the
        # python engine avoids a ParserWarning and silent fallback per file
        engine="python",
        on_bad_lines="warn"
    ) for file in desired_files
]

  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(
  pd.read_csv(


In [8]:
# Concat selected sales files into a single frame.
# ignore_index=True gives the result a fresh, unique index instead of
# repeating each file's own 0..n-1 row labels.
sales_full = pd.concat(desired_files_dfs, ignore_index=True)
# Luc is not forced to text when the sales files are read, so it may have been
# parsed as a number. Normalise to str, trim whitespace and drop a trailing
# ".0" so that a float-parsed 101.0 still compares equal to '101'.
sales_full['Luc'] = (
    sales_full['Luc']
    .astype('str')
    .str.strip()
    .str.replace(r"\.0$", "", regex=True)
)
# Select for LUC = 101 (SFH)
sales_full = sales_full[sales_full['Luc'] == '101']

In [9]:
# De-duplicate the sales records, report how many remain,
# cast every column to its target dtype and finally rename.

# Final column names applied once cleaning is done
rename = {
    "Taxyr": "TAXYR",
    "Parid": "PARID",
}

# Step 1: remove rows that are exact copies of another row
init_len = len(sales_full)
sales_full = sales_full.drop_duplicates()
print(f"Init len: {init_len}")
print(f"Number of dropped complete duplicates: {init_len - len(sales_full)}")
print(f"Final len: {len(sales_full)}")

# Step 2: report nulls per column, fill them and set the dtypes
sales_full = sales_full.assign(**{
    column: clean_and_cast_column(sales_full[column], sale_cols)
    for column in sales_full.columns
})

# Step 3: switch to the final column names
sales_full = sales_full.rename(columns=rename)

Init len: 275814
Number of dropped complete duplicates: 461
Final len: 275353
Number of nulls in column Taxyr: 0
Number of nulls in column Parid: 0
Number of nulls in column Luc: 0
Number of nulls in column Saledt: 0
Number of nulls in column SALES PRICE: 19
Number of nulls in column FAIR MARKET VALUE: 0
Number of nulls in column DEED TYPE: 2
Number of nulls in column Costval: 0
Number of nulls in column Saleval: 118
Number of nulls in column GRANTOR: 18
Number of nulls in column GRANTEE: 8


In [10]:
# Sanity check: number of sale records per tax year
sales_per_year = sales_full.groupby("TAXYR")["PARID"].count()
sales_per_year

TAXYR
2011    26777
2012    22684
2013    26074
2014    11711
2015    11256
2016    14978
2017    13210
2018    28281
2019    29134
2020    29570
2021    28645
2022    33033
Name: PARID, dtype: int64

In [11]:
# Second sanity check: summary statistics of the numeric columns
# for each tax year
sales_by_year = sales_full.groupby("TAXYR")
sales_by_year.describe()

Unnamed: 0_level_0,SALES PRICE,SALES PRICE,SALES PRICE,SALES PRICE,SALES PRICE,SALES PRICE,SALES PRICE,SALES PRICE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE,FAIR MARKET VALUE
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
TAXYR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2011,26777.0,162950.94,630129.89,0.0,0.0,56050.0,189900.0,51718108.0,26777.0,210525.7,292278.99,0.0,42100.0,115300.0,276200.0,8750000.0
2012,22684.0,164550.67,424239.44,0.0,0.0,49500.0,200515.0,20500000.0,22684.0,241854.35,325947.29,1500.0,56600.0,142800.0,314800.0,13698600.0
2013,26074.0,166081.78,440832.36,0.0,0.0,47000.0,212000.0,17038094.0,26074.0,238072.86,323868.49,0.0,39500.0,142225.0,325900.0,6322800.0
2014,11711.0,331640.54,351925.01,0.0,79000.0,255000.0,455000.0,6350000.0,11711.0,294653.6,320690.06,100.0,63900.0,226960.0,402150.0,6189700.0
2015,11256.0,357182.81,392959.23,0.0,101425.0,275000.0,475000.0,13914719.0,11256.0,314046.58,324030.38,1390.0,74887.5,244970.0,428770.0,6151600.0
2016,14978.0,348266.13,401923.49,0.0,85000.0,258325.0,474500.0,17455046.0,14978.0,346104.72,368631.02,0.0,91500.0,257500.0,470000.0,6000000.0
2017,13210.0,386344.64,386724.4,0.0,139000.0,294000.0,517500.0,7200000.0,13210.0,306637.0,320307.43,0.0,75725.0,225900.0,425000.0,6000000.0
2018,28281.0,268587.37,931256.97,0.0,1.0,108000.0,360000.0,135635876.0,28281.0,307217.92,359431.09,0.0,75800.0,198500.0,418000.0,7150000.0
2019,29134.0,522038.36,2227323.74,0.0,1.0,137000.0,379900.0,40120000.0,29134.0,328591.65,367677.75,100.0,107200.0,207000.0,427975.0,7861300.0
2020,29570.0,303400.4,1085088.76,0.0,1.0,155000.0,379000.0,58800000.0,29570.0,364300.06,393939.77,0.0,133500.0,243900.0,467500.0,11350000.0


## Geocode Digest and Sales Data

In [12]:
# Read in parcel boundary data from Fulton County 2022 record
geo_parcels = gpd.read_file("./data/fulton_parcels.geojson")
geo_parcels = geo_parcels[["ParcelID", "OBJECTID", "geometry"]]
geo_parcels = geo_parcels.rename(columns={"ParcelID": "PARID"})
# Ensure no duplicates for merging: a repeated PARID would multiply rows when
# the digest and sales frames are merged on PARID. Fail loudly, not just print.
n_dup_parcels = len(geo_parcels[geo_parcels.duplicated(subset=["PARID"])])
print(n_dup_parcels)
assert n_dup_parcels == 0, f"{n_dup_parcels} duplicated PARID values in parcel boundaries"

# Read in neighborhood boundaries and stats from Neighborhood Nexus
geo_atl_nsa = gpd.read_file("./data/atl_nsa.geojson")
geo_atl_nsa = geo_atl_nsa.rename(columns={"NEIGHBORHO": "neighborhood", "geometry": "nsa_boundary"})
geo_atl_nsa = geo_atl_nsa[["neighborhood", "nsa_boundary"]]
# Ensure no duplicates for merging
n_dup_nsa = len(geo_atl_nsa[geo_atl_nsa.duplicated(subset=["neighborhood"])])
print(n_dup_nsa)
assert n_dup_nsa == 0, f"{n_dup_nsa} duplicated neighborhood names in NSA boundaries"

# Validate identical CRS for spatial join.
# The NSA geometry column was renamed above, so it has to be registered as
# the active geometry again before the frame is used spatially.
geo_parcels = geo_parcels.set_geometry("geometry")
geo_atl_nsa = geo_atl_nsa.set_geometry("nsa_boundary")
crs_match = geo_parcels.crs == geo_atl_nsa.crs
print(f"Validate CRS: {crs_match}")
# A spatial join across different coordinate systems gives meaningless
# matches, so stop here instead of only printing False
assert crs_match, "Parcel and neighborhood layers use different CRS; reproject before joining"

0
0
Validate CRS: True


In [13]:
# Attach to each parcel the neighborhood whose boundary fully contains it.
# how="left" keeps parcels that fall inside no neighborhood boundary; their
# `neighborhood` value is left null.
# `predicate` replaces the `op` keyword, which geopandas has deprecated.
geo_parcels = gpd.sjoin(geo_parcels, geo_atl_nsa, how="left", predicate="within")

  if await self.run_code(code, result, async_=asy):


In [14]:
# Merge parcel boundaries and neighborhoods with digest and sales data.
# This is for record keeping only (e.g. how many records can't be matched);
# the actual merging is done in the analysis file due to data format issues.
digest_geo = pd.merge(digest_full, geo_parcels, on="PARID", how="inner")
sales_geo = pd.merge(sales_full, geo_parcels, on="PARID", how="inner")

# An inner merge drops records whose PARID has no parcel boundary, so the
# difference in length is reported as the number of unmatched records
print(f"Num of records not geo matched in digest: {len(digest_full) - len(digest_geo)}")
print(f"Num of records not geo matched in sales: {len(sales_full) - len(sales_geo)}")

Num of records not geo matched in digest: 6556
Num of records not geo matched in sales: 852


In [15]:
# Parcels and sales in ATL: keep only records that were assigned
# a neighborhood
digest_atl = digest_geo.dropna(subset=["neighborhood"])
sales_atl = sales_geo.dropna(subset=["neighborhood"])
print(f"Number of parcels in ATL: {len(digest_atl)}")
print(f"Number of sales in ATL: {len(sales_atl)}")

Number of parcels in ATL: 1002694
Number of sales in ATL: 110896


## Save Data

In [16]:
# All outputs go into this directory; create it first so the writes below do
# not fail on a fresh checkout where it does not exist yet
OUTPUT_PATH = 'output/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Save parcel geometry data
geo_parcels.to_csv(OUTPUT_PATH + 'parcels_geo.csv', index=False)

# Save parcel data without geometry data so Parquet format works
digest_full.to_parquet(OUTPUT_PATH + 'digest_full.parquet', index=False)
sales_full.to_parquet(OUTPUT_PATH + 'sales_full.parquet', index=False)