In [73]:
import json
import re

import pandas as pd

In [74]:
# Load the ground truth we already have. The first and third CSV columns form
# the two-level index, and a bare "-" cell is read as missing (in addition to
# the pandas default NA markers).
existing_gt = pd.read_csv(
    "../zoning/prompting/ground_truth.csv",
    index_col=[0, 2],
    na_values=["-"],
)

In [75]:
# Display the ground-truth table that was just loaded.
existing_gt

Unnamed: 0_level_0,Unnamed: 1_level_0,district_page,district,min_lot_size_page_gt,min_unit_size_page_gt,min_lot_size_gt,min_unit_size_gt
town,district_abb,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bristol,R-40,31,Residential R-40,48,,40000,
cheshire,R-20A,19,Residential R-20A,91,,20000,
east-haddam,C/B/IG,24,Commercial/Business/Light Industrial,51/56,51/56,21780,1000
ellington,DMF,10,Designed Multifamily,24,26,"217800, 435600","700, 1000, 1200"
hebron,PRD,10,Planned Residential Development,38,39,348480,750
morris,CB,9,Commercial CB,44,44,40000,"1000, 700, 600"
newington,R-12,"7, 8",Residential R-12,48,,12000,
south-windsor,AA-30,"14, 15",AA-30 Limited Residential,26,,30000,
southington,CB,8,Central Business,71,,30000,
warren,North,14,North,16,,87120,


In [76]:
# Read the "Town Data" sheet of the atlas workbook, using its second and third
# columns as a two-level index.
df = pd.read_excel(
    "../data/zoning_atlas_data_2022.xlsx",
    sheet_name="Town Data",
    index_col=[1, 2],
)

In [77]:
def _normalize_index_key(key):
    """Return the (town, district) index key in normalized form.

    The town is stripped, lower-cased, and every run of whitespace, hyphens,
    or slashes is collapsed into a single hyphen. The district is only
    stripped of surrounding whitespace.
    """
    town, district = key
    town_slug = re.sub(r"[\s\-/]+", "-", town.strip().lower())
    return (town_slug, district.strip())


df = df.set_index(df.index.map(_normalize_index_key))

In [78]:
# Remove every (town, district) row that already appears in existing_gt.
df = df.drop(existing_gt.index, axis="index")

In [79]:
# One acre is 4,840 square yards and one square yard is 9 square feet,
# so this evaluates to 43,560 square feet per acre.
ACRE_TO_SQ_FT = 9 * 4840

In [80]:
lot_size_acres_raw = df["1-Family Min. Lot (ACRES)"]
# True where the source cell is empty.
missing_min_lot_size_gt = lot_size_acres_raw.isna()
# Numeric lot size converted from acres to square feet; values that cannot be
# parsed as a number become NaN.
min_lot_size_gt = pd.to_numeric(lot_size_acres_raw, errors="coerce") * ACRE_TO_SQ_FT

In [81]:
unit_size_sf_raw = df["1-Family Min. Unit Size (SF)"]
# True where the source cell is empty.
missing_min_unit_size_gt = unit_size_sf_raw.isna()
# Numeric unit size; values that cannot be parsed as a number become NaN.
min_unit_size_gt = pd.to_numeric(unit_size_sf_raw, errors="coerce")

The code below finds the rows of the dataframe whose 1-Family Min. Lot size or
Min. Unit Size is a special case: the value is present but is not a simple
number because it includes some sort of qualifier. We're interested in these
cases because they will be more challenging to determine correctly. These
samples will also be harder to evaluate, especially automatically, because any
divergence in wording will generally be identified as incorrect.

In [82]:
# A value is a "special case" when the cell is filled in but could not be
# parsed as a number (so the coerced series is NaN while the raw cell is not).
lot_size_is_special = min_lot_size_gt.isna() & ~missing_min_lot_size_gt
unit_size_is_special = min_unit_size_gt.isna() & ~missing_min_unit_size_gt
df_special_case_lot_size = df[lot_size_is_special | unit_size_is_special]
# Show a few random special-case rows.
df_special_case_lot_size.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,State,Full District Name,County,NewCOG,Type of Zoning District,Is it an Overlay District?,1-Family,2-Family,3-Family,4+-Family,...,4-Family Max. Height (# of feet),4-Family Max. Lot Coverage IMPERVIOUS - BUILDINGS (%),4-Family Max. Lot Coverage IMPERVIOUS - BUILDINGS & PAVEMENT (%),Notes,Tooltip Notes,Key,Acres,% Town Area,MunicipalKey,MunicipalAcres
Jurisdiction,AbbreviatedDistrict,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
east-hartford,R-2,CT,Residential 2,Hartford,Capitol Planning Region,Primarily Residential,No,Allowed/Conditional,Prohibited,Prohibited,Prohibited,...,,,,,,,,,,
canton,R-1,CT,R-1 Residential,Hartford,Capitol Planning Region,Primarily Residential,No,Allowed/Conditional,Allowed/Conditional,Prohibited,Prohibited,...,,,,,,,,,,
columbia,LBR,CT,Columbia Lake-Overlay B on Residential Agricul...,Tolland,Capitol Planning Region,Primarily Residential,Yes,Allowed/Conditional,Prohibited,Prohibited,Prohibited,...,,,,,,,,,,
east-hartford,B-1,CT,Business 1,Hartford,Capitol Planning Region,Mixed with Residential,No,Prohibited,Allowed/Conditional,Allowed/Conditional,Special Permit,...,100.0,25.0,,,,,,,,
columbia,LAR,CT,Columbia Lake-Overlay A on Residential Agricul...,Tolland,Capitol Planning Region,Primarily Residential,Yes,Allowed/Conditional,Prohibited,Prohibited,Prohibited,...,,,,,,,,,,


In [83]:
# Target size of the final ground-truth set; the rows already in existing_gt
# count toward this total when the remaining samples are drawn.
n_samples = 233

In [84]:
# Number of rows to draw from the special-case pool.
n_special_case_samples = 33
# Whatever is left of the overall budget after the special cases.
n_normal_samples = n_samples - n_special_case_samples

In [85]:
# Randomly pick the special-case rows that go into the ground-truth set.
special_case_samples = df_special_case_lot_size.sample(n_special_case_samples)

For our normal samples, we want to make sure that they span the rest of the zoning documents.

In [86]:
# TODO: We have zoning codes for these towns but we never processed them! We should do that!
exempt_names = {"bethlehem", "eastford", "hartford", "north-stonington"}

# Every town name from the JSON list, minus the exempt ones above.
with open("../data/names_all_towns.json") as f:
    names_all_towns = set(json.load(f)).difference(exempt_names)

In [87]:
# Number of towns left after removing the exempt ones.
len(names_all_towns)

176

In [88]:
# Distinct towns represented among the special-case samples.
special_case_samples.reset_index().Jurisdiction.unique()

array(['colchester', 'stamford', 'haddam', 'suffield', 'canton',
       'clinton', 'lebanon', 'griswold-jewett-city', 'andover',
       'east-hartford', 'north-canaan', 'columbia', 'morris', 'thompson',
       'new-britain', 'marlborough', 'hamden', 'westbrook', 'durham',
       'north-haven', 'sprague'], dtype=object)

In [89]:
# Towns that still need a sample: everything not already covered by the
# existing ground truth or by the special-case samples.
towns_in_existing_gt = set(existing_gt.index.levels[0])
towns_in_special_cases = set(special_case_samples.reset_index().Jurisdiction.unique())
remaining_towns = names_all_towns - towns_in_existing_gt - towns_in_special_cases

In [90]:
# Number of towns that do not have a sample yet.
len(remaining_towns)

146

In [91]:
# Build the pool of rows that were not picked as special cases once, rather
# than re-dropping them from the full frame for every single town.
non_special_pool = df.drop(index=special_case_samples.index)
# Draw one random district from each town that has no sample yet.
samples = [non_special_pool.loc[(town, slice(None)), :].sample() for town in remaining_towns]
sample_df = pd.concat((special_case_samples, *samples))
# Recompute which towns are still uncovered now that the per-town samples are in.
remaining_towns = names_all_towns - (set(existing_gt.index.levels[0]) | set(sample_df.reset_index().Jurisdiction.unique()))


In [92]:
# Display the special-case samples followed by the one-per-town samples.
sample_df

Unnamed: 0_level_0,Unnamed: 1_level_0,State,Full District Name,County,NewCOG,Type of Zoning District,Is it an Overlay District?,1-Family,2-Family,3-Family,4+-Family,...,4-Family Max. Height (# of feet),4-Family Max. Lot Coverage IMPERVIOUS - BUILDINGS (%),4-Family Max. Lot Coverage IMPERVIOUS - BUILDINGS & PAVEMENT (%),Notes,Tooltip Notes,Key,Acres,% Town Area,MunicipalKey,MunicipalAcres
Jurisdiction,AbbreviatedDistrict,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
colchester,SZD,CT,Suburban Use,New London,Southeastern Connecticut Planning Region,Mixed with Residential,No,Allowed/Conditional,Allowed/Conditional,Special Permit,Special Permit,...,35,,,,,,,,,
stamford,R-D,CT,Designed Residence,Fairfield,Western Connecticut Planning Region,Primarily Residential,Yes,Special Permit,Special Permit,Special Permit,Prohibited,...,,,,,,,,,,
haddam,R-1,CT,Residential R-1,Middlesex,Lower Connecticut River Valley Planning Region,Primarily Residential,No,Allowed/Conditional,Allowed/Conditional,Prohibited,Prohibited,...,,,,,,,,,,
suffield,R-90,CT,R-90 Single Family Residential,Hartford,Capitol Planning Region,Primarily Residential,No,Allowed/Conditional,Special Permit,Prohibited,Prohibited,...,,,,,,,,,,
canton,R-3,CT,R-3 Residential,Hartford,Capitol Planning Region,Primarily Residential,No,Allowed/Conditional,Allowed/Conditional,Prohibited,Prohibited,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
west-haven,PVD,CT,Planned Village,New Haven,South Central Connecticut Planning Region,Mixed with Residential,Yes,Allowed/Conditional,Allowed/Conditional,Allowed/Conditional,Allowed/Conditional,...,45.75,50,70,,,West Haven--PVD,,,West Haven--PVD,
willington,DC,CT,Designed Commercial,Tolland,Capitol Planning Region,No Residential,No,Prohibited,Prohibited,Prohibited,Prohibited,...,,,,,,Willington--DC,226.409666,1.062677,Willington--DC,224.6
east-granby,PRD,CT,Planned Residential,Hartford,Capitol Planning Region,Primarily Residential,No,Allowed/Conditional,Prohibited,Prohibited,Prohibited,...,,,,,,,,,,
redding,SB,CT,Service Business,Fairfield,Western Connecticut Planning Region,Mixed with Residential,No,Allowed/Conditional,Prohibited,Prohibited,Prohibited,...,,,,ADU expires 36 months from receiving certifica...,ADU permit must be renewed every 3 years.,,,,,


Now that we've got at least one sample from every town, randomly sample more rows until we reach the target number of samples.

In [93]:
# Top up to n_samples rows in total (the rows in existing_gt count toward the
# total). Draw only from rows that are not already in sample_df; sampling from
# the full frame could pick a district a second time and duplicate it in the
# ground truth.
n_extra_samples = n_samples - len(existing_gt) - len(sample_df)
unsampled_rows = df.drop(index=sample_df.index)
sample_df = pd.concat((sample_df, unsampled_rows.sample(n=n_extra_samples)))

In [95]:
# Source column -> ground-truth column name; only these columns are kept.
column_renames = {
    "Full District Name": "district",
    "1-Family Min. Lot (ACRES)": "min_lot_size_acres_gt_raw",
    "1-Family Min. Unit Size (SF)": "min_unit_size_sf_gt_raw",
}
# Source index level -> ground-truth index level name.
index_renames = {
    "Jurisdiction": "town",
    "AbbreviatedDistrict": "district_abb",
}

sample_df = (
    sample_df.loc[:, list(column_renames)]
    .rename(columns=column_renames)
    .rename_axis(index=index_renames)
    .assign(
        # Parsed numeric values, aligned to the sampled rows by index.
        min_lot_size_gt=min_lot_size_gt,
        # Page-number columns start out empty.
        district_page=pd.NA,
        min_lot_size_page_gt=pd.NA,
        min_unit_size_gt=min_unit_size_gt,
        min_unit_size_page_gt=pd.NA,
    )
)

In [96]:
# Display the reshaped samples with the renamed and newly added columns.
sample_df

Unnamed: 0_level_0,Unnamed: 1_level_0,district,min_lot_size_acres_gt_raw,min_unit_size_sf_gt_raw,min_lot_size_gt,district_page,min_lot_size_page_gt,min_unit_size_gt,min_unit_size_page_gt
town,district_abb,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
colchester,SZD,Suburban Use,0.688705,1000 (one-story); 1200 (1.5 story); 1400 (two-...,30000.0,,,,
stamford,R-D,Designed Residence,"25 in RA-3, 15 in RA-2, 10 in RA-1, 8 in R-20 ...",,,,,,
haddam,R-1,Residential R-1,1,"500/studio, 700/1BR, 900/2BR",43560.0,,,,
suffield,R-90,R-90 Single Family Residential,2.066116,1000 (one-bed); 1250 (two-bed),90000.0,,,,
canton,R-3,R-3 Residential,2 (front); 3 (rear),,,,,,
...,...,...,...,...,...,...,...,...,...
redding,R-1/2,Suburban Residential,0.5,,21780.0,,,,
berlin,VDO,Village Overlay,,,,,,,
wallingford,RU-40,Rural Residential-40,1.84,,80150.4,,,,
stafford,B,General Residence B,0.92,1000,40075.2,,,1000.0,


In [97]:
# Stack the new samples on top of the existing ground-truth rows.
final_sample_df = pd.concat([sample_df, existing_gt], axis=0)

In [98]:
# Write the combined ground-truth table (index included) to CSV.
final_sample_df.to_csv("../data/ground_truth.csv")