In [None]:
#the purpose of this script is to assess properties in cook county and determine what factors lead to
#higher property tax values

import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
import json
import os
import sys
import numpy as np
import statsmodels.api as sm
import math


#set the API URL to use
#url = "http://www.cookcountyassessor.com/Search/Property-Search.aspx"
url_a = 'https://datacatalog.cookcountyil.gov/resource/uzyt-m557.json?tax_year=2024&$offset='  #URL for assessed values
url_p = 'https://datacatalog.cookcountyil.gov/resource/x54s-btds.json?tax_year=2024&$offset='  #URL for property characteristics

#the API keys are saved as environment variables for security
key = os.environ['API_CC_KEY'] 
secret = os.environ['API_CC_SECRET']

#Designate a filepath to save results to as an option to save time
output_path = os.environ['output_path']

#Set a variable to indicate if we want to pull from the API directly vs. pull from a saved csv file pulled previously, which is done to save time
use_API = True

#pd.options.display.max_colwidth = 400
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=sys.maxsize)



In [None]:
#the purpose of this function is to pull data from the API in batches and return the final data set in a dataframe
#by encapsulating the function here it allows us to isolate how the data is pulled and interact with it as a dataframe

def api_to_df(url, key, secret):

    #Establish a connection to the API
    #this code paginates from the API to pull 1000 records at a time.

    #set the authentification variables
    basic = HTTPBasicAuth(key, secret)

    offset = 0

    #keep looping, we will break the loop at the bottom
    while offset == 0 or len(d) == 1000:

        #authenticate again to url, use the offset as needed
        res = requests.get(url + str(offset), auth=basic)

        #save the json-formatted content to a variable
        d = res.json()

        #on the first iteration of the loop just set the total equal to d, otherwise append it
        if offset == 0:
            d_all = d
        else:
            d_all = d_all + d

        #reset offset to the length of the response
        offset = len(d_all) + 1

        #for testing
        #print(len(d), len(d_all), offset)

        #every 100,000 records let the user know where we are, that we are thinking about them, and it's still working
        if len(d_all)/100000 == math.floor(len(d_all)/100000):
            print("Still going, pulled {} records".format(len(d_all)))

        #break the loop here if we have an offset less than 1000
        if len(d_all) < 1000:
            #print('here')
            break

    #since we want all the fields, just return the dataframe formatted version of the entire json dataset
    return pd.json_normalize(d_all)


In [13]:
#This section is pulling the data, either directly from the API endpoint or through the CSV that has been cached. It takes about an hour to get each data set from the API
#(over 1M records for just one year of data) so that is why pulling from the csv is helpful. Note that dropping to a csv and re-uploading converts string values to numeric

#Pull from the API
if use_API:
    
    #pull the assessment data and property attribute data
    df_a = api_to_df(url_a,key,secret)
    df_p = api_to_df(url_p,key,secret)

    #putting this data in a local csv saves time (takes about an hour to pull just one year of assessment data), but it converts several data types to numeric upon reingestion, including the pin number.
    df_a.to_csv(output_path + "AssessValues2024.csv", index=False)
    df_p.to_csv(output_path + "PropAtrb2024.csv", index=False)
                
else:
    #if we have already pulled from the API and want to save time, pull the same data from the csv which was cached from the same API call previously
    df_a = pd.read_csv(output_path + "AssessValues2024.csv")
    df_p = pd.read_csv(output_path + "PropAtrb2024.csv", low_memory=False)


Still going, pulled 100000 records
Still going, pulled 200000 records
Still going, pulled 300000 records
Still going, pulled 400000 records
Still going, pulled 500000 records
Still going, pulled 600000 records
Still going, pulled 700000 records
Still going, pulled 800000 records
Still going, pulled 900000 records
Still going, pulled 1000000 records
Still going, pulled 1100000 records
Still going, pulled 1200000 records
Still going, pulled 1300000 records
Still going, pulled 1400000 records
Still going, pulled 1500000 records
Still going, pulled 1600000 records
Still going, pulled 1700000 records
Still going, pulled 1800000 records
Still going, pulled 100000 records
Still going, pulled 200000 records
Still going, pulled 300000 records
Still going, pulled 400000 records
Still going, pulled 500000 records
Still going, pulled 600000 records
Still going, pulled 700000 records
Still going, pulled 800000 records
Still going, pulled 900000 records
Still going, pulled 1000000 records
Still goin

In [None]:
#Merge the two data sets and augment the data with some additional calculated fields
df_all = df_a.merge(df_p, how='left', left_on='pin',right_on='pin')

#convert these fields to numeric so they can be divided out
df_all['char_bldg_sf'] = df_all['char_bldg_sf'].astype(float)
df_all['char_land_sf'] = df_all['char_land_sf'].astype(float)
df_all['mailed_tot'] = df_all['mailed_tot'].astype(float)

#Add some fields that will help us
df_all['price_sqft_bldg'] = df_all['mailed_tot'] / df_all['char_bldg_sf']
df_all['price_sqft_land'] = df_all['mailed_tot'] / df_all['char_land_sf']

#check the data we have
df_all.head()

Unnamed: 0,pin,year_x,class_x,township_code_x,township_name,nbhd,mailed_bldg,mailed_land,mailed_tot,certified_bldg,certified_land,certified_tot,board_bldg,board_land,board_tot,year_y,card,class_y,township_code_y,tieback_proration_rate,card_proration_rate,cdu,pin_is_multicard,pin_num_cards,pin_is_multiland,pin_num_landlines,char_yrblt,char_bldg_sf,char_land_sf,char_beds,char_rooms,char_fbath,char_hbath,char_frpl,char_type_resd,char_cnst_qlty,char_apts,char_attic_fnsh,char_gar1_att,char_gar1_area,char_gar1_size,char_gar1_cnst,char_attic_type,char_bsmt,char_ext_wall,char_heat,char_repair_cnd,char_bsmt_fin,char_roof_cnst,char_use,char_site,char_ncu,char_porch,char_air,char_tp_plan,tieback_key_pin,char_renovation,price_sqft_bldg,price_sqft_land
0,30293260680000,2024,234,37,Thornton,37181,19500,3500,23000.0,19500,3500,23000,,,,2024.0,1.0,234.0,37.0,1.0,0.0,AV,False,1.0,False,1.0,1996.0,1716.0,8235.0,3.0,7.0,1.0,1.0,0.0,Split Level,Average,,,No,No,2 cars,Frame,,Partial,Frame + Masonry,Warm Air Furnace,Average,Formal Rec Room,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,,Central A/C,Stock Plan,,,13.403263,2.792957
1,30294060520000,2024,203,37,Thornton,37181,9509,2391,11900.0,9509,2391,11900,,,,2024.0,1.0,203.0,37.0,1.0,0.0,AV,False,1.0,False,1.0,1942.0,1021.0,5625.0,3.0,5.0,1.0,0.0,0.0,1.5 Story,Average,,Living Area,No,No,2.5 cars,Frame,Partial,Full,Frame + Masonry,Warm Air Furnace,Average,Unfinished,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,,Central A/C,Stock Plan,,,11.65524,2.115556
2,30301000220000,2024,522,37,Thornton,37101,23293,3736,27029.0,23293,3736,27029,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,30301130010000,2024,100,37,Thornton,37101,0,1553,1553.0,0,1553,1553,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,30302040400000,2024,202,37,Thornton,37102,6850,2650,9500.0,6850,2650,9500,,,,2024.0,1.0,202.0,37.0,1.0,0.0,AV,False,1.0,False,1.0,1953.0,931.0,6625.0,3.0,5.0,1.0,0.0,0.0,1 Story,Average,,,No,No,2 cars,Frame,,Slab,Frame + Masonry,Warm Air Furnace,Average,Unfinished,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,,No Central A/C,Stock Plan,,,10.204082,1.433962


In [17]:
#run a regression with some select fields to determine what factors contribute most to the assessed building value

#dependent variable: mailed_tot
#independent variables: char_bldg_sf, char_land_sf, char_beds, char_rooms

#set the dependent variable
Y = df_all[['mailed_tot']]

#set the independent variable
X = df_all[['char_bldg_sf','char_land_sf','char_yrblt']]

#*************want to add variables for the neighborhood or class codes

#make sure all the columns are converted to a float datatype
X = X.astype(float)

#replace all the nulls with 0's so the model can run correctly (check on this to make sure this is not skewing things)
X = X.fillna(0)

# Add a constant to the independent variables matrix (for the intercept)
X = sm.add_constant(X)

# Fit the multiple linear regression model
model = sm.OLS(Y, X).fit()

# Print the model summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             mailed_tot   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1219.
Date:                Mon, 24 Feb 2025   Prob (F-statistic):               0.00
Time:                        11:24:26   Log-Likelihood:            -2.7580e+07
No. Observations:             1872049   AIC:                         5.516e+07
Df Residuals:                 1872045   BIC:                         5.516e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         6.805e+04    690.045     98.615   

In [None]:
#in conclusion based on these results, ignoring the abysmally low R-squared score for now, if we look at
#attributes that have the most influence over assessed value the building square footage has more influence than
#the land square footage as evidenced by the higher coefficient. The year built has a negative coefficient
#which also makes sense, indicating that a house built earlier generally has a lower price associated with it. This could
#also be influenced by property tax policies like assessed value freezes.