In [1]:
#the purpose of this script is to assess properties in cook county and determine what factors lead to
#higher property tax values

import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
import json
import os
import sys
import numpy as np
import statsmodels.api as sm


#set the API URL to use
#url = "http://www.cookcountyassessor.com/Search/Property-Search.aspx"
url_a = 'https://datacatalog.cookcountyil.gov/resource/uzyt-m557.json?tax_year=2024&$offset='  #URL for assessed values
url_p = 'https://datacatalog.cookcountyil.gov/resource/x54s-btds.json?tax_year=2024&$offset='  #URL for property characteristics

#the API keys are saved as environment variables for security
key = os.environ['API_CC_KEY'] 
secret = os.environ['API_CC_SECRET']

#Designate a filepath to save results to as an option to save time
output_path = "C:\\Users\\matth\\OneDrive\\Documents\\" #os.environ['output_path']

#Set a variable to indicate if we want to pull from the API directly vs. pull from a saved csv file pulled previously, which is done to save time
use_API = False

#pd.options.display.max_colwidth = 400
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=sys.maxsize)



In [2]:
#the purpose of this function is to pull data from the API in batches and return the final data set in a dataframe

def api_to_df(url, key, secret):

    #Establish a connection to the API
    #this code paginates from the API to pull 1000 records at a time.

    #set the authentification variables
    basic = HTTPBasicAuth(key, secret)

    offset = 0

    #keep looping, we will break the loop at the bottom
    while offset == 0 or len(d) == 1000:

        #authenticate again to url, use the offset as needed
        res = requests.get(url + str(offset), auth=basic)

        #save the content to a variable
        d = res.json()

        #on the first iteration of the loop just set the total equal to d, otherwise append it
        if offset == 0:
            d_all = d
        else:
            d_all = d_all + d

        #reset offset to the length of the response
        offset = len(d_all) + 1

        #for testing
        print(len(d), len(d_all), offset)

        #break the loop here if we have an offset less than 1000
        if len(d_all) < 1000:
            print('here')
            break


    return pd.json_normalize(d_all)


In [4]:
#This section is pulling the data, either directly from the API endpoint or through the CSV that has been cached. It takes about an hour to get each data set from the API
#(over 1M records for just one year of data) so that is why pulling from the csv is helpful. Note that dropping to a csv and re-uploading converts string values to numeric

#Pull from the API
if use_API:
    
    #pull the assessment data and property attribute data
    df_a = api_to_df(url_a,key,secret)
    df_p = api_to_df(url_p,key,secret)

    #putting this data in a local csv saves time (takes about an hour to pull just one year of assessment data), but it converts several data types to numeric upon reingestion, including the pin number.
    df_a.to_csv(output_path + "AssessValues2024.csv", index=False)
    df_p.to_csv(output_path + "PropAtrb2024.csv", index=False)
                
else:
    #if we have already pulled from the API and want to save time, pull the same data from the csv which was cached from the same API call previously
    df_a = pd.read_csv(output_path + "AssessValues2024.csv")
    df_p = pd.read_csv(output_path + "PropAtrb2024.csv", low_memory=False)


In [None]:
#Merge the two data sets and augment the data with some additional calculated fields
df_all = df_a.merge(df_p, how='left', left_on='pin',right_on='pin')

#Add some fields that will help us
df_all['price_sqft_bldg'] = df_all['mailed_tot'] / df_all['char_bldg_sf']
df_all['price_sqft_land'] = df_all['mailed_tot'] / df_all['char_land_sf']

#check the data we have
df_all.head()

Unnamed: 0,pin,tax_year,class_x,township_code_x,township_name,neighborhood_code,mailed_bldg,mailed_land,mailed_tot,certified_bldg,certified_land,certified_tot,year,card,class_y,township_code_y,tieback_proration_rate,card_proration_rate,cdu,pin_is_multicard,pin_num_cards,pin_is_multiland,pin_num_landlines,char_yrblt,char_bldg_sf,char_land_sf,char_beds,char_rooms,char_fbath,char_hbath,char_frpl,char_type_resd,char_cnst_qlty,char_apts,char_attic_fnsh,char_gar1_att,char_gar1_area,char_gar1_size,char_gar1_cnst,char_attic_type,char_bsmt,char_ext_wall,char_heat,char_repair_cnd,char_bsmt_fin,char_roof_cnst,char_use,char_site,char_ncu,char_renovation,recent_renovation,char_porch,char_air,char_tp_plan,tieback_key_pin,price_sqft_bldg,price_sqft_land
0,1011230250000,2024,206,10,Barrington,10012,50141,11484,61625,50141.0,11484.0,61625.0,2024.0,1.0,206,10.0,1.0,0.0,AV,False,1.0,False,1.0,1958.0,3404.0,11484.0,4.0,9.0,2.0,0.0,1.0,2 Story,Average,,,Yes,No,3 cars,Frame,,Partial,Stucco,Warm Air Furnace,Average,Unfinished,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,No,False,,Central A/C,Stock Plan,,18.103702,5.366162
1,1012020420000,2024,203,10,Barrington,10012,24556,8040,32596,24556.0,8040.0,32596.0,2024.0,1.0,203,10.0,1.0,0.0,AV,False,1.0,False,1.0,1957.0,1466.0,8040.0,3.0,5.0,1.0,1.0,2.0,1 Story,Average,,,No,No,2 cars,Frame,,Full,Frame + Masonry,Hot Water Steam,Average,Formal Rec Room,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,No,False,,Central A/C,Stock Plan,,22.234652,4.054229
2,1012040220000,2024,206,10,Barrington,10012,47064,17123,64187,47064.0,17123.0,64187.0,2024.0,1.0,206,10.0,1.0,0.0,AV,False,1.0,False,1.0,1920.0,2323.0,17123.0,4.0,9.0,2.0,1.0,1.0,2 Story,Average,,,No,No,3 cars,Frame,,Full,Frame,Hot Water Steam,Average,Unfinished,Other,Single-Family,Not Relevant To Value,0.0,No,False,,Central A/C,Stock Plan,,27.63108,3.748584
3,1012050140000,2024,278,10,Barrington,10012,31087,8913,40000,31087.0,8913.0,40000.0,2024.0,1.0,278,10.0,1.0,0.0,AV,False,1.0,False,1.0,1977.0,2243.0,8913.0,4.0,8.0,2.0,1.0,1.0,2 Story,Average,,,Yes,No,2 cars,Frame,,Partial,Frame,Warm Air Furnace,Average,Unfinished,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,No,False,,Central A/C,Stock Plan,,17.833259,4.487827
4,1012070210000,2024,203,10,Barrington,10012,20222,10778,31000,20222.0,10778.0,31000.0,2024.0,1.0,203,10.0,1.0,0.0,AV,False,1.0,False,1.0,1956.0,1325.0,10778.0,3.0,6.0,1.0,1.0,0.0,1 Story,Average,,,No,No,0 cars,,,Full,Masonry,Warm Air Furnace,Average,Formal Rec Room,Shingle + Asphalt,Single-Family,Not Relevant To Value,0.0,No,False,,No Central A/C,Stock Plan,,23.396226,2.876229


In [8]:
#run a regression with some select fields to determine what factors contribute most to the assessed building value

#dependent variable: mailed_tot
#independent variables: char_bldg_sf, char_land_sf, char_beds, char_rooms

#set the dependent variable
Y = df_all[['mailed_tot']]

#set the independent variable
X = df_all[['char_bldg_sf','char_land_sf','char_yrblt']]

#*************want to add variables for the neighborhood or class codes

#make sure all the columns are converted to a float datatype
X = X.astype(float)

#replace all the nulls with 0's so the model can run correctly (check on this to make sure this is not skewing things)
X = X.fillna(0)

# Add a constant to the independent variables matrix (for the intercept)
X = sm.add_constant(X)

# Fit the multiple linear regression model
model = sm.OLS(Y, X).fit()

# Print the model summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             mailed_tot   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1218.
Date:                Sun, 23 Feb 2025   Prob (F-statistic):               0.00
Time:                        22:58:54   Log-Likelihood:            -2.7580e+07
No. Observations:             1872055   AIC:                         5.516e+07
Df Residuals:                 1872051   BIC:                         5.516e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         6.805e+04    690.051     98.617   

In [None]:
#in conclusion based on these results, ignoring the abysmally low R-squared score for now, if we look at
#attributes that have the most influence over assessed value the building square footage has more influence than
#the land square footage as evidenced by the higher coefficient. The year built has a negative coefficient
#which also makes sense, indicating that a house built earlier generally has a lower price associated with it. This could
#also be influenced by property tax policies like assessed value freezes.