In [9]:
# Seattle Zipcodes from http://www.discoverseattle.net/zipcodes.php
zipcodes = ["98101", "98102", "98103", "98104", "98105", "98106", "98107", "98108", "98109", "98110", "98111", "98112", "98114", "98115", "98116", "98117", "98118", "98119", "98121", "98122", "98124", "98125", "98126", "98129", "98131", "98132", "98133", "98134", "98136", "98138", "98144", "98145", "98146", "98148", "98151", "98154", "98155", "98158", "98160", "98161", "98164", "98166", "98168", "98170", "98171", "98174", "98177", "98178", "98181", "98184", "98185", "98188", "98190", "98191", "98195", "98198", "98199"]

In [10]:
# Certain conditions mean there will not be an entry for a given zipcode
# in the irs datasets. If it is a PO box, or has fewer than a certain
# number of total returns then the zipcode is grouped into larger zipcode
# groups where zeroes at the end represent wildcards. For example
# if the zipcode 98195 had only 10 returns, it would be grouped into
# 98190 or 98100 until a certain minimum amount was reached.

In [11]:
# First we will want to load our datasets into a pandas dataframe
# and see which zipcodes exist in all datasets. It may become necessary
# to ignore the certin datasets if certain zipcodes are not present. 
# for example if earlier datasets do not have all the zipcodes for the
# neighborhoods I am most interested in such as Madison Park and 
# Ranier Beach.

import pandas as pd

In [35]:
# The first six years have the zipcode in the very first column.
seattle_1998 = pd.read_csv('../cleaned_wa_zipcodes/1998_wa_cleaned.csv')
seattle_2001 = pd.read_csv('../cleaned_wa_zipcodes/2001_wa_cleaned.csv')
seattle_2002 = pd.read_csv('../cleaned_wa_zipcodes/2002_wa_cleaned.csv')
seattle_2004 = pd.read_csv('../cleaned_wa_zipcodes/2004_wa_cleaned.csv')
seattle_2005 = pd.read_csv('../cleaned_wa_zipcodes/2005_wa_cleaned.csv')
seattle_2006 = pd.read_csv('../cleaned_wa_zipcodes/2006_wa_cleaned.csv')
# more recent years have their own zipcode column
seattle_2007 = pd.read_csv('../cleaned_wa_zipcodes/2007_wa_cleaned.csv')
seattle_2008 = pd.read_csv('../cleaned_wa_zipcodes/2008_wa_cleaned.csv')
seattle_2009 = pd.read_csv('../cleaned_wa_zipcodes/2009_wa_cleaned.csv')
seattle_2010 = pd.read_csv('../cleaned_wa_zipcodes/2010_wa_cleaned.csv')
seattle_2011 = pd.read_csv('../cleaned_wa_zipcodes/2011_wa_cleaned.csv')
seattle_2012 = pd.read_csv('../cleaned_wa_zipcodes/2012_wa_cleaned.csv')
seattle_2013 = pd.read_csv('../cleaned_wa_zipcodes/2013_wa_cleaned.csv')
seattle_2014 = pd.read_csv('../cleaned_wa_zipcodes/2014_wa_cleaned.csv')
seattle_2015 = pd.read_csv('../cleaned_wa_zipcodes/2015_wa_cleaned.csv')

In [42]:
# Works on files that have been cleaned to reflect
# that the zipcodes are stored in the same column as the AGI
def check_zipcodes_type1(dataframe):
    zipcodes_present = []
    for i in range(0, len(dataframe['adjusted_gross_income_size_and_zipcode'])):
        if dataframe['adjusted_gross_income_size_and_zipcode'][i] in zipcodes:
            zipcodes_present.append(dataframe['adjusted_gross_income_size_and_zipcode'][i])
    return zipcodes_present

# Works on files that have their own zipcode column.
def check_zipcodes_type2(dataframe):
    zipcodes_present = set();
    for i in range(0, len(dataframe['zipcode'])):
        if dataframe['zipcode'][i] in zipcodes:
            zipcodes_present.add(dataframe['zipcode'][i])
    return zipcodes_present

In [53]:
# older
zipcodes_in_1998 = check_zipcodes_type1(seattle_1998)
zipcodes_in_2001 = check_zipcodes_type1(seattle_2001)
zipcodes_in_2002 = check_zipcodes_type1(seattle_2002)
zipcodes_in_2004 = check_zipcodes_type1(seattle_2004)
zipcodes_in_2005 = check_zipcodes_type1(seattle_2005)
zipcodes_in_2006 = check_zipcodes_type1(seattle_2006)

# recent
zipcodes_in_2007 = check_zipcodes_type2(seattle_2007)
zipcodes_in_2008 = check_zipcodes_type2(seattle_2008)
zipcodes_in_2009 = check_zipcodes_type2(seattle_2009)
zipcodes_in_2010 = check_zipcodes_type2(seattle_2010)
zipcodes_in_2011 = check_zipcodes_type2(seattle_2011)
zipcodes_in_2012 = check_zipcodes_type2(seattle_2012)
zipcodes_in_2013 = check_zipcodes_type2(seattle_2013)
zipcodes_in_2014 = check_zipcodes_type2(seattle_2014)
zipcodes_in_2015 = check_zipcodes_type2(seattle_2015)

zipcodes_contained_all_years = [
    zipcodes_in_1998,
    zipcodes_in_2001,
    zipcodes_in_2002,
    zipcodes_in_2004,
    zipcodes_in_2005,
    zipcodes_in_2006,
    zipcodes_in_2007,
    zipcodes_in_2008,
    zipcodes_in_2009,
    zipcodes_in_2010,
    zipcodes_in_2011,
    zipcodes_in_2012,
    zipcodes_in_2013,
    zipcodes_in_2014,
    zipcodes_in_2015
]

In [49]:
# Getting smallest fully applicable zips for older datasets

print("length of 1998: " + str(len(zipcodes_in_1998)))
print("length of 2001: " + str(len(zipcodes_in_2001)))
print("length of 2002: " + str(len(zipcodes_in_2002)))
print("length of 2004: " + str(len(zipcodes_in_2004)))
print("length of 2005: " + str(len(zipcodes_in_2005)))
print("length of 2006: " + str(len(zipcodes_in_2006)))

# Now we check the zipcodes in the more recent years for their smallest most applicable range
print("length of 2007: " + str(len(zipcodes_in_2007)))
print("length of 2008: " + str(len(zipcodes_in_2008)))
print("length of 2009: " + str(len(zipcodes_in_2009)))
print("length of 2010: " + str(len(zipcodes_in_2010)))
print("length of 2011: " + str(len(zipcodes_in_2011)))
print("length of 2012: " + str(len(zipcodes_in_2012)))
print("length of 2013: " + str(len(zipcodes_in_2013)))
print("length of 2014: " + str(len(zipcodes_in_2014)))
print("length of 2015: " + str(len(zipcodes_in_2015)))

length of 1998: 44
length of 2001: 43
length of 2002: 42
length of 2004: 42
length of 2005: 42
length of 2006: 42
length of 2007: 43
length of 2008: 0
length of 2009: 35
length of 2010: 35
length of 2011: 35
length of 2012: 35
length of 2013: 35
length of 2014: 35
length of 2015: 35


In [52]:
print(zipcodes_in_2015)

{'98115', '98133', '98116', '98101', '98177', '98198', '98109', '98108', '98104', '98107', '98112', '98118', '98126', '98125', '98168', '98121', '98148', '98136', '98105', '98122', '98178', '98146', '98155', '98117', '98144', '98188', '98110', '98164', '98199', '98134', '98166', '98103', '98106', '98119', '98102'}


In [60]:
zipcodes_in_2015 = list(zipcodes_in_2015)
final_zips_data_1998_to_2015 = set()
for i in range(0, len(zipcodes_in_2015)):
    for this_year in zipcodes_contained_all_years:
        if zipcodes_in_2015[i] in list(this_year):
            final_zips_data_1998_to_2015.add(zipcodes_in_2015[i])

In [62]:
print(final_zips_data_1998_to_2015)
print("length of final zips: " + str(len(final_zips_data_1998_to_2015)))

{'98115', '98133', '98116', '98101', '98177', '98198', '98109', '98108', '98104', '98107', '98112', '98118', '98126', '98125', '98168', '98121', '98148', '98136', '98105', '98122', '98178', '98146', '98155', '98117', '98144', '98188', '98110', '98164', '98199', '98134', '98166', '98103', '98106', '98119', '98102'}
length of final zips: 35


In [63]:
# the final list of zips that we can track for 15 of the last 20 years is:
# ['98115', '98133', '98116', '98101', '98177', '98198', '98109', '98108', '98104', '98107', '98112', '98118', '98126', '98125', '98168', '98121', '98148', '98136', '98105', '98122', '98178', '98146', '98155', '98117', '98144', '98188', '98110', '98164', '98199', '98134', '98166', '98103', '98106', '98119', '98102']
fifteen_year_zipcodes = ['98115', '98133', '98116', '98101', '98177', '98198', '98109', '98108', '98104', '98107', '98112', '98118', '98126', '98125', '98168', '98121', '98148', '98136', '98105', '98122', '98178', '98146', '98155', '98117', '98144', '98188', '98110', '98164', '98199', '98134', '98166', '98103', '98106', '98119', '98102']

In [None]:
# There is a limited number of columns that are present in all datasets. In particular,
# 1998 and some of the earliest datasets have < 20 columns. Luckily the column that is present
# in all of them is the salaries and wages amount + returns.
# The plan is to get salaries_and_wages_amount / salaries_and_wages_returns for zipcode in
# the fifteen_year_zipcodes list, for all datasets. This should give some vague notion
# as to how incomes have changed for people submitting tax returns in given zipcdes
# over the years.
