# Income Analysis
In order to analyze the data of yearly income we use the "Income in the past 12 months" table.

The table is derived from the "American Community Survey" and it could be of two types: "ACS 5-Year Estimates Subject Tables" or "ACS 1-Year Estimates Subject Tables".
The problem is that these two types of tables cannot be freely compared: we can compare two "1-Year" tables, or two "5-Year" tables that do not overlap, but we cannot compare a "1-Year" table with a "5-Year" table.

For all the cities, the 2020 data is only available in the "5-Year" table.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
def get_incomes(df, city, year):
    '''
    function to get the incomes from the social data and plot it into a bar charts

    One group of four bars (household, family, married, nonfamily) is drawn
    for each of the ten income brackets, with the margins as error bars.

    Inputs:
        - df: pandas dataframe with '<type>_estimates' and '<type>_margins'
              columns; rows 1 to 10 are read as the ten income brackets
        - city: str, city name (used in the title)
        - year: str, year (used in the title)
    '''

    kinds = ['household', 'family', 'married', 'nonfamily']
    brackets = ['<10k', '10-15k', '15-25k', '25-35k', '35-50k', '50-75k', '75-100k', '100-150k', '150-200k', '>200k']
    rows = range(1, 11)
    bar_width = 0.2
    positions = np.arange(len(brackets))

    # plot the data
    plt.figure(figsize=(10, 5))
    for offset, kind in enumerate(kinds):

        # the values are collected one household type at a time, so that every
        # series holds the ten brackets of a single type
        # estimates drop their last character and margins their first and last
        # one before the conversion (presumably '%' and a leading sign - confirm
        # against the CSV)
        estimates = [float(df[kind + '_estimates'][i][:-1]) for i in rows]
        margins = [float(df[kind + '_margins'][i][1:-1]) for i in rows]

        x = positions + offset * bar_width
        plt.bar(x, estimates, width=bar_width, label=kind)
        plt.errorbar(x, estimates, yerr=margins, fmt='o', color='black', capsize=5)

    plt.xticks(positions, brackets)
    plt.legend()
    plt.title(f'{city} incomes in {year}')
    plt.show()

# Gender and sex analysis
In order to analyze the data of gender per age we use the "Age and Sex" table.

The table is derived from the "American Community Survey" and it could be of two types: "ACS 5-Year Estimates Subject Tables" or "ACS 1-Year Estimates Subject Tables".
The problem is that these two types of tables cannot be freely compared: we can compare two "1-Year" tables, or two "5-Year" tables that do not overlap, but we cannot compare a "1-Year" table with a "5-Year" table.

For all the cities, the 2020 data is only available in the "5-Year" table.

In [None]:
def get_gender(df, city, year):
    '''
    function to get the gender from the social data and plot it into a bar charts

    Inputs:
        - df: pandas dataframe with '<group>_estimates' and '<group>_margins'
              columns for the 'total', 'male' and 'female' groups; rows 2 to 19
              are read as the eighteen age classes
        - city: str, city name
        - year: str, year
    '''

    def to_int(value, skip_first=False):
        # strings may contain thousands separators; margins also carry one
        # leading character that is dropped (presumably the sign - confirm
        # against the CSV)
        if isinstance(value, str):
            if skip_first:
                value = value[1:]
            return int(value.replace(',', ''))
        # numeric cells (python or numpy scalars) are converted directly:
        # going through str() would produce '123.0', which int() rejects
        return int(value)

    groups = ['total', 'male', 'female']
    rows = range(2, 20)

    estimates = {
        group: [to_int(df[group + '_estimates'][i]) for i in rows]
        for group in groups
    }
    margins = {
        group: [to_int(df[group + '_margins'][i], skip_first=True) for i in rows]
        for group in groups
    }

    # plot the data
    ages = ['<5', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44',
            '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']
    positions = np.arange(18)
    plt.figure(figsize=(10, 5))
    for offset, group in enumerate(groups):
        x = positions + offset * 0.2
        plt.bar(x, estimates[group], width=0.2, label=group)
        plt.errorbar(x, estimates[group], yerr=margins[group], fmt='o', color='black', capsize=5)
    plt.xticks(positions, ages, rotation=45)
    plt.legend(loc='best')
    plt.title(f'{city} ages per gender in {year}')
    plt.ylabel('Number of people')
    plt.show()

# Race analysis
In order to analyze the race data we use the "Race" table.

The table is derived from the "American Community Survey" and it could be of two types: "ACS 5-Year Estimates Subject Tables" or "ACS 1-Year Estimates Subject Tables".
The problem is that these two types of tables cannot be freely compared: we can compare two "1-Year" tables, or two "5-Year" tables that do not overlap, but we cannot compare a "1-Year" table with a "5-Year" table.

For all the cities, the 2020 data is only available in the "5-Year" table.

In [None]:
def get_race(df, city, year):
    '''
    function to get the race from the social data and plot it into a bar charts

    Inputs:
        - df: pandas dataframe with 'estimate' and 'margin' columns; rows 1 to 7
              are read as the seven races, in the order of the labels below
        - city: str, city name
        - year: str, year
    '''

    def to_int(value, skip_first=False):
        # strings may contain thousands separators; margins also carry one
        # leading character that is dropped (presumably the sign - confirm
        # against the CSV)
        if isinstance(value, str):
            if skip_first:
                value = value[1:]
            return int(value.replace(',', ''))
        # numeric cells (python or numpy scalars) are converted directly:
        # going through str() would produce '123.0', which int() rejects
        return int(value)

    labels = ['White', 'Black or \n African American', 'Indian and \n Alaska Native',
            'Asian', 'Native Hawaiian', 'Some Other Race', 'Two or More Races']

    rows = range(1, 8)
    estimates = [to_int(df['estimate'][i]) for i in rows]
    margins = [to_int(df['margin'][i], skip_first=True) for i in rows]

    # plot the data
    fig, ax = plt.subplots(figsize=(10, 5))

    plt.barh(labels, estimates, color=get_colors(estimates))
    plt.errorbar(estimates, labels, xerr=margins, fmt='o', color='black', capsize=5)
    plt.xscale('log')
    plt.yticks(size=8)

    # write each value to the right of its error bar
    for bar, margin in zip(ax.patches, margins):
        plt.text(bar.get_width() + margin, bar.get_y() + bar.get_height() / 2, f'{bar.get_width()}', ha='left', va='center')
    plt.title(f'Races in {city} in {year}')
    plt.show()

## Analysis for each zip code

The data broken down by zip code is only available in the ACS 5-Year tables, so I used only the 2022 table.

Boston dataset does not have data for 02101, 02102, 02103, 02103, 02104, 02105, 02106, 02107, 02112, 02117, 02123, 02137, 02208, 02209, 02222 zip codes.

Chicago dataset does not have data for 60664, 60666, 60680, 60681, 60690, 60691, 60739 zip codes.

Columbus dataset does not have data for 43216, 43225, 43226, 43234 zip codes.

NYC dataset does not have data for 10008, 10015, 10041, 10045, 10048, 10055, 10060, 10090, 10095, 10098, 10099, from 10103 to 10107, from 10110 to 10112, 10115, 10118, from 10120 to 10123, from 10151 to 10155, 10158, 10161, from 10165 to 10168, from 10170 to 10177, 10199, 10270, 10271, 10278, 10279, 10311, 11002, 11005, 11006, 11009, from 11241 to 11243, 11252, 11256, 11351, 11359, 11371, 11424, 11425, 11430 zip codes.

Philadelphia dataset does not have data for 19019, 19101, 19105, from 19109 to 19113, 19155, 19160 zip codes.

San Francisco dataset does not have data for 94101, from 94140 to 94142, 94146, 94147, 94157, 94159, from 94164 to 94170, 94172, 94188 zip codes.

Washington dataset does not have data for 20013, 20022, 20023, 20026, 20027, 20029, 20030, 20033, 20035, from 20038 to 20045, 20046, 20047, from 20049 to 20051, 20053, from 20055 to 20098, all the zip codes starting with 202, all the zip codes starting with 203 except 20373, all the zip codes starting with 204 and all the zip codes starting with 205.

In [None]:
import pathlib
import urllib.request
import geopandas as gpd
import matplotlib.pyplot as plt
import sys
import os
import pandas as pd
from copy import deepcopy

# notebook-wide switches:
#   - categorical: read by plot_map, get_gender(df) and get_income(df) to choose
#     between per-class results and median values
#   - show: read by plot_map to print the summary counters
#   - debug: not read by any of the cells visible here
debug = True
categorical = True
show = True

states_filename = "tl_2017_us_state.zip"
states_url = f"https://www2.census.gov/geo/tiger/TIGER2017/STATE/{states_filename}"
states_file = pathlib.Path(states_filename)

zipcode_filename = "tl_2017_us_zcta510.zip"
zipcode_url = f"https://www2.census.gov/geo/tiger/TIGER2017/ZCTA5/{zipcode_filename}"
zipcode_file = pathlib.Path(zipcode_filename)

# download the archives that are not already in the working directory
for data_file, url in ((states_file, states_url), (zipcode_file, zipcode_url)):
    if data_file.is_file():
        continue

    # the archive is streamed in chunks to a temporary name and renamed only
    # once it is complete: an interrupted download must not leave behind a
    # truncated file that the next run would accept as already downloaded
    partial_file = data_file.with_name(data_file.name + '.part')
    with urllib.request.urlopen(url) as response, open(partial_file, 'wb') as f:
        while True:
            chunk = response.read(1024 * 1024)
            if not chunk:
                break
            f.write(chunk)
    partial_file.replace(data_file)

zipcode_gdf = gpd.read_file(f"zip://{zipcode_file}")
states_gdf = gpd.read_file(f"zip://{states_file}")

In [None]:
def plot_map(city_name, function, path):
    '''
    function to plot on the zip code map the result of an analysis for every zip code of a city

    It relies on names defined in the other cells: the zipcode_gdf geodataframe,
    the categorical and show switches and the single-argument
    get_income / get_gender / get_race functions.

    Inputs:
        - city_name: str, city name (used in the titles)
        - function: str, analysis to plot: 'income', 'gender' or 'race'
        - path: str, directory whose entries not ending with '.csv' are treated
                as zip code directories; '<function>.csv' is read from each one
    '''

    city = deepcopy(zipcode_gdf)

    for zip_code in os.listdir(path):

        # csv files stored next to the zip code directories are skipped
        if zip_code.endswith('.csv'):
            continue

        zip_path = os.path.join(path, zip_code)

        for file in os.listdir(zip_path):

            # only the file of the requested analysis is parsed
            if function == 'income' and file == 'income.csv':
                df = pd.read_csv(os.path.join(zip_path, file))
                household, married, nonfamily, family = get_income(df)
                if household is not None:
                    rows = city.ZCTA5CE10.str.startswith(zip_code)
                    city.loc[rows, 'household'] = household
                    city.loc[rows, 'married'] = married
                    city.loc[rows, 'nonfamily'] = nonfamily
                    city.loc[rows, 'family'] = family
            elif function == 'gender' and file == 'gender.csv':
                df = pd.read_csv(os.path.join(zip_path, file))
                maximum = get_gender(df)
                if maximum is not None:
                    city.loc[city.ZCTA5CE10.str.startswith(zip_code), 'result'] = maximum
            elif function == 'race' and file == 'race.csv':
                df = pd.read_csv(os.path.join(zip_path, file))
                maximum = get_race(df)
                if maximum is not None:
                    city.loc[city.ZCTA5CE10.str.startswith(zip_code), 'result'] = maximum

    if function == 'gender':

        # keep only the zip codes for which a result was found
        city = city[city['result'].notna()]

        if categorical:
            order = ['<5', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44',
                    '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']

            city.plot(column='result', cmap='viridis', legend=True, legend_kwds={'loc': 'center left', 'bbox_to_anchor': (1, 0.5)}, categories = order)

            if show:
                # number of zip codes for each age class
                counters = {age: 0 for age in order}
                for age in city['result']:
                    counters[age] += 1
                print(counters)

        else:
            city.plot(column='result', cmap='viridis', legend = True)

            if show:
                # mean of the values over the zip codes
                counter = 0
                for age in city['result']:
                    counter += age
                counter /= len(city['result'])
                print(counter)

        plt.xticks([], [])
        plt.yticks([], [])
        plt.title(f'Gender in {city_name} in 2022')

    elif function == 'income':

        # keep only the zip codes for which a result was found
        city = city[city['household'].notna()]

        types = ['household', 'married', 'nonfamily', 'family']

        if categorical:

            order = ['<10,000', '10,000-14,999', '15,000-24,999', '25,000-34,999', '35,000-49,999',
                    '50,000-74,999', '75,000-99,999', '100,000-149,999', '150,000-199,999', '200,000+']

            # one map for each household type
            for income_type in types:
                city.plot(column=income_type, cmap='viridis', categorical = True, categories = order, legend=True, legend_kwds={'loc': 'center left', 'bbox_to_anchor': (1.6, 1)})
                plt.xticks([], [])
                plt.yticks([], [])
                plt.title(income_type + ' income in ' + city_name + ' in 2022')

                # the counters follow the show switch like the other analyses
                # (they were guarded by a list that was never filled, so they
                # were never printed)
                if show:
                    counters = {income: 0 for income in order}
                    for income in city[income_type]:
                        counters[income] += 1
                    print(counters)
        else:

            fig, ax = plt.subplots(2, 2, figsize=(7, 7))

            for x in [0, 1]:
                for y in [0, 1]:
                    income_type = types[x + y * 2]
                    city.plot(column=income_type, cmap='viridis', ax=ax[x, y])
                    ax[x, y].set_xticks([], [])
                    ax[x, y].set_yticks([], [])
                    ax[x, y].set_title(income_type)
            plt.suptitle('Income in ' + city_name + ' in 2022')

            # a single colorbar, taken from the first map, for the four maps
            axs = ax.ravel()
            fig.colorbar(ax[0, 0].collections[0], ax=axs, shrink= 0.5)

            if show:
                # mean of the values over the zip codes for each household type
                counter = {
                    'household': 0,
                    'married': 0,
                    'nonfamily': 0,
                    'family': 0
                }
                for income_type in types:
                    for income in city[income_type]:
                        counter[income_type] += income
                    counter[income_type] /= len(city[income_type])
                print(counter)

    elif function == 'race':

        # keep only the zip codes for which a result was found
        city = city[city['result'].notna()]

        order = ['White', 'Black or \n African American', 'Indian and \n Alaska Native',
            'Asian', 'Native Hawaiian', 'Some Other Race', 'Two or More Races']

        city.plot(column='result', cmap='viridis', legend=True, legend_kwds={'loc': 'center left', 'bbox_to_anchor': (1, 0.5)}, categories = order)
        plt.xticks([], [])
        plt.yticks([], [])
        plt.title(f'Races in {city_name} in 2022')

        if show:
            # number of zip codes for each race
            counter = {race: 0 for race in order}
            for race in city['result']:
                counter[race] += 1
            print(counter)

    plt.show()

In [None]:
def get_gender(df, use_categories=None):
    '''
    function to get the most common age class or the median age from the social data

    Inputs:
        - df: pandas dataframe with 'label' and 'total_estimates' columns
        - use_categories: bool or None, True to return the most common age
                          class, False to return the median age; when None
                          (default) the module-level categorical switch is used

    Outputs:
        - the most common age class (str) or the median age (float);
          None when the needed rows are not found or the median is '-'
    '''

    if use_categories is None:
        use_categories = categorical

    if use_categories:
        ages = ['<5', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44',
                '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']

        # only the first 30 rows are scanned (fewer if the table is shorter)
        labels = df['label'].iloc[:30]
        totals = df['total_estimates'].iloc[:30]

        estimates = list()
        is_age = False
        for label, raw in zip(labels, totals):
            if label.endswith('Under 5 years'):
                is_age = True
            if is_age:
                if isinstance(raw, str):
                    estimates.append(int(raw.replace(',', '')))
                else:
                    # numeric cells are converted directly: going through
                    # str() would produce '123.0', which int() rejects
                    estimates.append(int(raw))
            if label.endswith('85 years and over'):
                break

        # no age row found: there is nothing to rank
        if not estimates:
            return None

        # return the age class with the maximum value
        return ages[estimates.index(max(estimates))]

    # only the first 40 rows are scanned (fewer if the table is shorter)
    labels = df['label'].iloc[:40]
    totals = df['total_estimates'].iloc[:40]
    for label, estimate in zip(labels, totals):
        if label.endswith('Median age (years)'):
            # '-' marks a missing value: keep looking in the following rows
            if not (isinstance(estimate, str) and estimate == '-'):
                return float(estimate)
    return None

In [None]:
def get_income(df, use_categories=None):
    '''
    function to get the most common income class or the median income from the social data

    Inputs:
        - df: pandas dataframe with a 'label' column and a '<type>_estimates'
              column for the household, married, nonfamily and family types
        - use_categories: bool or None, True to return the most common income
                          class, False to return the median income; when None
                          (default) the module-level categorical switch is used

    Outputs:
        - a tuple (household, married, nonfamily, family) with the most common
          income class (str) or the median income (int) of each type;
          (None, None, None, None) when the needed rows are not found or a
          value is '-'
    '''

    if use_categories is None:
        use_categories = categorical

    types = ['household', 'married', 'nonfamily', 'family']
    missing = (None, None, None, None)

    # only the first 16 rows are scanned (fewer if the table is shorter)
    labels = df['label'].iloc[:16]

    if use_categories:
        incomes = ['<10,000', '10,000-14,999', '15,000-24,999', '25,000-34,999', '35,000-49,999',
                '50,000-74,999', '75,000-99,999', '100,000-149,999', '150,000-199,999', '200,000+']

        estimates = {income_type: list() for income_type in types}
        is_income = False
        for position, label in enumerate(labels):
            if label.endswith('Less than $10,000'):
                is_income = True
            if is_income:
                for income_type in types:
                    value = df[income_type + '_estimates'].iloc[position]
                    if isinstance(value, str):
                        # '-' is treated as a missing value, as for the medians
                        if value == '-':
                            return missing
                        # the last character is dropped before the conversion
                        # (presumably '%' - confirm against the CSV)
                        value = value[:-1]
                    estimates[income_type].append(float(value))
            if label.endswith('$200,000 or more'):
                break

        # no income row found: there is nothing to rank
        if not estimates['household']:
            return missing

        # return the income class with the maximum value for each type
        return tuple(
            incomes[estimates[income_type].index(max(estimates[income_type]))]
            for income_type in types
        )

    for position, label in enumerate(labels):
        if not label.endswith('Median income (dollars)'):
            continue

        medians = list()
        for income_type in types:
            value = df[income_type + '_estimates'].iloc[position]
            if isinstance(value, str):
                if value == '-':
                    return missing
                # top-coded values carry a trailing '+' (e.g. '250,000+')
                if value.endswith('+'):
                    value = value[:-1]
                value = value.replace(',', '')
            medians.append(int(value))
        return tuple(medians)

    return missing

In [None]:
def get_race(df):
    '''
    function to get the most common race from the social data

    Input:
        - df: pandas dataframe with 'label' and 'total_estimates' columns

    Output:
        - the most common race (str), None when no race row is found
    '''

    races = ['White', 'Black or \n African American', 'Indian and \n Alaska Native',
        'Asian', 'Native Hawaiian', 'Some Other Race', 'Two or More Races']

    # only the first 10 rows are scanned (fewer if the table is shorter)
    labels = df['label'].iloc[:10]
    totals = df['total_estimates'].iloc[:10]

    estimates = list()
    is_race = False
    for label, raw in zip(labels, totals):
        if label.endswith('White alone'):
            is_race = True
        if is_race:
            if isinstance(raw, str):
                estimates.append(int(raw.replace(',', '')))
            else:
                # numeric cells are converted directly: going through str()
                # would produce '123.0', which int() rejects
                estimates.append(int(raw))
        if label.endswith('Two or More Races'):
            # last race row: nothing after it belongs to the list
            break

    # no race row found: there is nothing to rank
    if not estimates:
        return None

    # return the race with the maximum value
    return races[estimates.index(max(estimates))]