# US_Census_Data [Harris County]

In [150]:
import sys
import os
import pandas as pd
import networkx as nx
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
import re 
import plotly.express as px

### Functions

In [151]:
def create_df(file_path):
    """Read the CSV located at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def transposed_df(df):
    """Return ``df`` transposed: its columns become rows and vice versa."""
    return df.transpose()

def extract_index(df):
    """
    Replaces every index label of ``df`` with the first number found in it.
    A "number" is a run of digits, optionally followed by a dot and more digits.
    Labels containing no digits become the empty string.
    Parameters:
    df (DataFrame): Frame whose index labels are strings. Its index is modified in place.
    Returns:
    DataFrame: The same ``df`` object, with the rewritten index.
    """
    number_pattern = re.compile(r'\d+\.?\d*')
    labels = []
    for label in df.index:
        found = number_pattern.search(label)
        labels.append(found.group(0) if found else '')
    df.index = labels
    return df

def transform_index(index):
    """
    Normalizes string tract labels by removing the decimal point.
    Labels that contain a '.' have every '.' removed ('4305.01' -> '430501');
    labels without one get '00' appended ('2101' -> '210100').
    Parameters:
    index (Index): Index of string labels.
    Returns:
    Index: A new index holding the rewritten labels.
    """
    def _drop_decimal_point(label):
        if '.' in label:
            return label.replace('.', '')
        return label + '00'

    return index.map(_drop_decimal_point)


# Reference tract IDs, stored as 6-character strings so they can be compared
# with the string indices produced by transform_index().
# Presumably the 25 tracts whose out-degree changed the most — TODO confirm
# where this ranking comes from.
# NOTE(review): this list contains "233702", while the integer list hard-coded
# in tract_matching_t() contains 411400 in its place — confirm which is right.
Top_Tracts = ["430500", "552200", "251300", "554902", "522401", 
              "340400", "554001", "251402", "521900", "430800",
              "323802", "250500", "324000", "420100", "454600", 
              "430400", "342700", "252700", "542302", "342001",
              "342300", "322200", "222200", "551800", "233702"]

# Presumably the 25 tracts whose out-degree changed the least — TODO confirm.
# Holds the same IDs, in the same order, as the integer list hard-coded in
# tract_matching_b().
Bottom_Tracts = ["342400", "431600", "254100", "534001", "532900", 
                 "421900", "332800", "550800", "430300", "532700", 
                 "343200", "532400", "251501", "251600", "412600", 
                 "221900", "430900", "251401", "553403", "430600",
                 "430700", "343000", "250701", "450700", "412800"]

def tract_matching_t(top_25_tracts):
    """
    Reports which of the given tracts appear in the hard-coded "top" reference list.
    Prints "Matched tracts" followed by the matches in ascending order, or
    "No matching tracts" when there are none.
    Parameters:
    top_25_tracts (iterable): Tract IDs to check. Integers are compared as-is;
        all-digit strings such as "430500" are converted to integers first, so
        string IDs are no longer silently ignored.
    Returns:
    set: The tract IDs from ``top_25_tracts`` that are in the reference list.
    """
    # NOTE(review): the module-level Top_Tracts list has "233702" where this
    # list has 411400 — confirm which one is correct.
    compare = [430500, 552200, 554902, 251300, 420100,
               521900, 340400, 554001, 522401, 430800,
               251402, 324000, 323802, 250500, 342001,
               430400, 342700, 454600, 342300, 252700,
               542302, 551800, 322200, 222200, 411400]

    # The reference list holds integers, so a string ID would never compare
    # equal to any entry; convert digit-only strings before intersecting.
    candidates = [
        int(tract) if isinstance(tract, str) and tract.isdecimal() else tract
        for tract in top_25_tracts
    ]

    matched_tracts = set(compare).intersection(candidates)
    if matched_tracts:
        print("Matched tracts")
        # Sorted so the report does not depend on set iteration order.
        for tract in sorted(matched_tracts):
            print(tract)
    else:
        print("No matching tracts")
    return matched_tracts

def tract_matching_b(bottom_25_tracts):
    """
    Reports which of the given tracts appear in the hard-coded "bottom" reference list.
    Prints "Matched tracts" followed by the matches in ascending order, or
    "No matching tracts" when there are none.
    Parameters:
    bottom_25_tracts (iterable): Tract IDs to check. Integers are compared as-is;
        all-digit strings such as "342400" are converted to integers first, so
        string IDs are no longer silently ignored.
    Returns:
    set: The tract IDs from ``bottom_25_tracts`` that are in the reference list.
    """
    compare = [342400, 431600, 254100, 534001, 532900,
               421900, 332800, 550800, 430300, 532700,
               343200, 532400, 251501, 251600, 412600,
               221900, 430900, 251401, 553403, 430600,
               430700, 343000, 250701, 450700, 412800]

    # The reference list holds integers, so a string ID would never compare
    # equal to any entry; convert digit-only strings before intersecting.
    candidates = [
        int(tract) if isinstance(tract, str) and tract.isdecimal() else tract
        for tract in bottom_25_tracts
    ]

    matched_tracts = set(compare).intersection(candidates)
    if matched_tracts:
        print("Matched tracts")
        # Sorted so the report does not depend on set iteration order.
        for tract in sorted(matched_tracts):
            print(tract)
    else:
        print("No matching tracts")
    return matched_tracts

def get_sparse_matrix(df, tracts, poi_name='School'): #row index = home, col index = target
    """
    Builds the weighted, directed home-tract -> POI-tract matrix for one DataFrame.
    Entry [i, j] holds the 'Visitor_Count' of the row whose home tract is
    ``tracts[i]`` and whose POI tract is ``tracts[j]``. If several rows share
    the same pair, the last one wins.
    Parameters:
    df (DataFrame): Must contain the columns 'Home_TRACTCE',
        '<poi_name>_TRACTCE' and 'Visitor_Count'.
    tracts (sequence): Tract IDs; a tract's position in this sequence is its
        row/column number in the matrix.
    poi_name (str): Prefix of the POI tract column. Defaults to 'School', the
        column name that was previously hard-coded.
    Returns:
    csr_matrix: A float matrix of shape (len(tracts), len(tracts)).
    Raises:
    IndexError: If a row refers to a tract that is not in ``tracts``.
    """
    # One dictionary lookup per row replaces a full scan of `tracts`.
    # setdefault keeps the first position of a repeated ID, which is what
    # np.where(...)[0][0] returned.
    position = {}
    for pos, tract in enumerate(tracts):
        position.setdefault(tract, pos)

    num_tracts = len(tracts)
    directed_matrix = np.zeros((num_tracts, num_tracts), dtype=float)
    target_column = poi_name + '_TRACTCE'
    rows = zip(df['Home_TRACTCE'], df[target_column], df['Visitor_Count'])
    for home, target, count in rows:
        try:
            home_index = position[home]
            target_index = position[target]
        except KeyError as err:
            # Same exception type as before, but naming the offending tract.
            raise IndexError(
                f"tract {err.args[0]!r} is not present in `tracts`"
            ) from err
        directed_matrix[home_index, target_index] = count
    return csr_matrix(directed_matrix)

def get_weighted_adjacency_matrices(list_dfs, tracts):
    """
    Builds one sparse adjacency matrix per DataFrame.
    Parameters:
    list_dfs (list): DataFrames to convert, each passed to get_sparse_matrix().
    tracts (sequence): Tract IDs shared by every matrix.
    Returns:
    list: The matrices, in the same order as ``list_dfs``.
    """
    return [get_sparse_matrix(frame, tracts) for frame in list_dfs]

def get_node_degrees_in(list_mats):
    """
    Computes the column sums (axis=0) of every matrix.
    Parameters:
    list_mats (list): Matrices that all have the same number of columns.
    Returns:
    ndarray: One row per matrix, one column per matrix column.
    """
    column_totals = [np.asarray(mat.sum(axis=0)).ravel() for mat in list_mats]
    return np.array(column_totals)

def get_node_degrees_out(list_mats):
    """
    Computes the row sums (axis=1) of every matrix.
    Parameters:
    list_mats (list): Matrices that all have the same number of rows.
    Returns:
    ndarray: One row per matrix, one column per matrix row.
    """
    row_totals = [np.asarray(mat.sum(axis=1)).ravel() for mat in list_mats]
    return np.array(row_totals)

def filter_file(df, index_name, df_name=None, valid_tracts=None):

    """
    Transposes the given DataFrame and keeps only the rows whose label contains the specified substring.
    The kept labels are reduced to their first number by extract_index(), rewritten without the decimal
    point by transform_index(), and rows whose resulting label is not a known tract are dropped.
    Parameters:
    df (DataFrame): The original DataFrame; its columns become the candidate rows.
    index_name (str): The substring to match within the transposed DataFrame's index labels.
    df_name: Ignored. Kept (now optional) only so existing three-argument calls keep working;
        the function always builds its own result.
    valid_tracts (collection of str): Tract labels to retain. Defaults to the module-level ``tracts_set``.
    Returns:
    DataFrame: The matching rows, indexed by the rewritten tract labels.
    """

    t_df = transposed_df(df)

    # Select all matching rows with one boolean mask instead of growing a
    # frame with pd.concat once per row. Non-string labels are skipped.
    keep = [isinstance(label, str) and index_name in label for label in t_df.index]
    # .copy() so that rewriting the index below cannot touch t_df.
    filtered = t_df.loc[keep].copy()

    filtered = extract_index(filtered)
    filtered.index = transform_index(filtered.index)

    if valid_tracts is None:
        valid_tracts = tracts_set
    return filtered[filtered.index.isin(valid_tracts)]

def px_scatter_plot(df, x_column, y_column, x_label, y_label, title):
    """
    Draws, shows and returns a Plotly Express scatter plot of two columns.
    Parameters:
    df (DataFrame): Data to plot.
    x_column (str): Column plotted on the x axis.
    y_column (str): Column plotted on the y axis.
    x_label (str): Axis label used for ``x_column``.
    y_label (str): Axis label used for ``y_column``.
    title (str): Figure title.
    Returns:
    The figure created by ``px.scatter``.
    """
    axis_labels = {x_column: x_label, y_column: y_label}
    fig = px.scatter(df, x=x_column, y=y_column, labels=axis_labels)
    fig.update_layout(title=title)
    fig.show()
    return fig

### Setting up path to files and creating Dataframes

In [152]:
BASE_FOLDER = '/Users/alishakhan/Desktop/Research/POIs'
CENSUS_path = '/Users/alishakhan/Desktop/Research/Census Files/Harris'

sys.path.append(BASE_FOLDER)
poi_name = 'School'
folder_path = f'{BASE_FOLDER}/{poi_name}'

# Load every CSV in the POI folder, in filename order
# (presumably one file per week — TODO confirm).
list_dfs = []
files = sorted(os.listdir(folder_path))
for file in files:
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file))
    list_dfs.append(df)

# The first cell of each file is kept as that file's date.
dates = [frame.iloc[0, 0] for frame in list_dfs]

# Trips that start and end in the same tract get a fixed distance of .02.
for df in list_dfs:
    same_tract = df[poi_name + '_TRACTCE'] == df['Home_TRACTCE']
    df.loc[same_tract, 'Distance_Covered (km)'] = .02

df_combined = pd.concat(list_dfs)

# Every tract that appears as a home or as a POI tract, sorted and de-duplicated.
tracts = np.unique(df_combined[['Home_TRACTCE', poi_name + '_TRACTCE']].values)
num_tracts = tracts.shape[0]
distances = np.unique(df_combined[['Distance_Covered (km)']].values)

# String form of the tract IDs, for matching against census-file labels.
tracts_set = {str(tract) for tract in tracts}

#Dataframe display settings
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

### Out-Degree Change for Storm Week (Week 8) [Schools]

In [153]:
# One adjacency matrix per loaded file, then per-file in/out degree of every tract.
schools_mats = get_weighted_adjacency_matrices(list_dfs, tracts)
degrees_in = get_node_degrees_in(schools_mats)
degrees_out = get_node_degrees_out(schools_mats)
# Mean out-degree of each tract across all loaded files.
weekly_means_out = np.mean(degrees_out, axis=0)

np.set_printoptions(threshold=np.inf)
# For each file, every tract's out-degree minus its mean over all files.
tract_change = [week - weekly_means_out for week in degrees_out]

# Index 7 is the eighth file, the "storm week" named in the plot titles.
e_df = pd.DataFrame({
    'Change': tract_change[7]
},index = tracts)

# Reuse the matrices built above; rebuilding them from list_dfs gave the same
# result at twice the cost.
mats = schools_mats
# NOTE(review): mats[1:4] covers the 2nd-4th files, while the plot labels call
# the baseline "1/4 - 2/7" — confirm the slice matches the intended weeks.
base_degrees = get_node_degrees_out(mats[1:4])
base_mean = np.mean(base_degrees, axis = 0)

# NOTE(review): the baseline is stored negated. Sorting by it ascending puts
# the largest baselines first, but the percent formula below also uses the
# negated value — confirm that is intended.
e_df['Base Degrees Mean'] = -base_mean

e_df = e_df.sort_values(by = 'Base Degrees Mean')

fig = px.scatter(e_df, x = e_df.index.astype(str), y = 'Change', labels={'x': 'Tract Index', 'Change': 'Change from Baseline (1/4 - 2/7)'})
fig.update_layout(title='Out-Degree Change for Storm Week (Week 8) [Schools]')
fig.show()

# NOTE(review): tracts whose baseline mean is 0 produce inf/NaN here.
percent_e = (e_df['Change'] - e_df['Base Degrees Mean']) / e_df['Base Degrees Mean']
e_df['Percent Change from Baseline'] = percent_e

fig2 = px.scatter(e_df, x = e_df.index.astype(str), y = 'Percent Change from Baseline' , labels={'x': 'Tract Index', 'Percent Change from Baseline': 'Percent Change from Baseline (1/4 - 2/7)'})
fig2.update_layout(title='Percent Out-Degree Change for Storm Week (Week 8) [Schools]')
fig2.show()

### Plotting Demographic Data vs. Change from Baseline  

#### Percent Non-White vs. Change from Baseline 

In [154]:
file = 'race_harris.csv'
file_path = CENSUS_path + '/' + file
df_h = create_df(file_path)
t_df = transposed_df(df_h)
t_df = extract_index(t_df)

# Use the shared helper instead of repeating its lambda inline.
t_df.index = transform_index(t_df.index)
# .copy() makes t_df an independent frame, so the column assignments below
# (and in the following cells) do not write into a filtered slice.
t_df = t_df[t_df.index.isin(tracts_set)].copy()
t_df.index = t_df.index.astype(str)
# Strip thousands separators; missing values become '0' so the column can be
# converted to numbers later.
t_df[1] = t_df[1].str.replace(',', '', regex=False)
t_df[1] = t_df[1].fillna('0')

In [155]:
# Column 1 was cleaned in the previous cell; make it numeric and rank by it.
t_df[1] = pd.to_numeric(t_df[1])
sorted_df = t_df.sort_values(by=1)

# The last 25 rows of the ascending sort hold the 25 largest values; their
# index labels are the tract IDs, converted to int for tract_matching_t.
top_25_white = sorted_df[1].tail(25).index.astype(int)
tract_matching_t(top_25_white)

Matched tracts
342001
251300
342300
554902


{251300, 342001, 342300, 554902}

In [156]:
# Strip thousands separators from column 2, make it numeric and rank by it.
t_df[2] = pd.to_numeric(t_df[2].str.replace(',', ''))
sorted_df = t_df.sort_values(by=2)

# Tract IDs (as int) of the 25 largest values in column 2.
top_25_AA = sorted_df[2].tail(25).index.astype(int)
tract_matching_t(top_25_AA)

No matching tracts


set()

In [157]:
# Strip thousands separators from column 3, make it numeric and rank by it.
t_df[3] = pd.to_numeric(t_df[3].str.replace(',', ''))
sorted_df = t_df.sort_values(by=3)

# Tract IDs (as int) of the 25 largest values in column 3.
top_25_AI = sorted_df[3].tail(25).index.astype(int)
tract_matching_t(top_25_AI)

No matching tracts


set()

In [158]:
# Strip thousands separators from column 4, make it numeric and rank by it.
t_df[4] = pd.to_numeric(t_df[4].str.replace(',', ''))
sorted_df = t_df.sort_values(by=4)

# Tract IDs (as int) of the 25 largest values in column 4.
top_25_asian = sorted_df[4].tail(25).index.astype(int)
tract_matching_t(top_25_asian)

Matched tracts
552200
430800


{430800, 552200}

In [159]:
# Strip thousands separators from column 5, make it numeric and rank by it.
t_df[5] = t_df[5].str.replace(',', '')
t_df[5] = pd.to_numeric(t_df[5])
sorted_df = t_df.sort_values(by = 5)
top_25_hawaiian = sorted_df[5].tail(25)
# Compare tract IDs (the index), not the column values: the Series itself was
# passed before, so its values were tested against the tract list. This now
# matches the other ranking cells.
top_25_hawaiian = top_25_hawaiian.index
top_25_hawaiian = top_25_hawaiian.astype(int)
tract_matching_t(top_25_hawaiian)

No matching tracts


set()

In [160]:
# Tracts with no data (NaN) for the white population column:
#980000     NaN
#980100     NaN
#324200     NaN
#340201     NaN

#### Income vs. Change from Baseline 

In [161]:
file = 'income_harris.csv'
file_path = CENSUS_path + '/' + file
df_income = create_df(file_path)

index_name = "Median income (dollars)!!Estimate"
# filter_file discards its third argument, so pass None instead of a name that
# does not exist yet (that raised NameError on a fresh kernel).
# .copy() so the column assignments below do not write into a filtered slice.
f_income = filter_file(df_income, index_name, None).copy()

# regex=False keeps every pattern literal on all pandas versions; as a regular
# expression, '250000+' would match the digits but leave the trailing '+'.
# NOTE(review): '-' is replaced with the fixed value 72353 wherever it occurs
# in a string — confirm where that number comes from.
f_income[1] = (
    f_income[1]
    .str.replace(',', '', regex=False)
    .str.replace('-', '72353', regex=False)
    .str.replace('250000+', '250000', regex=False)
)
f_income[1] = pd.to_numeric(f_income[1])
sorted_df = f_income.sort_values(by = 1)

e_df = e_df.sort_index()
# Attach values by tract ID rather than by row position, so a missing or
# reordered tract in the census file cannot shift values onto the wrong tract.
# Tracts absent from f_income get NaN. Assumes tract labels in f_income are unique.
e_df['Income'] = f_income[1].reindex(e_df.index.astype(str)).to_numpy()
e_df = e_df.sort_values(by = 'Base Degrees Mean')

fig1 = px_scatter_plot(e_df, 'Income', 'Change', 'Income', 'Change from Baseline (1/4 - 2/7)', 'Out-Degree Change vs Income for Storm Week (Week 8) [Schools]')
fig2 = px_scatter_plot(e_df, 'Income', 'Percent Change from Baseline', 'Income', 'Percent Change from Baseline (1/4 - 2/7)', 'Percent Out-Degree Change vs Income for Storm Week (Week 8) [Schools]')

#### Comparing the Income of the Top 25 Most Changing Tracts and the Bottom 25 Least Changing Tracts

In [162]:
# Restrict the income table to the two reference tract lists.
f_income_top = f_income.loc[f_income.index.isin(Top_Tracts)]
f_income_bottom = f_income.loc[f_income.index.isin(Bottom_Tracts)]

# Order each subset by column 1 (ascending) and keep only that column.
sorted_df_t = f_income_top.sort_values(by=1)
sorted_df_b = f_income_bottom.sort_values(by=1)
top_25_income = sorted_df_t[1]
bottom_25_income = sorted_df_b[1]

# Both figures share one y-axis range so they can be compared side by side.
fig = px.scatter(
    top_25_income,
    x=top_25_income.index,
    y=1,
    title='Income by Tract (Top)',
    labels={ '1' : 'Median Income' , 'index' : 'Tracts'},
)
fig.update_yaxes(range=[10000, 260000])
fig.show()

fig2 = px.scatter(
    bottom_25_income,
    x=bottom_25_income.index,
    y=1,
    title='Income by Tract (Bottom)',
    labels={ '1' : 'Median Income' , 'index' : 'Tracts'},
)
fig2.update_yaxes(range=[10000, 260000])
fig2.show()


#### Unemployment Rate vs. Change from Baseline

In [163]:
file = 'unemp.rate.csv'
file_path = CENSUS_path + '/' + file
df_UNemp = create_df(file_path)

index_name = "Total population!!Percent"
# filter_file discards its third argument, so pass None instead of a name that
# does not exist yet (that raised NameError on a fresh kernel).
# .copy() so the column assignments below do not write into a filtered slice.
f_UNemp = filter_file(df_UNemp, index_name, None).copy()

# Convert column 9 to numbers BEFORE adding placeholder rows: .str.replace
# returns NaN for non-string entries, which would erase numeric placeholders.
f_UNemp[9] = f_UNemp[9].str.replace('%', '', regex=False)
f_UNemp[9] = pd.to_numeric(f_UNemp[9])

#4 tracts are missing from this data {'340201', '454400', '980000', '980100'}
# Identify and concat the missing tracts
all_tracts = set(tracts_set)
missing_tracts = sorted(all_tracts - set(f_UNemp.index))
# The placeholder goes into column 9, the column read below. It used to go
# into a separate 'UNemp' column, leaving column 9 as NaN for these tracts,
# and the fill list was fixed at four entries.
# NOTE(review): 0 is a placeholder (Replace with Mean Values); these tracts
# now plot as 0 % where they used to be NaN — confirm that is wanted.
missing_df = pd.DataFrame(
    {9: [0] * len(missing_tracts)},
    index=pd.Index(missing_tracts, name='Tract'),
)
if missing_tracts:
    f_UNemp = pd.concat([f_UNemp, missing_df])

e_df = e_df.sort_index()
# Attach values by tract ID: the placeholder rows sit at the end of f_UNemp,
# so a positional assignment would shift values onto the wrong tracts.
# Assumes tract labels in f_UNemp are unique.
e_df['Unemployment Rate'] = f_UNemp[9].reindex(e_df.index.astype(str)).to_numpy()
e_df = e_df.sort_values(by = 'Base Degrees Mean')

fig1 = px_scatter_plot(e_df, 'Unemployment Rate', 'Change', 'Unemployment Rate (%)', 'Change from Baseline (1/4 - 2/7)', 'Out-Degree Change vs Unemployment Rate for Storm Week (Week 8) [Schools]')
fig2= px_scatter_plot(e_df, 'Unemployment Rate', 'Percent Change from Baseline', 'Unemployment Rate (%)', 'Percent Change from Baseline (1/4 - 2/7)', 'Percent Out-Degree Change vs Unemployment Rate for Storm Week (Week 8) [Schools]')

#### Comparing the Unemployment Rate of the Top 25 Most Changing Tracts and the Bottom 25 Least Changing Tracts

In [131]:
# Restrict the unemployment table to the two reference tract lists.
f_UNemp_top = f_UNemp.loc[f_UNemp.index.isin(Top_Tracts)]
f_UNemp_bottom = f_UNemp.loc[f_UNemp.index.isin(Bottom_Tracts)]

# Order each subset by column 9 (ascending) and keep only that column.
sorted_df_t = f_UNemp_top.sort_values(by=9)
sorted_df_b = f_UNemp_bottom.sort_values(by=9)
top_25_emp = sorted_df_t[9]
bottom_25_emp = sorted_df_b[9]

# Both figures share one y-axis range so they can be compared side by side.
fig = px.scatter(
    top_25_emp,
    x=top_25_emp.index,
    y=9,
    title='Unemployment Rate by Tract (Top 25)',
    labels={ '9' : 'Rate (%)' , 'index' : 'Tracts'},
)
fig.update_yaxes(range=[-10, 25])
fig.show()

fig2 = px.scatter(
    bottom_25_emp,
    x=bottom_25_emp.index,
    y=9,
    title='Unemployment Rate by Tract (Bottom 25)',
    labels={ '9' : 'Rate (%)' , 'index' : 'Tracts'},
)
fig2.update_yaxes(range=[-10, 25])
fig2.show()

#### Poverty Percentage vs. Change from Baseline

In [143]:
file = 'poverty.csv'
file_path = CENSUS_path + '/' + file
df_pov = create_df(file_path)

index_name = "!!Percent below poverty level!!Estimate"
# filter_file discards its third argument, so pass None instead of a name that
# does not exist yet (that raised NameError on a fresh kernel).
# .copy() so the column assignments below do not write into a filtered slice.
f_pov = filter_file(df_pov, index_name, None).copy()

# Remove '-' and '%' literally, then convert the column to numbers.
f_pov[0] = (
    f_pov[0]
    .str.replace('-', '', regex=False)
    .str.replace('%', '', regex=False)
)
f_pov[0] = pd.to_numeric(f_pov[0])

# The sort / assign / sort sequence appeared twice verbatim; once is enough.
e_df = e_df.sort_index()
# Attach values by tract ID rather than by row position, so a missing or
# reordered tract in the census file cannot shift values onto the wrong tract.
# Tracts absent from f_pov get NaN. Assumes tract labels in f_pov are unique.
e_df['Poverty Percentage'] = f_pov[0].reindex(e_df.index.astype(str)).to_numpy()
e_df = e_df.sort_values(by = 'Base Degrees Mean')

fig1 = px_scatter_plot(e_df, 'Poverty Percentage', 'Change', 'Poverty Percentage', 'Change from Baseline (1/4 - 2/7)', 'Out-Degree Change vs Poverty Percentage for Storm Week (Week 8) [Schools]')
fig2 = px_scatter_plot(e_df, 'Poverty Percentage', 'Percent Change from Baseline', 'Poverty Percentage', 'Percent Change from Baseline (1/4 - 2/7)', 'Percent Out-Degree Change vs Poverty Percentage for Storm Week (Week 8) [Schools]')


#### Education vs. Change from Baseline

In [147]:
file = 'education.csv'
file_path = CENSUS_path + '/' + file
df_edu = create_df(file_path)

index_name = "Texas!!Percent!!Estimate"
# filter_file discards its third argument, so pass None instead of a name that
# does not exist yet (that raised NameError on a fresh kernel).
# .copy() so the column assignments below do not write into a filtered slice.
f_edu = filter_file(df_edu, index_name, None).copy()

# Remove '-' and '%' literally, then convert the column to numbers.
f_edu[14] = (
    f_edu[14]
    .str.replace('-', '', regex=False)
    .str.replace('%', '', regex=False)
)
f_edu[14] = pd.to_numeric(f_edu[14])

e_df = e_df.sort_index()
# Attach values by tract ID rather than by row position, so a missing or
# reordered tract in the census file cannot shift values onto the wrong tract.
# Tracts absent from f_edu get NaN. Assumes tract labels in f_edu are unique.
e_df['HS or Higher'] = f_edu[14].reindex(e_df.index.astype(str)).to_numpy()
e_df = e_df.sort_values(by = 'Base Degrees Mean')

fig1 = px_scatter_plot(e_df, 'HS or Higher', 'Change', 'Highschool diploma or Higher Education', 'Change from Baseline (1/4 - 2/7)', 'Out-Degree Change vs Highschool diploma or Education (Week 8) [Schools]')
fig2 = px_scatter_plot(e_df, 'HS or Higher', 'Percent Change from Baseline', 'Highschool diploma or Higher Education', 'Percent Change from Baseline (1/4 - 2/7)', 'Percent Out-Degree Change vs Education for Storm Week (Week 8) [Schools]')


#### Population vs. Change from Baseline 

In [148]:
file = 'total_pop.csv'
file_path = CENSUS_path + '/' + file
df_pop = create_df(file_path)

index_name = "Texas!!Total population!!Estimate"
# filter_file discards its third argument, so pass None instead of a name that
# does not exist yet (that raised NameError on a fresh kernel).
# .copy() so the column assignments below do not write into a filtered slice.
f_pop = filter_file(df_pop, index_name, None).copy()

f_pop[0] = f_pop[0].str.replace(',', '', regex=False)
f_pop[0] = pd.to_numeric(f_pop[0])

# Identify and concat the missing tracts
# Rebuilt here so this cell does not depend on the unemployment cell having run.
all_tracts = set(tracts_set)
missing_tracts = sorted(all_tracts - set(f_pop.index))
# One placeholder per missing tract; the list used to be fixed at four entries.
# NOTE(review): 4397 is a hard-coded fill value, presumably the mean tract
# population — confirm.
missing_df = pd.DataFrame(
    {0: [4397] * len(missing_tracts)},
    index=pd.Index(missing_tracts, name='Tract'),
)
if missing_tracts:
    f_pop = pd.concat([f_pop, missing_df], axis = 0)

e_df = e_df.sort_index()
# Attach values by tract ID: the placeholder rows sit at the end of f_pop,
# so a positional assignment would shift values onto the wrong tracts.
# Assumes tract labels in f_pop are unique.
e_df['Population Total'] = f_pop[0].reindex(e_df.index.astype(str)).to_numpy()
e_df = e_df.sort_values(by = 'Base Degrees Mean')
mean_pop = np.mean(f_pop[0])

fig1 = px_scatter_plot(e_df, 'Population Total', 'Change', 'Population Total', 'Change from Baseline (1/4 - 2/7)', 'Out-Degree Change vs Population for Storm Week (Week 8) [Schools]')
fig2 = px_scatter_plot(e_df, 'Population Total', 'Percent Change from Baseline', 'Population Total', 'Percent Change from Baseline (1/4 - 2/7)', 'Percent Out-Degree Change vs Population for Storm Week (Week 8) [Schools]')
