In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, kstest
import geopandas
from shapely.geometry import Polygon, Point
import random
import matplotlib.pyplot as plt
import matplotlib
import random
import contextily as ctx
%matplotlib inline

from constants import(
    COUNTYDATAFILEPATH,
    GEODATAFILEPATH,
    COUNTYCODE
)

def get_random_point(polygon):
    """Return a random point located inside ``polygon``.

    Uses rejection sampling: candidate points are drawn uniformly from the
    geometry's bounding box until one of them falls inside the geometry.

    Args:
        polygon: A shapely geometry exposing ``is_empty``, ``area``,
            ``bounds`` and ``contains``.

    Returns:
        A shapely ``Point`` for which ``polygon.contains(point)`` is true.

    Raises:
        ValueError: If ``polygon`` is empty or has zero area. No candidate
            could ever be accepted for such a geometry, so the sampling
            loop would otherwise never terminate.
    """
    # Guard first: an empty geometry has no usable bounding box, and a
    # zero-area one can never contain a sampled point.
    if polygon.is_empty or polygon.area == 0:
        raise ValueError("cannot sample a point from an empty or zero-area geometry")

    min_x, min_y, max_x, max_y = polygon.bounds

    while True:
        candidate = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))

        # Keep only candidates that fall inside the geometry itself, not just
        # inside its bounding box.
        if polygon.contains(candidate):
            return candidate

# Maps each B19001_* variable code to a human-readable label: the tract's
# total household count first, then one label per income bracket. The codes
# are numbered consecutively from 001 to 017, so they are generated rather
# than spelled out.
households_variables_dict = dict(
    zip(
        ("B19001_{:03d}E".format(code_number) for code_number in range(1, 18)),
        (
            "total households in tract",
            "under 10k",
            "10k to 15k",
            "15k to 20k",
            "20k to 25k",
            "25k to 30k",
            "30k to 35k",
            "35k to 40k",
            "40k to 45k",
            "45k to 50k",
            "50k to 60k",
            "60k to 75k",
            "75k to 100k",
            "100k to 125k",
            "125k to 150k",
            "150k to 200k",
            "200k+",
        ),
    )
)
# Variable codes of interest, grouped by table prefix. Within each table the
# codes run consecutively from 001 up to the count listed next to the prefix,
# and every code carries the trailing "E" suffix.
households_key_list = tuple(
    "{}_{:03d}E".format(table_prefix, code_number)
    for table_prefix, code_count in (
        ("B19001", 17),
        ("B08201", 30),
        ("B08202", 22),
        ("B08203", 30),
        ("B19019", 8),
    )
    for code_number in range(1, code_count + 1)
)

# Readable labels: the tract total first, then the income brackets from
# lowest to highest. The middle "Xk to Yk" labels are generated from their
# (lower, upper) bounds in thousands.
household_values_list = tuple(
    ["total households in tract", "under 10k"]
    + [
        "{}k to {}k".format(lower_bound, upper_bound)
        for lower_bound, upper_bound in (
            (10, 15),
            (15, 20),
            (20, 25),
            (25, 30),
            (30, 35),
            (35, 40),
            (40, 45),
            (45, 50),
            (50, 60),
            (60, 75),
            (75, 100),
            (100, 125),
            (125, 150),
            (150, 200),
        )
    ]
    + ["200k+"]
)

# Read the tabular census data and the tract geometries from disk.
county_data = pd.read_csv(COUNTYDATAFILEPATH)
geodata = geopandas.read_file(GEODATAFILEPATH)

# Keep only the tracts of the county of interest and expose the tract code
# under the same column name the census table uses.
#
# The geometries are deliberately left in the CRS they were read in.
# ``GeoDataFrame.to_crs`` returns a new frame instead of reprojecting in
# place, so the bare ``county_geodata.to_crs(epsg=3857)`` call that used to
# sit here computed a reprojection and threw it away. Assigning its result
# would change the units of every area and coordinate downstream, where the
# sampled coordinates are written out as latitude/longitude, so the dead call
# is removed rather than "fixed".
county_geodata = geodata[geodata["COUNTYFP"] == COUNTYCODE]
county_geodata = county_geodata.rename(columns={"TRACTCE": "tract"})

# Both sides of the join need the tract code as an integer.
county_geodata["tract"] = county_geodata["tract"].astype(int)
county_data["tract"] = county_data["tract"].astype(int)

# Merge the geographical dataframe (containing shapely polygons) with the
# census data, then swap the B19001_* codes for readable column names.
data = pd.merge(county_geodata, county_data, on="tract", how="inner")
data = data.rename(columns=households_variables_dict)

# Empty table that the household generator fills in.
households = pd.DataFrame(columns=["id", "latitude", "longitude", "income"])

# Plot theme: matplotlib rcParams overrides shared by every figure drawn
# after this point.
theme = {
    # No background grid on the axes.
    "axes.grid": False,
    # Opaque white legend box with a drop shadow.
    "legend.framealpha": 1,
    "legend.facecolor": "white",
    "legend.shadow": True,
    # Font sizes for legend, tick labels, axis labels and titles.
    "legend.fontsize": 14,
    "legend.title_fontsize": 16,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "axes.labelsize": 16,
    "axes.titlesize": 20,
    # Figure resolution in dots per inch.
    "figure.dpi": 1000,
}

matplotlib.rcParams.update(theme)

In [5]:

"""
data = data.to_crs(epsg=3857)

# Plot the geometries
fig, ax = plt.subplots(figsize=(10, 10))

data['color'] = 'green'  # default color
#data.loc[(data['tract']>5000)&(data['tract']<6000), 'color'] = 'red'  # condition-based color

data.plot(ax=ax, alpha=0.15, edgecolor='k', linewidth=1, color=data["color"])
minx, miny, maxx, maxy = data.total_bounds
ax.set_axis_off()

ax.set_xlim(minx+13000, maxx-13000)
ax.set_ylim(miny+13000, maxy-13000)
ctx.add_basemap(ax, crs=data.crs.to_string(), source=ctx.providers.CartoDB.Voyager)



# Set aspect
ax.set_aspect('equal')

# Optionally set the x and y limits
# ax.set_xlim([xmin, xmax])
# ax.set_ylim([ymin, ymax])


plt.show()
"""

# Generate synthetic households for the selected tracts and write them to CSV.
#
# For each tract with a code strictly between 5000 and 6000 and a non-zero
# household count, a number of households is sampled. Each one gets a random
# location inside the tract geometry and an income bracket drawn with
# probability proportional to the tract's per-bracket household counts.

# Brackets a household can be assigned: every label except the leading
# "total households in tract" entry.
income_labels = household_values_list[1:]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame one ``.loc`` row at a time is very slow, and it
# left stale rows behind when this cell was re-run on an already filled frame.
household_records = []
total_count = 0
for index, row in data.iterrows():
    if not (5000 < row["tract"] < 6000):
        continue

    total_households = row["total households in tract"]
    if total_households == 0:
        continue

    # The tract geometry is already a shapely geometry, so it is used
    # directly instead of being re-wrapped in Polygon(...) for every sample.
    tract_shape = row["geometry"]

    # Households per unit area, in the units of the geometry's CRS.
    # NOTE(review): the coordinates are written out as latitude/longitude
    # below, so this area is presumably in square degrees - confirm.
    density = total_households / tract_shape.area
    print(density)

    # Per-bracket household counts, used as sampling weights.
    # Assumes the bracket columns sit contiguously, in order, between
    # "under 10k" and "200k+" in ``data`` - TODO confirm against the CSV.
    weights = row["under 10k":"200k+"].tolist()

    # NOTE(review): the sample count is 1e10 / density, so it shrinks as the
    # tract gets denser - confirm that this is the intended scaling.
    for _ in range(int(10000000000 / density)):
        location = get_random_point(tract_shape)
        income = random.choices(income_labels, weights, k=1)[0]
        household_records.append(
            {
                "id": total_count,
                "latitude": location.y,
                "longitude": location.x,
                "income": income,
            }
        )
        total_count += 1

households = pd.DataFrame(
    household_records, columns=["id", "latitude", "longitude", "income"]
)
households.to_csv('data/households.csv', index=False)

print("done")
print(households)


11582823.805056317
14427239.601523245
10076836.952341001
7830276.162796611
1775766.748840235
17236579.484636303
7383064.686853999
6221693.89879649
7824656.687933315
12700778.432760833
11768311.428789392
9515119.658333292
15246253.454329178
14702500.56544542
done
          id   latitude  longitude        income
0          0  39.941311 -82.964465    45k to 50k
1          1  39.940322 -82.968711    20k to 25k
2          2  39.941119 -82.972236     under 10k
3          3  39.940185 -82.967272    25k to 30k
4          4  39.937527 -82.968858    25k to 30k
...      ...        ...        ...           ...
18291  18291  39.930956 -82.986234    10k to 15k
18292  18292  39.930629 -82.997364  150k to 200k
18293  18293  39.931133 -82.992542    60k to 75k
18294  18294  39.930605 -82.992762  125k to 150k
18295  18295  39.933511 -82.988984     under 10k

[18296 rows x 4 columns]
