In [0]:
from pyspark.sql.functions import col, expr, aggregate, ceil, sum, map_values, lit, explode, desc, when, avg
import pyspark.sql.functions
import plotly.graph_objs as go
import math
import pandas as pd

In [0]:
# Select every column of places joined to patterns on placekey, restricted to
# rows whose top_category contains "Religious" and whose location_name equals
# the church name ignoring case (UPPER on both sides, no LIKE wildcards).
# The leading spaces inside each fragment keep the query text unchanged.
church_places_query = (
    "SELECT * "
    "        FROM safegraph.places "
    "        INNER JOIN safegraph.patterns ON safegraph.patterns.placekey=safegraph.places.placekey "
    "        WHERE top_category LIKE '%Religious%' AND "
    "        UPPER(location_name) LIKE UPPER('The Church of Jesus Christ of Latter day Saints')"
)
a = spark.sql(church_places_query)


Drop duplicate placekeys so that a location appearing multiple times is kept only once.

In [0]:
# Keep a single row for each placekey.
a = a.dropDuplicates(["placekey"])

Divides the raw visitor count by the raw visit count to obtain a visitors-per-visit ratio, then multiplies that ratio by the state-normalized visit count to produce a scalar `visitor_scaling` value.

In [0]:
# Ratio of unique visitors to total visits for each row.
visitors_per_visit = col("raw_visitor_counts") / col("raw_visit_counts")

# Scale the state-normalized visit count by that ratio.
a = a.withColumn(
    "visitor_scaling",
    col("normalized_visits_by_state_scaling") * visitors_per_visit,
)

Sums the values of `visitor_home_cbgs` into a single total per row, to be used later.

In [0]:
# Put the map's values into a temporary array column, fold that array into a
# single sum starting from 0 (stored as "total"), then drop the temporary column.
a = (
    a.withColumn("totalcbgs", map_values(col("visitor_home_cbgs")))
    .select(
        "*",
        aggregate(
            col("totalcbgs"),
            lit(0),
            lambda running_total, cbg_count: running_total + cbg_count,
        ).alias("total"),
    )
    .drop("totalcbgs")
)

Explodes `visitor_home_cbgs` into one row per CBG, keeping the row's total and scaled visitor count. Computes each CBG's share of the total from the home-CBG aggregation (e.g. 4 visitors from a CBG / 65 total visitors to a building). Multiplies that share by the scaled visitor count to estimate how many members from each CBG went to a building.

In [0]:
# One output row per entry of the visitor_home_cbgs map (key -> "cbgs",
# value -> "value"), carrying "total" and "visitor_scaling" along.
exploded = a.select(
    explode(a["visitor_home_cbgs"]).alias("cbgs", "value"),
    col("total"),
    col("visitor_scaling"),
)

# Fraction of the row's home-CBG total that this CBG contributes.
cbg_share = col("value") / col("total")

a = (
    exploded
    .withColumn("percent_total", cbg_share)
    .withColumn("members", col("percent_total") * col("visitor_scaling"))
)

Reduces the data to `cbgs` and `members`, using `ceil` to round each member estimate up to a whole number.

In [0]:
# Round each member estimate up to a whole number and expose it as "total".
rounded_members = ceil(col("members")).alias("total")
a = a.select(col("cbgs"), rounded_members)

Groups by `cbgs` and sums the member counts for each CBG.

In [0]:
# `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
members_per_cbg = sum(col("total")).alias("total_members")
a = a.groupBy(col("cbgs")).agg(members_per_cbg)

In [0]:
# Save the per-CBG totals as table "members" in overwrite mode, with the
# overwriteSchema option enabled.
members_writer = a.write.mode("overwrite").option("overwriteSchema", "true")
members_writer.saveAsTable("members")

Joins county and tract information on `cbgs`.

In [0]:
# Attach county, county name and tract code to each CBG's member total by
# matching members.cbgs to censusblock_table.blockcode.
# The leading spaces inside each fragment keep the query text unchanged.
tract_join_query = (
    "SELECT members.cbgs, members.total_members, censusblock_table.county, censusblock_table.cnamelong, censusblock_table.tractcode"
    "        FROM default.members"
    "        INNER JOIN safegraph.censusblock_table ON default.members.cbgs=safegraph.censusblock_table.blockcode"
)
tract = spark.sql(tract_join_query)

Aggregates members by tract.

In [0]:
# Sum total_members within each (county, tractcode, cnamelong) group.
tract_keys = ["county", "tractcode", "cnamelong"]
tracttotal = (
    tract.select("total_members", *tract_keys)
    .groupBy(*tract_keys)
    .agg(sum("total_members").alias("total_members"))
)

In [0]:
# Keep one row per tractcode, then save the result as table "tract" in
# overwrite mode with the overwriteSchema option enabled.
tracttotal = tracttotal.dropDuplicates(["tractcode"])
tract_writer = tracttotal.write.mode("overwrite").option("overwriteSchema", "true")
tract_writer.saveAsTable("tract")

In [0]:
# Add each tract's lat/long from safegraph.tract_table to the saved tract
# totals, matching on tractcode.
# The leading spaces inside each fragment keep the query text unchanged.
tract_locations_query = (
    "SELECT tract.county, tract.tractcode, tract.cnamelong, tract.total_members, tract_table.lat, tract_table.long"
    "        FROM default.tract"
    "        INNER JOIN safegraph.tract_table ON tract.tractcode=tract_table.tractcode"
)
locations = spark.sql(tract_locations_query)

In [0]:
# Show the overall sum, then the per-tract average, of total_members.
# `sum` and `avg` are the pyspark.sql.functions aggregates imported at the top.
for summarize in (sum, avg):
    display(tracttotal.agg(summarize("total_members")))

sum(total_members)
228122


avg(total_members)
483.3093220338983


In [0]:
# Peek at the first 15 rows of the joined tract/location result.
locations.head(n=15)

Unnamed: 0,county,tractcode,cnamelong,total_members,lat,long
0,41,16041970200,Franklin County,2149,42.060983,-111.714736
1,41,16041970100,Franklin County,1804,42.222418,-111.846778
2,73,16073950101,Owyhee County,293,43.592426,-116.960221
3,73,16073950102,Owyhee County,133,43.524785,-116.848134
4,69,16069960900,Nez Perce County,60,46.376303,-116.96654
5,69,16069960400,Nez Perce County,60,46.408018,-117.027292
6,69,16069960600,Nez Perce County,60,46.400231,-116.992189
7,69,16069960500,Nez Perce County,105,46.395655,-117.017098
8,69,16069961000,Nez Perce County,60,46.380843,-116.941372
9,1,16001000500,Ada County,60,43.640836,-116.223038


In [0]:
# Convert the Spark result to a pandas DataFrame for plotting.
# The isinstance guard makes this cell safe to re-run: after the first run
# `locations` is already a pandas DataFrame, which has no toPandas() method,
# so an unguarded second run would raise AttributeError.
if not isinstance(locations, pd.DataFrame):
    locations = locations.toPandas()

In [0]:
# Plot one marker per tract at its lat/long; total_members is used both as the
# marker's text and as its color value.
fig = go.Figure(
    data=go.Scattergeo(
        lon=locations["long"],
        lat=locations["lat"],
        text=locations["total_members"],
        mode="markers",
        marker_color=locations["total_members"],
        locationmode="USA-states",
    )
)

# Title fixed: "Esitmated" -> "Estimated", and the stray trailing ")" removed.
# update_layout returns the figure, so as the cell's last expression it is
# what the notebook renders.
fig.update_layout(
    title="Estimated Active Member Counts for Each Tract",
    geo_scope="usa",
)