In [None]:
!pip install polars
!pip install gql
!pip install requests_toolbelt

Collecting polars
  Downloading polars-0.19.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.6 MB)
[?25l[K     |                                | 10 kB 11.2 MB/s eta 0:00:03[K     |                                | 20 kB 3.1 MB/s eta 0:00:09[K     |                                | 30 kB 4.5 MB/s eta 0:00:07[K     |                                | 40 kB 3.7 MB/s eta 0:00:08[K     |                                | 51 kB 3.6 MB/s eta 0:00:08[K     |                                | 61 kB 4.2 MB/s eta 0:00:07[K     |                                | 71 kB 4.4 MB/s eta 0:00:07[K     |                                | 81 kB 4.5 MB/s eta 0:00:07[K     |                                | 92 kB 4.4 MB/s eta 0:00:07[K     |▏                               | 102 kB 4.5 MB/s eta 0:00:07[K     |▏                               | 112 kB 4.5 MB/s eta 0:00:07[K     |▏                               | 122 kB 4.5 MB/s eta 0:00:07[K     |▏                       

In [None]:
import polars as pl
import pandas as pd
import plotly.express as px
from pyspark.sql.functions import col, sum, year, month, split, when, size, expr, min, max, explode, get_json_object, expr, map_values, count, countDistinct, percentile_approx, mean, stddev
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType
from pyspark.sql.types import IntegerType
import os
# `spark` (the SparkSession) is not created here; it is already running when the notebook starts on the cluster.


In [None]:
# Directory that holds the Idaho patterns parquet part files.
PATTERNS_DIR = "dbfs:/data/idaho/patterns"

# Church places scraped from the church website by Copeland (addresses plus
# latitude/longitude), already narrowed down by his filters.
webscraped_places = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/churchPlaces.csv")
)

# Load every parquet part file in the directory exactly once, in one read.
# The earlier version seeded the DataFrame from one hard-coded part file name
# and dropped the first directory entry by position; that is only correct when
# the first entry happens to be that same file, and it breaks as soon as the
# part files are rewritten under new names.
dir_list = dbutils.fs.ls(PATTERNS_DIR)
pattern_paths = [item.path for item in dir_list if item.path.endswith(".parquet")]
if not pattern_paths:
    raise FileNotFoundError(f"No parquet files found under {PATTERNS_DIR}")

patterns = spark.read.parquet(*pattern_paths)



Reading the CSV file was problematic, so I tried to work around it by joining to the actual places data instead. However, every join I tried ran into an issue of some kind.

The CSV problem only affects the columns that come after the `open_hours` column, and none of those columns matter for this analysis, so I'm going to use the data from the CSV file anyway.

In [None]:
# display(webscraped_places)
# display(webscraped_places.count())



In [None]:
# Extra name filter added by Israel Olaveson: drop every place whose
# location_name contains one of these fragments, so it is not treated as a
# church in the rest of the analysis.
# NOTE(review): the fragments are case-sensitive and matched anywhere in the
# name, so short ones such as "Tim", "tic", "MD" or "Inc" also match inside
# longer words -- confirm that this is intended.
NON_CHURCH_NAME_FRAGMENTS = [
    # Was "[R|r]eorganized": inside [...] the "|" is a literal pipe character,
    # not an alternation, so the class is spelled "[Rr]" here.
    "[Rr]eorganized",
    "Fields",
    "Repair",
    "Goldsmiths",
    "Steakhouse",
    "Tim",
    "Goldsmith",
    "Goldsmithing",
    "Reynolds",
    "Windshields",
    "Slattery",
    "MD",
    "tic",
    "tics",
    "Inc",
    "Wine",
    "Ball",
    "Optometrist",
    "Surg",
    "Bicycles",
    "Cemetery",
    "Olds",
]
non_church_regex = "|".join(NON_CHURCH_NAME_FRAGMENTS)
churches = webscraped_places.filter(~col("location_name").rlike(non_church_regex))

# Added by Spencer Birch: some churches appear with a repeated placekey, and
# because the join below is on placekey, duplicates would multiply the
# pattern rows.
churches = churches.dropDuplicates(["placekey"])

# Rename the patterns-side key so both keys stay distinguishable after the
# join, then keep only pattern rows that belong to a church.
patterns_v05 = patterns.withColumnRenamed("placekey", "placekey_patterns")
df = patterns_v05.join(
    churches,
    churches.placekey == patterns_v05.placekey_patterns,
    "inner",
)

# Flatten the keyed columns (visits per weekday, visits per dwell-time bucket)
# into ordinary columns and derive the helper columns used further down.
weekday_names = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

# (suffix of the new column, key inside bucketed_dwell_times), listed in the
# order in which the columns are appended.
dwell_buckets = [
    ("5_10", "5-10"),
    ("21_60", "21-60"),
    ("61_120", "61-120"),
    ("0_4", "<5"),
    ("240_999", ">240"),
    ("11_20", "11-20"),
    ("121_240", "121-240"),
]

df_v1 = df
for weekday in weekday_names:
    df_v1 = df_v1.withColumn(f"{weekday}_visits", df.popularity_by_day[weekday])
df_v1 = df_v1.drop("popularity_by_day")

for suffix, bucket_key in dwell_buckets:
    df_v1 = df_v1.withColumn(f"timeInLocation_{suffix}", df.bucketed_dwell_times[bucket_key])
df_v1 = df_v1.drop("bucketed_dwell_times")

# Visits of more than 20 minutes are summed as "church visit" time, shorter
# ones as "non church visit" time.
long_stay_visits = (
    col("timeInLocation_21_60")
    + col("timeInLocation_61_120")
    + col("timeInLocation_121_240")
    + col("timeInLocation_240_999")
)
short_stay_visits = (
    col("timeInLocation_0_4")
    + col("timeInLocation_5_10")
    + col("timeInLocation_11_20")
)

df_v1 = (
    df_v1
    .filter(col("Sunday_visits") > 0)  # keep only rows with at least one Sunday visit
    .drop("related_same_day_brand", "related_same_month_brand")
    .withColumn("churchVisitTime", long_stay_visits)
    .withColumn("nonChurchVisitTime", short_stay_visits)
    .withColumn("start_year", year("date_range_start"))
    .withColumn("start_month", month("date_range_start"))
    .withColumn("end_year", year("date_range_end"))
    .withColumn("end_month", month("date_range_end"))
    .withColumn(
        "device_sampling_rate",
        col("normalized_visits_by_state_scaling") / col("raw_visit_counts"),
    )
)

# Estimate the number of active members per census tract.
#
# Steps, per pattern row:
#   1. Scale the Sunday visits by normalized_visits_by_state_scaling / raw_visit_counts.
#   2. Keep only the share of visits that stayed longer than 20 minutes
#      (churchVisitTime / raw_visit_counts).
#   3. Divide by WEEKS_PER_MONTH to turn visits into members.
#   4. Spread those members over the visitors' home tracts in proportion to
#      each tract's share of visitor_home_aggregation.
# The per-row values are then averaged per (church, tract) and summed per tract.
#
# NOTE: `sum` and `count` below are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins.

# Divisor used to go from monthly Sunday visits to members.
# presumably the average number of weeks per month (52.14 / 12) -- TODO confirm
WEEKS_PER_MONTH = 4.345

df_v2 = (
    df_v1
    .withColumn(
        "Sunday_visits_actual",
        col("normalized_visits_by_state_scaling") * col("Sunday_visits") / col("raw_visit_counts"),
    )
    .withColumn("churchVisitTime_ratio", col("churchVisitTime") / col("raw_visit_counts"))
    .withColumn("Sunday_visits_actual_v2", col("Sunday_visits_actual") * col("churchVisitTime_ratio"))
    .withColumn("Sunday_members", col("Sunday_visits_actual_v2") / WEEKS_PER_MONTH)
    # Total visitors over all home tracts of this row.
    .withColumn("values", map_values(col("visitor_home_aggregation")))
    .withColumn("tractTotals", expr("aggregate(values, 0, (acc, x) -> acc + x)"))
    # One output row per (pattern row, home tract).
    .select("*", explode(col("visitor_home_aggregation")).alias("tract_num", "tract_visitors"))
    .withColumn("tract_members", col("Sunday_members") * col("tract_visitors") / col("tractTotals"))
    # Average over the pattern rows of each church/tract pair.
    .groupBy("placekey", "tract_num")
    .agg(
        sum("tract_members").alias("tract_members_v2"),
        count("*").alias("church_patterns_count"),
    )
    .withColumn("tract_members_v3", col("tract_members_v2") / col("church_patterns_count"))
    # Add up all churches that draw visitors from the same tract.
    .groupBy("tract_num")
    .agg(
        sum("tract_members_v3").alias("final_tract_members"),
        count("tract_num").alias("tract_count"),
    )
    # Keep Idaho tracts only (tract ids starting with "16"). This runs before
    # the rename below so that the filter refers to a column name that still
    # exists at this point; previously it referenced `tract_num` after the
    # column had already been renamed to `tract`.
    .filter(col("tract_num").like("16%"))
    .withColumnRenamed("tract_num", "tract")
    .withColumnRenamed("final_tract_members", "active_members")
)


# Tract populations; the key is renamed so it does not collide with df_v2.tract
# in the join below.
population = (
    spark.read.parquet("dbfs:/FileStore/population.parquet")
    .withColumnRenamed("tract", "tract_poptable")
)

same_tract = population.tract_poptable == df_v2.tract
active_share_pct = col("active_members") / col("population") * 100

# Attach the population to every tract estimate and express the estimate as a
# percentage of the tract population.
df_v3 = (
    df_v2.join(population, same_tract, "inner")
    .drop("tract_poptable")
    .withColumn("%_active_members", active_share_pct)
)

display(df_v3)
display(df_v3.count())



tract,active_members,tract_count,population,%_active_members
16029960100,719.2574355372498,31,4386.0,16.398938338742585
16031950100,399.40146051469225,27,2879.0,13.872923255112616
16027021700,187.1250816843148,21,11701.0,1.5992229867901442
16055000500,66.69136601168864,5,14213.0,0.4692279322570086
16031950300,522.8948417134097,27,4864.0,10.750305133910562
16027021002,333.5463402307311,27,10662.0,3.1283655996129345
16067970400,282.8509460531796,20,4494.0,6.293968537008891
16027021500,185.51757497164965,17,4994.0,3.714809270557663
16069960700,12.095163846587436,1,5424.0,0.2229934337497683
16001002224,418.49625494054953,28,6374.0,6.565677046447278


289

In [None]:
# Summary statistics of the per-tract estimate.
# `max`, `min`, `mean` and `stddev` are the pyspark.sql.functions versions
# imported at the top of the notebook.
extremes = df_v3.agg(
    max("active_members").alias("max"),
    min("active_members").alias("min"),
)
display(extremes)

center_and_spread = df_v3.select(
    mean(col("active_members")).alias("mean"),
    stddev(col("active_members")).alias("std"),
)
display(center_and_spread)

# Approximate quartiles (25 %, 50 %, 75 %) with a relative error of 0.01.
quartile_probs = [0.25, 0.5, 0.75]
appox_items = df_v3.approxQuantile("active_members", quartile_probs, 0.01)
print(appox_items)


max,min
2897.1673156613924,3.6418824124856055


mean,std
316.0711721649893,370.1095419523343


[75.18001791321984, 185.51757497164965, 391.99529097570667]


In [None]:
# Tract ids of the two areas that are compared below.
rexburg_tracts = ["16065950100", "16065950200", "16065950400", "16065950301", "16065950500", "16065950302"]
courd_tracts = ["16055000402", "16055000401", "16055001200", "16055000900"]

in_rexburg = col("tract").isin(rexburg_tracts)
in_courd = col("tract").isin(courd_tracts)

graph_rexburg_data = df_v3.filter(in_rexburg)
graph_courd_d_alene_data = df_v3.filter(in_courd)

# Show the same four columns for each area, then the full table for reference.
summary_columns = ["tract", "population", "active_members", "%_active_members"]
for area_frame in (graph_rexburg_data, graph_courd_d_alene_data):
    display(area_frame.select(*summary_columns))
display(df_v3)

tract,population,active_members,%_active_members
16065950500,4872.0,1182.5166746788796,24.271688724935952
16065950100,6035.0,1345.7780713269278,22.29955379166409
16065950302,13852.0,2897.1673156613924,20.91515532530604
16065950200,4602.0,743.4460850459758,16.154847567274572
16065950301,3208.0,1320.800858188071,41.17209657693488
16065950400,6531.0,923.339316599268,14.137793853916214


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

tract,population,active_members,%_active_members
16055001200,6535.0,57.93069443794178,0.8864681627841129
16055000402,8894.0,87.99509676741131,0.9893759474635856
16055000401,5808.0,94.61259970163204,1.6290048157994497
16055000900,5978.0,70.81061223532066,1.1845201109956618


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

tract,active_members,tract_count,population,%_active_members
16029960100,719.2574355372498,31,4386.0,16.398938338742585
16031950100,399.40146051469225,27,2879.0,13.872923255112616
16027021700,187.1250816843148,21,11701.0,1.5992229867901442
16055000500,66.69136601168864,5,14213.0,0.4692279322570086
16031950300,522.8948417134097,27,4864.0,10.750305133910562
16027021002,333.5463402307311,27,10662.0,3.1283655996129345
16067970400,282.8509460531796,20,4494.0,6.293968537008891
16027021500,185.51757497164965,17,4994.0,3.714809270557663
16069960700,12.095163846587436,1,5424.0,0.2229934337497683
16001002224,418.49625494054953,28,6374.0,6.565677046447278


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [None]:
# Persist the per-tract estimate (tract id and estimated active members).
# The destination is written with the explicit dbfs:/ scheme so it is
# unambiguously the same location that is read back afterwards
# (dbfs:/FileStore/target.parquet) instead of a relative path whose
# resolution depends on the working directory.
TARGET_PATH = "dbfs:/FileStore/target.parquet"
df_v3.select("tract", "active_members").write.mode("overwrite").parquet(TARGET_PATH)

In [None]:
# Read the saved estimate back as a sanity check.
# The dataset directory is read rather than a single part file: part-file
# names carry a per-write id, so a hard-coded name stops existing after the
# next overwrite, and a single part file would miss rows if the write produced
# more than one file.
target = spark.read.parquet("dbfs:/FileStore/target.parquet")
display(target)
display(target.count())

tract,active_members
16029960100,719.2574355372498
16031950100,399.40146051469225
16027021700,187.1250816843148
16055000500,66.69136601168864
16031950300,522.8948417134097
16027021002,333.5463402307311
16067970400,282.8509460531796
16027021500,185.51757497164965
16069960700,12.095163846587436
16001002224,418.49625494054953


289

In [None]:
# !pip install geopandas
# !pip install geodatasets

Collecting geopandas
  Downloading geopandas-0.14.0-py3-none-any.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 15.8 MB/s eta 0:00:01[K     |▋                               | 20 kB 9.2 MB/s eta 0:00:01[K     |█                               | 30 kB 12.7 MB/s eta 0:00:01[K     |█▏                              | 40 kB 7.1 MB/s eta 0:00:01[K     |█▌                              | 51 kB 6.2 MB/s eta 0:00:01[K     |█▉                              | 61 kB 7.3 MB/s eta 0:00:01[K     |██                              | 71 kB 4.3 MB/s eta 0:00:01[K     |██▍                             | 81 kB 4.8 MB/s eta 0:00:01[K     |██▊                             | 92 kB 5.3 MB/s eta 0:00:01[K     |███                             | 102 kB 5.9 MB/s eta 0:00:01[K     |███▎                            | 112 kB 5.9 MB/s eta 0:00:01[K     |███▋                            | 122 kB 5.9 MB/s eta 0:00:01[K     |███▉                            | 133 kB 5.9 MB/s eta 0:00:

In [None]:
# import matplotlib
# import pandas as pd
# import geopandas
# import matplotlib.pyplot as plt
# from geodatasets import get_path

# tract_final_pandas = tract_final_calculations.toPandas()

# # idaho_bbox = (-116.916073, 41.987144, -111.043564, 49.001494)  # (minx, miny, maxx, maxy)


# gdf = geopandas.GeoDataFrame(
#     tract_final_pandas, geometry=geopandas.points_from_xy(tract_final_pandas.long, tract_final_pandas.lat), crs="EPSG:4326"
# )

# world = geopandas.read_file(get_path("dbfs:/FileStore/tl_rd22_16_tract.shp"))

# # We restrict to Idaho
# ax = world.clip([-117.5, 41.5, -111, 49.5]).plot(color="white", edgecolor="black")

# # We can now plot our ``GeoDataFrame``.
# gdf.plot(ax=ax, column='final_tract_members', markersize=8, legend=True, aspect='auto').set_title('Tract Active members in 2019 (Idaho)')

# plt.show()





[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3231986104970970>:7[0m
[1;32m      4[0m [38;5;28;01mimport[39;00m [38;5;21;01mmatplotlib[39;00m[38;5;21;01m.[39;00m[38;5;21;01mpyplot[39;00m [38;5;28;01mas[39;00m [38;5;21;01mplt[39;00m
[1;32m      5[0m [38;5;28;01mfrom[39;00m [38;5;21;01mgeodatasets[39;00m [38;5;28;01mimport[39;00m get_path
[0;32m----> 7[0m tract_final_pandas [38;5;241m=[39m tract_final_calculations[38;5;241m.[39mtoPandas()
[1;32m      9[0m [38;5;66;03m# idaho_bbox = (-116.916073, 41.987144, -111.043564, 49.001494)  # (minx, miny, maxx, maxy)[39;00m
[1;32m     12[0m gdf [38;5;241m=[39m geopandas[38;5;241m.[39mGeoDataFrame(
[1;32m     13[0m     tract_final_pandas, geometry[38;5;241m=[39mgeopandas[38;5;241m.[39mpoints_from_xy(tract_final_pandas[38;5;241m.[39mlong, tract_final_pandas

In [None]:
# # pkmap = spark.read.parquet("dbfs:/FileStore/tl_rd22_16_tract.shp")

# # pkmap = spark.read.format("shapefile").load("file:/FileStore/tl_rd22_16_tract.shp")
# pkmap = spark.read.format("shapefile").load("dbfs:/FileStore/tl_rd22_16_tract.shp")

# display(pkmap)

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2178468853737223>:4[0m
[1;32m      1[0m [38;5;66;03m# pkmap = spark.read.parquet("dbfs:/FileStore/tl_rd22_16_tract.shp")[39;00m
[1;32m      2[0m 
[1;32m      3[0m [38;5;66;03m# pkmap = spark.read.format("shapefile").load("file:/FileStore/tl_rd22_16_tract.shp")[39;00m
[0;32m----> 4[0m pkmap [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mshapefile[39m[38;5;124m"[39m)[38;5;241m.[39mload([38;5;124m"[39m[38;5;124mdbfs:/FileStore/tl_rd22_16_tract.shp[39m[38;5;124m"[39m)
[1;32m      6[0m display(pkmap)

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()


In [None]:
# # !pip install shapefile
# !pip install pyshp

Collecting pyshp
  Downloading pyshp-2.3.1-py2.py3-none-any.whl (46 kB)
[?25l[K     |███████                         | 10 kB 20.1 MB/s eta 0:00:01[K     |██████████████                  | 20 kB 17.5 MB/s eta 0:00:01[K     |█████████████████████▏          | 30 kB 11.4 MB/s eta 0:00:01[K     |████████████████████████████▏   | 40 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████████| 46 kB 3.2 MB/s 
[?25hInstalling collected packages: pyshp
Successfully installed pyshp-2.3.1
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-8b3329ed-b6ab-4480-97fe-ed9561f0bdd8/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# import polars as pl
# import plotly.express as px
# import urllib.request
# from urllib.request import urlopen
# import numpy as np
# import json
# import shapefile
# from json import dumps


# # function for converting shapefiles to geoJSON
# def shapefile2geojson(shape, geojson, idProperty=None):
#     # read the shapefile
#     reader = shapefile.Reader(shape)
#     fields = reader.fields[1:]
#     field_names = [field[0] for field in fields]
#     buffer = []
#     for sr in reader.shapeRecords():
#         atr = dict(zip(field_names, sr.record))
#         geom = sr.shape.__geo_interface__
#         if idProperty is not None:
#             buffer.append({'type':"Feature", 'geometry':geom, 'properties':atr, 'id':atr[idProperty]})
#         else:
#             buffer.append({'type':"Feature", 'geometry':geom, 'properties':atr})
#     # write the GeoJSON file
#     with open(geojson, "w") as f:
#         f.write(dumps({"type": "FeatureCollection", "features": buffer}, indent=2) + "\n")


# shapefile2geojson(
#     'dbfs:/FileStore/tl_rd22_16_tract.shp',
#     'idahoTractShapes.geojson',
#     'TRACTCE' # This makes the geojson indexable by tractcode -- important!
# )


# with open('idahoTractShapes.geojson', 'r') as f:
#     idahoShapes = json.load(f)


# graphme = (
#     pl.read_csv('your/exported/data/here')
#     .with_columns(pl.col('tractcode').cast(pl.Utf8).str.rjust(11, '0').str.slice(5,).alias('tract'))
#     .with_columns(pl.col('tractcode').cast(pl.Utf8).str.rjust(11, '0'))
#     # .with_columns(pl.col('tractcode').cast(pl.Utf8).str.rjust(11, '0'))
#     # .join(tract_table, on='tract', how='outer')#.filter(pl.col('tract').is_null())
#     # .with_columns(pl.concat_str(pl.col('tract').fill_null(''), pl.col('tract_right').fill_null('')).alias('tract'))
#     # .drop('tractcode_right')
#     # .with_columns(pl.col('estimatedMembers').fill_null(0))
#     .with_columns(np.log10(pl.col('estimatedMembers')))
# )


# fig = px.choropleth(graphme,
# geojson=idahoShapes, locations='tract',
# color='estimatedMembers',
# scope="usa",
# labels={'estimatedMembers': 'Estimated LDS Membership (log scale)'},
# title='Map of Estimated LDS Membership in Idaho using Available Data',
# )
# fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
# fig.update_geos(fitbounds='locations', visible=True)


# # Get county shapes (as opposed to tract shapes)
# with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
#     countyShapes = json.load(response)

# graphme2 = graphme.select(
#     'estimatedMembers',
#     pl.col('tractcode').str.slice(0, 5).alias('county'),
#     'tractcode'
# )


# # Download this from https://github.com/byuibigdata/temple_placement_FA23/blob/main/temple_details.parquet
# temples = pl.read_parquet('temple_details.parquet', use_pyarrow=True)


# fig = px.choropleth(graphme2,
# geojson=countyShapes, locations='county',
# color='estimatedMembers',
# scope="usa",
# labels={'estimatedMembers': 'Estimated LDS Membership (log scale)'},
# title='Map of Estimated LDS Membership per County using Idaho Data, with Temples',
# )
# fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})


# fig.add_scattergeo(
# lat=temples['lat'],
# lon=temples['long'],
# text=temples['temple'],
# marker=go.scattergeo.Marker(
# size=10,
# color='rgb(0, 0, 255)',
# opacity=0.7
# ),
# )


# lat, lon = 45.256934, -114.711226
# fig.update_geos(
#     center_lat=lat,
#     center_lon=lon,
#     visible=True,
#     projection_scale=3.2,
# )


[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-2178468853737222>:1[0m
[0;32m----> 1[0m [38;5;28;01mimport[39;00m [38;5;21;01mpolars[39;00m [38;5;28;01mas[39;00m [38;5;21;01mpl[39;00m
[1;32m      2[0m [38;5;28;01mimport[39;00m [38;5;21;01mplotly[39;00m[38;5;21;01m.[39;00m[38;5;21;01mexpress[39;00m [38;5;28;01mas[39;00m [38;5;21;01mpx[39;00m
[1;32m      3[0m [38;5;28;01mimport[39;00m [38;5;21;01murllib[39;00m[38;5;21;01m.[39;00m[38;5;21;01mrequest[39;00m

File [0;32m/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__init__.py:171[0m, in [0;36m_create_import_patch.<locals>.import_patch[0;34m(name, globals, locals, fromlist, level)[0m
[1;32m    166[0m thread_local[38;5;241m.[39m_nest_level [38;5;241m+[39m[38;5;241m=[39m [38;5;241m1[39m
[1;32m    168[0m [38;5;28;01mtry[3