In [0]:
%pip install geoscan

In [0]:
import pyspark.sql.functions as F
from geoscan import Geoscan

In [0]:
# Load the Gowalla check-ins table and keep only the four columns used below.
df = spark.table("databricks_tourism_workspace.default.gowalla_checkins").select(
    "user", "checkinTime", "latitude", "longitude"
)

In [0]:
# Configure the Geoscan estimator: coordinates are read from the
# "latitude"/"longitude" columns and the label is written to "cluster".
# NOTE(review): epsilon=400 is presumably a distance in meters and minPts=2 the
# minimum neighbourhood size -- confirm against the geoscan documentation.
geoscan = (
    Geoscan()
    .setLatitudeCol("latitude")
    .setLongitudeCol("longitude")
    .setPredictionCol("cluster")
    .setEpsilon(400)
    .setMinPts(2)
)

# Fit the estimator on the check-in frame.
model = geoscan.fit(df)

In [0]:
# Apply the fitted model to the check-ins; the estimator was configured to write
# its prediction to the "cluster" column.
# NOTE(review): `df` is rebound here, so re-running this cell feeds the already
# transformed frame back into the model. Re-run from the load cell instead.
df = model.transform(df)

In [0]:
# Drop outliers, i.e. rows whose "cluster" label is null.
df = df.where(F.col("cluster").isNotNull())

# Per-cluster centroid: the mean latitude and mean longitude of the points
# assigned to each cluster.
location = df.groupBy("cluster").agg(
    F.avg("latitude").alias("latitude"),
    F.avg("longitude").alias("longitude"),
)

In [0]:
# Replace each check-in's own coordinates with the centroid of its cluster.
#
# `df_temp` and `location` are both derived from the same `df`, so a condition
# written as `df_temp.cluster == location.cluster` depends on Spark's self-join
# disambiguation to tell the two sides apart. Joining by column name is an
# unambiguous equi-join and yields a single "cluster" column, which can then be
# selected by name.
df_temp = df.select("user", "checkinTime", "cluster")
df = df_temp.join(location, on="cluster", how="inner").select(
    "user", "checkinTime", "latitude", "longitude", "cluster"
)

In [0]:
# Render the clustered check-ins (with centroid coordinates) for a visual check
# before the frame is persisted.
display(df)

In [0]:
# Persist the result as a Delta table named "gowalla_checkins", replacing any
# existing data and schema (mode "overwrite" + overwriteSchema).
# NOTE(review): the table name is unqualified, so it resolves against the current
# catalog/schema. If that is databricks_tourism_workspace.default, this overwrites
# the very table the notebook reads from, and a second run would then cluster
# centroids instead of raw check-ins -- confirm the intended target.
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("gowalla_checkins")