In [None]:
!pip install --upgrade google-cloud-bigquery
from google.colab import auth
auth.authenticate_user()
from google.cloud import bigquery

In [None]:
# Google Cloud project ID that is passed to the BigQuery client.
project_id = "rock-finder-project"
# BigQuery client used for the queries in this notebook.
client = bigquery.Client(project=project_id)

In [None]:
# Read every column of the gold routes table into a pandas DataFrame.
query = """
SELECT * FROM `rock-finder-project.routes.routes_gold`
"""
query_job = client.query(query)
df = query_job.to_dataframe()

In [None]:
#Pandas
import pandas as pd

In [None]:
# Cluster method 1: grid cells from rounded coordinates

In [None]:
# Round latitude and longitude so that nearby areas fall into the same grid cell.
# 1 decimal is around 11 km, 2 decimals is around 1 km (measured along latitude;
# longitude cells get narrower away from the equator).
df['lat_rounded'] = df['area_latitude'].round(decimals=1)
df['lon_rounded'] = df['area_longitude'].round(decimals=1)

In [None]:
# Group rows by their rounded (lat, lon) cell and display the row count per cell.
grouped = df.groupby(by=['lat_rounded', 'lon_rounded'])
grouped.size()

In [None]:
# Cluster method 2: DBSCAN on great-circle (haversine) distance

In [None]:
# Cluster using DBSCAN
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Cluster on the area coordinates of `df` (columns 'area_latitude' and 'area_longitude').
coords = df[['area_latitude', 'area_longitude']].to_numpy()

# The haversine metric expects [latitude, longitude] pairs in radians.
coords_rad = np.radians(coords)

# DBSCAN with haversine (great-circle) distance. Haversine distances are angles
# in radians, so a radius in km is divided by the Earth's mean radius in km.
kms_per_radian = 6371.0088
epsilon = 1 / kms_per_radian  # 1 km radius

# min_samples=1 makes every point a core point: an area with no neighbour within
# `epsilon` becomes its own single-member cluster instead of receiving the noise
# label -1. Multi-member clusters are the same groups as with min_samples=2.
# This matters as soon as rows are aggregated per cluster label, because a shared
# -1 label would pool unrelated, far-apart areas into one "cluster".
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(coords_rad)

# Add cluster labels back to the DataFrame (labels are in the same row order as `coords`).
df['cluster'] = db.labels_

In [None]:
# Build one summary row per cluster from a single groupby object.
by_cluster = df.groupby("cluster")

# Mean of the raw and rounded coordinates; every averaged column gets an "_avg" suffix.
cluster_means = (
    by_cluster[["area_latitude", "area_longitude", "lat_rounded", "lon_rounded"]]
    .mean()
    .add_suffix("_avg")
    .reset_index()
)

# Number of rows assigned to each cluster.
cluster_sizes = by_cluster.size().reset_index(name="cluster_size")

# Combine averages and sizes on the cluster label.
grouped_df = cluster_means.merge(cluster_sizes, on="cluster")

In [None]:
# Display the column labels of `df`.
df.columns

In [None]:
# Display the column labels of the per-cluster summary `grouped_df`.
grouped_df.columns

In [None]:
# Merge the per-cluster averages and counts back onto the initial table, so every
# row keeps its key value next to the averages and the size of its cluster.
final_df = df.merge(grouped_df, on="cluster")

In [None]:
# Display the column labels of the merged frame `final_df`.
final_df.columns

In [None]:
# Keep only the columns of interest: the row key, its cluster label, the
# cluster's average coordinates and the cluster size.
final_df = final_df.loc[
    :,
    ["key", "cluster", "area_latitude_avg", "area_longitude_avg", "cluster_size"],
]
final_df

In [None]:
# Destination of the upload to BigQuery, assembled as "<dataset>.<table>" where
# the dataset ID is already qualified with the project.
project_id = "rock-finder-project"
dataset_id = "rock-finder-project.routes"
table_id = "clusters"
full_table_id = ".".join([dataset_id, table_id])

In [None]:
# Upload the new DataFrame to BigQuery; if_exists="replace" overwrites a table
# that already exists at the destination.
from pandas_gbq import to_gbq
to_gbq(
    final_df,
    destination_table=full_table_id,
    project_id=project_id,
    if_exists="replace",
)