In [None]:
# Authenticate and install
from google.colab import auth
auth.authenticate_user()

!pip install --quiet google-cloud-bigquery pandas

#  Imports
import pandas as pd
import numpy as np
from google.cloud import bigquery

#  Identify the project and source table, then open a BigQuery client
project_id = "rock-finder-project"
table_id = "rock-finder-project.routes.mp_routes"
client = bigquery.Client(project=project_id)

#  Run a full-table SELECT and materialise the result as a DataFrame
query = f"SELECT * FROM `{table_id}`"
query_job = client.query(query)
df = query_job.to_dataframe()

#  Split Location into its ' > '-separated levels (any number of levels).
#  Rows whose Location is missing stay missing after the split.
location_split = df['Location'].str.split(' > ')

#  Extract "area", "region" (island or subregion), and "state" from the last 3 parts.
#  .str[i] indexes into each list and yields NaN both when the list is too short
#  and when the row is missing, so a null Location no longer raises TypeError
#  the way an explicit len(x) check did.
df['state'] = location_split.str[-1]
df['region'] = location_split.str[-2]
df['area'] = location_split.str[-3]

#  First/topmost part separately
df['top_level'] = location_split.str[0]

# A "Pitches" value of -1 is treated as invalid and becomes NaN;
# gaps in "Length" are filled with the column median.
df['Pitches'] = df['Pitches'].replace(-1, np.nan)
median_length = df['Length'].median()
df['Length'] = df['Length'].fillna(median_length)

# Quick sanity check: preview the parsed location parts, then count remaining gaps
preview_cols = ['Location', 'area', 'region', 'state']
audit_cols = ['area', 'region', 'state', 'Pitches', 'Length']
print(df[preview_cols].head())
print("Missing counts:\n", df[audit_cols].isna().sum())

In [None]:
import string

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_text(col):
    """Return a cleaned copy of a free-text Series.

    Steps, in order: replace missing values with '', lowercase, delete ASCII
    punctuation, collapse each run of whitespace to a single space, and trim
    both ends.

    Whitespace is normalised *after* punctuation is removed. Doing it before
    (as this cell used to) leaves double spaces and stray leading/trailing
    spaces wherever a punctuation mark stood between spaces, e.g.
    "a - b" -> "a  b".
    """
    return (
        col.fillna('')
        .str.lower()
        .str.translate(_PUNCT_TABLE)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Apply the same cleaning to both free-text columns.
for text_col in ('desc', 'protection'):
    df[text_col] = _normalize_text(df[text_col])


In [None]:
# Rock Climbing Routes: KPI & Feature Enrichment Notebook

#  Setup and Imports 
import pandas as pd
import numpy as np
from google.colab import auth
from google.cloud import bigquery
import pandas_gbq

# Project id passed to the BigQuery read and write calls below
project_id = 'rock-finder-project'

# Authenticate the Colab session
auth.authenticate_user()

#  SQL Query: Enrich and Add KPIs (from BigQuery)
#
#  difficulty_level: the grade is split into the integer before the dot and the
#  integer after it instead of being compared as a FLOAT64. As a float, 5.10
#  equals 5.1, so the old "BETWEEN 5 AND 5.9" branch swallowed 5.10, 5.11 and
#  5.12+ and the 'Advanced' branch could never match.
#  NOTE(review): assumes rating_num holds a YDS-style grade such as 5.9 / 5.10 -
#  confirm against the routes_silver schema. If the column is stored as a float,
#  the trailing zero of 5.10 is already lost upstream and cannot be recovered here.
#  Values that do not start with digits are classified 'Unknown'.
#
#  length_category: 'Medium' is now "Length >= 50" (checked after 'Long'), so
#  fractional lengths between 99 and 100 no longer fall through to 'Unknown'.
#
#  The Python string is raw so the regex backslashes reach BigQuery untouched.
query = r"""
WITH graded AS (
  SELECT *,
    -- Grade parts as integers: numerically 5.10 equals 5.1 and would sort below 5.9.
    SAFE_CAST(REGEXP_EXTRACT(CAST(rating_num AS STRING), r'^\s*(\d+)') AS INT64) AS yds_major,
    SAFE_CAST(REGEXP_EXTRACT(CAST(rating_num AS STRING), r'^\s*\d+\.(\d+)') AS INT64) AS yds_minor
  FROM `rock-finder-project.routes.routes_silver`
)
SELECT * EXCEPT (yds_major, yds_minor),

  CASE
    WHEN yds_major IS NULL THEN 'Unknown'
    WHEN yds_major < 5 THEN 'Beginner'
    WHEN yds_major > 5 THEN 'Expert'
    WHEN IFNULL(yds_minor, 0) <= 9 THEN 'Intermediate'
    WHEN yds_minor <= 11 THEN 'Advanced'
    ELSE 'Expert'
  END AS difficulty_level,

  CASE
    WHEN Length >= 100 THEN 'Long'
    WHEN Length >= 50 THEN 'Medium'
    WHEN Length < 50 THEN 'Short'
    ELSE 'Unknown'
  END AS length_category,

  CASE
    WHEN num_votes >= 10 THEN 'Popular'
    ELSE 'Niche'
  END AS popularity_flag,

  ROUND(avg_stars, 1) AS avg_star_rounded,

  CASE
    WHEN rating_safety IN ('X', 'R') THEN 'High Risk'
    WHEN rating_safety = 'PG13' THEN 'Moderate Risk'
    ELSE 'Safe'
  END AS risk_level

FROM graded
"""

#  Load the enriched data into a dataframe.
#  pandas_gbq.read_gbq is called directly: the pd.read_gbq wrapper is deprecated
#  (pandas 2.2+) in favour of this function.
df = pandas_gbq.read_gbq(query, project_id=project_id)

#  Add KPI: Length Per Pitch.
#  Every non-positive pitch count (0, or a negative placeholder such as -1) is
#  masked to missing first, so the ratio is NaN instead of inf or a negative length.
valid_pitches = df['Pitches'].where(df['Pitches'] > 0)
df['length_per_pitch'] = df['Length'] / valid_pitches

#  Add KPI: Length Efficiency Category
def categorize_efficiency(val):
    """Map a length-per-pitch value to a descriptive band.

    Returns 'Unknown' for a missing value; otherwise the label of the first
    band whose exclusive upper bound is greater than ``val``, or 'Very Long'
    when ``val`` is 70 or more.
    """
    if pd.isna(val):
        return 'Unknown'

    # (exclusive upper bound, label), in ascending order
    bands = (
        (15, 'Very Short'),
        (30, 'Short'),
        (50, 'Moderate'),
        (70, 'Long'),
    )
    for upper_bound, label in bands:
        if val < upper_bound:
            return label
    return 'Very Long'

# Label each route with the band its length-per-pitch value falls into
per_pitch = df['length_per_pitch']
df['length_efficiency'] = per_pitch.apply(categorize_efficiency)

#  Export to BigQuery as GOLD table (an existing table is replaced)
gold_table = 'routes.routes_gold'
pandas_gbq.to_gbq(df, gold_table, project_id=project_id, if_exists='replace')

# Display Result
df.head()