# GameBus Health Behavior Mining - Location Categorization

This notebook demonstrates how to categorize GameBus location data by place type using the Google Places API (New).

## Setup

First, let's set up our environment and import the necessary modules.

In [1]:
import sys
import os
import pandas as pd
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
import numpy as np

# Add the project root directory to the Python path
sys.path.append('..')

# Import project modules
from config.paths import RAW_DATA_DIR
from config.credentials import GOOGLE_PLACES_API_KEY
from src.utils.logging import setup_logging
from src.utils.file_handlers import load_json, save_json

# Load environment variables (for Google Places API key)
# NOTE(review): GOOGLE_PLACES_API_KEY is imported from config.credentials above,
# i.e. before this call runs. If that module reads the key from the environment
# at import time, load_dotenv() would have to run before the import — confirm.
load_dotenv()

# Set up logging
# setup_logging is a project helper; the logger it returns is used right away
# to record that the setup cell completed.
logger = setup_logging(log_level="INFO")
logger.info("Notebook initialized")

2025-05-15 11:03:00,295 - gamebus_health_mining - INFO - Notebook initialized


## Load location data from a GameBus user
The file loaded here is the output of the Extraction service.

In [20]:
# Load location data
player_id = 107631  # Use the same player ID as in data extraction
# Build the path from player_id (it used to repeat the literal ID), so that
# changing the ID above actually changes which file is read.
location_file = f"{RAW_DATA_DIR}/player_{player_id}_location.json"

try:
    location_data = load_json(location_file)
    print(f"Loaded {len(location_data)} location data points")

    # Convert to DataFrame for easier analysis
    location_df = pd.DataFrame(location_data)
    display(location_df.head())

except Exception as e:
    # Best-effort: report the problem and keep the notebook running.
    # Note that location_df is NOT (re)assigned when this branch is taken, so
    # later cells will either fail or see a frame left over from an earlier run.
    print(f"Error loading location data: {e}")

Loaded 15130 location data points


Unnamed: 0,LATITUDE,LONGITUDE,ALTIDUDE,SPEED,ERROR,TIMESTAMP,ARM,activity_id,date,gameDescriptor
0,200,200,0,0,0,1747300689,Arm 2,3358902,1747300749000,GEOFENCE
1,200,200,0,0,0,1747300628,Arm 2,3358901,1747300689000,GEOFENCE
2,200,200,0,0,0,1747300568,Arm 2,3358900,1747300629000,GEOFENCE
3,200,200,0,0,0,1747300508,Arm 2,3358897,1747300568000,GEOFENCE
4,200,200,0,0,0,1747300448,Arm 2,3358896,1747300508000,GEOFENCE


In [17]:
import os, requests
#LAT, LNG = 51.44345855712891, 5.481135845184326

def get_place_type_from_coord(lat: float, lng: float,
                              radius_m: float = 20,
                              max_results: int = 5) -> str:
    """Return the place type of the Google Places result closest to a coordinate.

    Sends a Nearby Search request to the Places API (New), restricted to a
    circle around ``(lat, lng)`` and ranked by distance, then categorizes the
    coordinate by the first (closest) place returned.

    Args:
        lat: Latitude in decimal degrees (anything ``float()`` accepts).
        lng: Longitude in decimal degrees (anything ``float()`` accepts).
        radius_m: Search radius in meters around the coordinate.
        max_results: Maximum number of places requested from the API.

    Returns:
        The closest place's ``primaryType``; if the response carries no
        primary type, the first entry of its ``types`` list; ``"other"`` when
        no place is found or the closest place has no type information.
        The result is always a single string, as the annotation promises.

    Raises:
        requests.HTTPError: If the API answers with a non-2xx status.
        requests.RequestException: On network failures or timeouts.
        ValueError: If ``lat`` or ``lng`` cannot be converted to float.
    """
    url = "https://places.googleapis.com/v1/places:searchNearby"
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_PLACES_API_KEY,
        # Ask only for what we need – keeps the response tiny and cheap.
        "X-Goog-FieldMask": "places.displayName,places.types,places.primaryType",
    }
    payload = {
        "maxResultCount": max_results,
        "locationRestriction": {
            "circle": {
                # Coerce to float so string coordinates (e.g. straight from a
                # JSON export) are sent as JSON numbers, not strings.
                "center": {"latitude": float(lat), "longitude": float(lng)},
                "radius": radius_m,
            }
        },
        "rankPreference": "DISTANCE",  # closest first
    }
    resp = requests.post(url, headers=headers, json=payload, timeout=5)
    resp.raise_for_status()

    places = resp.json().get("places", [])
    if not places:
        return "other"

    closest = places[0]
    # primaryType is not guaranteed to be present in the response, so read it
    # defensively and fall back to the generic type list.
    primary_type = closest.get("primaryType")
    if primary_type:
        return primary_type
    types = closest.get("types") or []
    return types[0] if types else "other"

#get_place_type_from_coord(LAT,LNG)

[]


'other'

In [3]:
# Load location data
player_id = 107631  # Use the same player ID as in data extraction
# Derive the file name from player_id instead of repeating the literal ID, so
# the variable above is the single place to change the player.
location_file = f"{RAW_DATA_DIR}/player_{player_id}_location.json"

try:
    location_data = load_json(location_file)
    print(f"Loaded {len(location_data)} location data points")

    # Convert to DataFrame for easier analysis
    location_df = pd.DataFrame(location_data)
    display(location_df.head())

except Exception as e:
    # Best-effort: the error is reported but not re-raised, which means
    # location_df keeps whatever value (if any) an earlier cell run gave it.
    print(f"Error loading location data: {e}")

Loaded 13408 location data points


Unnamed: 0,LATITUDE,LONGITUDE,ALTIDUDE,SPEED,ERROR,TIMESTAMP,ARM,activity_id,date,gameDescriptor
0,200,200,31,0,0,1747154573,Arm 2,3339642,1747154634000,GEOFENCE
1,200,200,31,0,0,1747154513,Arm 2,3339641,1747154574000,GEOFENCE
2,200,200,31,0,0,1747154453,Arm 2,3339640,1747154514000,GEOFENCE
3,200,200,31,0,0,1747154393,Arm 2,3339639,1747154453000,GEOFENCE
4,200,200,31,0,0,1747154333,Arm 2,3339638,1747154393000,GEOFENCE


In [10]:
import math
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees) in meters.

    Args:
        lat1, lon1: Latitude/longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude/longitude of the second point, in decimal degrees.
            All four values are passed through float(), so numeric strings
            are accepted as well.

    Returns:
        Distance in meters on a sphere of radius 6,371,000 m.

    Raises:
        ValueError / TypeError: If a coordinate cannot be converted to float.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = (
        math.radians(float(value)) for value in (lat1, lon1, lat2, lon2)
    )

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push sqrt(a) marginally above 1 for
    # (near-)antipodal points, which would make asin raise a domain error;
    # clamp to the valid domain instead.
    c = 2 * math.asin(min(1.0, math.sqrt(a)))
    r = 6371000  # Radius of Earth in meters
    return c * r

In [21]:
# Number of leading rows kept for this demonstration run.
SAMPLE_ROWS = 5

# Initialize the new column
location_df['location_type'] = pd.NA # Using pandas NA for broader compatibility, or np.nan

# Take an explicit copy of the sample: the categorization loop writes into
# location_df with .loc, and writing into a bare slice of another frame
# triggers SettingWithCopyWarning and ambiguous write-through behaviour.
location_df = location_df.iloc[0:SAMPLE_ROWS].copy()
print(location_df)

  LATITUDE LONGITUDE ALTIDUDE SPEED ERROR   TIMESTAMP    ARM  activity_id  \
0      200       200        0     0     0  1747300689  Arm 2      3358902   
1      200       200        0     0     0  1747300628  Arm 2      3358901   
2      200       200        0     0     0  1747300568  Arm 2      3358900   
3      200       200        0     0     0  1747300508  Arm 2      3358897   
4      200       200        0     0     0  1747300448  Arm 2      3358896   

            date gameDescriptor location_type  
0  1747300749000       GEOFENCE          <NA>  
1  1747300689000       GEOFENCE          <NA>  
2  1747300629000       GEOFENCE          <NA>  
3  1747300568000       GEOFENCE          <NA>  
4  1747300508000       GEOFENCE          <NA>  


## Categorize location data points iteratively 
### We keep a list of unique, already categorized locations to minimize API calls. A point closer than 10 meters to one of these locations reuses its type instead of triggering a new request.

In [22]:
# Empty lookup table of locations that have already been categorized; the
# categorization loop fills it with one row per unique location.
processed_locations_df = pd.DataFrame(
    columns=[
        'latitude',
        'longitude',
        'locationtype',
    ],
)

In [23]:
# A point closer than this many meters to an already categorized location
# reuses that location's type instead of triggering a new API call.
CACHE_RADIUS_M = 10
CACHE_COLUMNS = ['latitude', 'longitude', 'locationtype']

# Resolve the API key once, outside the loop. globals().get() keeps the
# original tolerance for the key name not being defined in this kernel.
api_key = globals().get('GOOGLE_PLACES_API_KEY')
api_key_configured = bool(api_key) and api_key != "YOUR_ACTUAL_GOOGLE_PLACES_API_KEY"

# Seed the cache with whatever processed_locations_df already holds (so a
# re-run of this cell reuses earlier results) and collect new entries in a
# plain list; the DataFrame is rebuilt once after the loop instead of being
# grown with pd.concat on every iteration.
known_locations = processed_locations_df.to_dict('records')

for index, row in location_df.iterrows():
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    print(f"Processing location: ({lat}, {lon})")

    # (200, 200) is treated as a "no position" sentinel and skipped, as are
    # missing coordinates.
    if pd.isna(lat) or pd.isna(lon) or (float(lat) == 200 and float(lon) == 200):
        location_df.loc[index, 'location_type'] = np.nan
        print(f"Skipping API call for ({lat}, {lon}) as NaN or (200, 200)")
        continue

    # Check against already processed unique locations (first match wins).
    cached_entry = next(
        (
            known for known in known_locations
            if pd.notna(known['latitude']) and pd.notna(known['longitude'])
            and haversine(lat, lon, known['latitude'], known['longitude']) < CACHE_RADIUS_M
        ),
        None,
    )
    if cached_entry is not None:
        cached_type = cached_entry['locationtype']
        location_df.loc[index, 'location_type'] = cached_type
        print(f"Found nearby in processed: True with type {cached_type}")
        continue

    # Not cached: ask the API, falling back to "unknown" when the key is
    # missing or the request fails.
    place_primary_type = "unknown"
    cache_result = True
    if not api_key_configured:
        print(f"API Key not configured. Using '{place_primary_type}' for ({lat}, {lon}).")
    else:
        print(f"Calling API for: ({lat}, {lon})")
        try:
            # Pass real floats so the coordinates are sent as JSON numbers.
            api_type_result = get_place_type_from_coord(float(lat), float(lon))
        except requests.exceptions.RequestException as exc:
            # One failed request must not abort the whole run. The failure is
            # not cached, so a later point nearby can retry the lookup.
            print(f"API call failed for ({lat}, {lon}): {exc}. Using '{place_primary_type}'.")
            cache_result = False
        else:
            # The helper may hand back a (primaryType, types) tuple; only the
            # primary type belongs in a single DataFrame cell.
            if isinstance(api_type_result, tuple):
                api_type_result = api_type_result[0]
            place_primary_type = api_type_result or "api_returned_empty"

    location_df.loc[index, 'location_type'] = place_primary_type
    print(f"API returned: {place_primary_type}")

    # Remember this new unique location for subsequent rows.
    if cache_result:
        known_locations.append(
            {'latitude': lat, 'longitude': lon, 'locationtype': place_primary_type}
        )

# Materialize the cache as a DataFrame once, after the loop.
processed_locations_df = pd.DataFrame(known_locations, columns=CACHE_COLUMNS)

# "\n" (not "\\n"): the doubled backslash printed a literal backslash-n.
print("\nFinished processing location data.")
print("Updated location_df head:")
display(location_df.head())
print("\nUnique processed locations and their types (processed_locations_df):")
display(processed_locations_df)

Processing location: (200, 200)
Skipping API call for (200, 200) as NaN or (200, 200)
Processing location: (200, 200)
Skipping API call for (200, 200) as NaN or (200, 200)
Processing location: (200, 200)
Skipping API call for (200, 200) as NaN or (200, 200)
Processing location: (200, 200)
Skipping API call for (200, 200) as NaN or (200, 200)
Processing location: (200, 200)
Skipping API call for (200, 200) as NaN or (200, 200)
\nFinished processing location data.
Updated location_df head:


Unnamed: 0,LATITUDE,LONGITUDE,ALTIDUDE,SPEED,ERROR,TIMESTAMP,ARM,activity_id,date,gameDescriptor,location_type
0,200,200,0,0,0,1747300689,Arm 2,3358902,1747300749000,GEOFENCE,
1,200,200,0,0,0,1747300628,Arm 2,3358901,1747300689000,GEOFENCE,
2,200,200,0,0,0,1747300568,Arm 2,3358900,1747300629000,GEOFENCE,
3,200,200,0,0,0,1747300508,Arm 2,3358897,1747300568000,GEOFENCE,
4,200,200,0,0,0,1747300448,Arm 2,3358896,1747300508000,GEOFENCE,


\nUnique processed locations and their types (processed_locations_df):


Unnamed: 0,latitude,longitude,locationtype
