In [5]:
import os

import h3
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

In [14]:
def _first_text(parent, paths):
    """Return the stripped text of the first element matching one of `paths`, else None."""
    for path in paths:
        elem = parent.find(path)
        # An element such as <ele/> exists but has no text; treat it as missing
        if elem is not None and elem.text and elem.text.strip():
            return elem.text.strip()
    return None


def parse_gpx_to_dataframe(gpx_file_path):
    """
    Parse a GPX file and convert it to a pandas DataFrame.

    One row is produced per <trkpt> element, in document order.

    Parameters
    ----------
    gpx_file_path : str or path-like or file object
        Anything accepted by xml.etree.ElementTree.parse.

    Returns
    -------
    pandas.DataFrame
        Columns: lat, lon, elevation, time, heart_rate.
        The columns are always present, even when the file has no track points.
        Missing or empty <ele>, <time> and heart-rate elements become null values;
        `time` is converted with pandas.to_datetime.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    TypeError, ValueError
        If a track point lacks a numeric lat/lon attribute, or an elevation /
        heart-rate value is not numeric.
    """
    columns = ['lat', 'lon', 'elevation', 'time', 'heart_rate']

    root = ET.parse(gpx_file_path).getroot()

    # Take the GPX namespace from the root tag ("{uri}gpx") so that GPX 1.0 and
    # 1.1 files are both understood, instead of hard-coding the 1.1 URI.
    gpx_prefix = root.tag.split('}', 1)[0] + '}' if root.tag.startswith('{') else ''

    # Heart rate lives in Garmin's TrackPointExtension; both schema versions use <hr>.
    hr_paths = [
        './/{http://www.garmin.com/xmlschemas/TrackPointExtension/v1}hr',
        './/{http://www.garmin.com/xmlschemas/TrackPointExtension/v2}hr',
    ]

    track_points = []

    for trkpt in root.iter(gpx_prefix + 'trkpt'):
        ele_text = _first_text(trkpt, [gpx_prefix + 'ele'])
        hr_text = _first_text(trkpt, hr_paths)

        track_points.append({
            'lat': float(trkpt.get('lat')),
            'lon': float(trkpt.get('lon')),
            'elevation': float(ele_text) if ele_text is not None else None,
            'time': _first_text(trkpt, [gpx_prefix + 'time']),
            'heart_rate': int(hr_text) if hr_text is not None else None
        })

    # Passing `columns` guarantees the documented schema even for an empty track list
    df = pd.DataFrame(track_points, columns=columns)

    # Convert time to datetime
    df['time'] = pd.to_datetime(df['time'])

    return df

def add_h3_to_gpx_dataframe(df, resolution=9):
    """
    Add H3 index and boundaries to the GPX DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `lat` and `lon` columns (unless it is empty).
    resolution : int, default 9
        H3 resolution passed to h3.latlng_to_cell.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (same index, same row order) with two extra columns:
        `h3_index` (result of h3.latlng_to_cell for the row) and
        `h3_boundaries` (result of h3.cell_to_boundary for that cell).
        Both columns are present even when `df` is empty. `df` is not modified.
    """
    if df.empty:
        cells = []
    else:
        cells = [
            h3.latlng_to_cell(lat, lon, resolution)
            for lat, lon in zip(df['lat'], df['lon'])
        ]
    boundaries = [h3.cell_to_boundary(cell) for cell in cells]

    # Build the new columns on df's own index: concatenating a freshly
    # range-indexed frame along axis=1 would align on labels and produce
    # misaligned / NaN-padded rows whenever df's index is not 0..n-1.
    return df.assign(
        h3_index=pd.Series(cells, index=df.index, dtype=object),
        h3_boundaries=pd.Series(boundaries, index=df.index, dtype=object),
    )

In [None]:
# Load a single sample run to check what the parser returns
sample_gpx_path = '../data/player_runs/pl_001/Afternoon_Run.gpx'
gpx_df = parse_gpx_to_dataframe(sample_gpx_path)

# Report the size and the column names of the parsed frame
print(f"GPX data shape: {gpx_df.shape}")
print(f"Columns: {gpx_df.columns.tolist()}")

GPX data shape: (10805, 5)
Columns: ['lat', 'lon', 'elevation', 'time', 'heart_rate']


(10805, 5)

# Compile data

In [11]:
# Root folder holding one sub-folder of GPX runs per player
path_data = '../data/player_runs/'

In [55]:
player_data_file_name = 'player_table.csv'

# The player table sits one directory above the runs folder
player_table_path = os.path.join(path_data, '..', player_data_file_name)
player_columns = ['player_id', 'player_name', 'player_color']

# Keep only the columns needed to label the territory table
df_player_data = pd.read_csv(player_table_path)[player_columns]

In [56]:
# Player folders are the "pl*" sub-directories of path_data.
# Sorted so the result does not depend on the OS directory ordering.
l_player_folders = sorted(
    f for f in os.listdir(path_data)
    if f.startswith('pl') and os.path.isdir(os.path.join(path_data, f))
)

# Collect one frame per player and concatenate once at the end
# (growing a DataFrame with concat inside a loop is quadratic).
l_player_territories = []

for player_folder in l_player_folders:

    player_dir = os.path.join(path_data, player_folder)

    l_gpx_frames = []
    for gpx_file in sorted(os.listdir(player_dir)):
        # Skip anything that is not a GPX run (e.g. .DS_Store, notes, ...)
        if not gpx_file.lower().endswith('.gpx'):
            continue

        df_tmp_gpx = add_h3_to_gpx_dataframe(
            parse_gpx_to_dataframe(
                os.path.join(player_dir, gpx_file)
            )
        )

        # A run without track points contributes no cells
        if not df_tmp_gpx.empty:
            l_gpx_frames.append(df_tmp_gpx)

    # A player without any usable run owns no territory
    if not l_gpx_frames:
        continue

    # Latest visit time of each H3 cell for this player
    df_tmp_player = (
        pd.concat(l_gpx_frames, ignore_index=True)
        .groupby('h3_index', as_index=False)
        .agg({'time': 'max'})
    )

    df_tmp_player['player_id'] = player_folder

    l_player_territories.append(df_tmp_player)

if not l_player_territories:
    raise ValueError(f"No GPX track points found under {path_data!r}")

df_territory = pd.concat(l_player_territories, ignore_index=True)

# Oldest visits first (stable sort, missing times at the very beginning) so that
# the last row of each cell group is the most recent visit.
df_territory = df_territory.sort_values(by='time', kind='stable', na_position='first')

# One row per cell: the most recent visit time and the player who made that visit.
# 'last' (not 'first') keeps player_id consistent with the 'max' time.
df_territory = df_territory.groupby('h3_index', as_index=False).agg(
    {'time': 'max',
     'player_id': 'last'}
)

# Attach player name and colour; left join keeps cells of players missing from the table
df_territory = pd.merge(df_territory, df_player_data, how='left', on='player_id')

In [57]:
# Display the compiled territory table
df_territory

Unnamed: 0,h3_index,time,player_id,player_name,player_color
0,89396002247ffff,2025-09-07 09:39:25+00:00,pl_002,Maxime BBBB,#90D5FF
1,8939600224bffff,2025-09-07 10:02:32+00:00,pl_002,Maxime BBBB,#90D5FF
2,8939600224fffff,2025-09-07 10:00:58+00:00,pl_002,Maxime BBBB,#90D5FF
3,89396002257ffff,2025-09-07 09:37:48+00:00,pl_002,Maxime BBBB,#90D5FF
4,89396002273ffff,2025-09-07 09:41:35+00:00,pl_002,Maxime BBBB,#90D5FF
...,...,...,...,...,...
135,8939601aea3ffff,2025-09-06 13:55:22+00:00,pl_001,Maxime Le Chanceux,#EFBF04
136,8939601aea7ffff,2025-09-06 12:34:43+00:00,pl_001,Maxime Le Chanceux,#EFBF04
137,8939601aeabffff,2025-09-06 13:54:22+00:00,pl_001,Maxime Le Chanceux,#EFBF04
138,8939601aeafffff,2025-09-06 13:58:19+00:00,pl_001,Maxime Le Chanceux,#EFBF04


In [58]:
# Persist the territory table; with pandas' default settings the row index
# is written as the first (unnamed) CSV column.
df_territory.to_csv('../data/gpx_treated_data.csv')

In [59]:
# Spot-check a single H3 cell in the territory table
df_territory[df_territory['h3_index'].eq('8939601ae23ffff')]

Unnamed: 0,h3_index,time,player_id,player_name,player_color
129,8939601ae23ffff,2025-09-06 12:55:29+00:00,pl_001,Maxime Le Chanceux,#EFBF04


# Temporary: LinkedIn post demo

In [None]:
# Load the raw/clean label pairs used for the LinkedIn post demo
df_in = pd.read_csv('../tmp_linkedin_post.csv')

In [62]:
# Preview the first rows of the demo data
df_in.head()

Unnamed: 0,raw,clean
0,Salomon 2425 trail short men,Men - Trail - Shorts
1,Alim-oct24 boost tah chocolate,Food - Bars
2,nike air zoom pegasus 40 w blk/wht sz8,Women - Running - Shoes
3,proteine whey vanille 2kg myprotein promo,Food - Supplements
4,garmin fenix 7 sapphire solar titane gps,Electronics - Watches - GPS


In [70]:
import time

In [78]:
# Replay every raw label followed by its cleaned category, pausing between
# prints so the output appears progressively.
for raw_label, clean_label in zip(df_in['raw'], df_in['clean']):
    print(raw_label)
    time.sleep(0.7)
    print(clean_label)
    time.sleep(0.2)
    print('')

Salomon 2425 trail short men 
Men - Trail - Shorts

Alim-oct24 boost tah chocolate
Food - Bars

nike air zoom pegasus 40 w blk/wht sz8
Women - Running - Shoes 

proteine whey vanille 2kg myprotein promo
Food - Supplements

garmin fenix 7 sapphire solar titane gps
Electronics - Watches - GPS

patagonia nano puff jkt mens blue M
Men - Outdoor - Jackets

clif bar crunchy pb 12pack exp-dec24
Food - Bars

