In [None]:
import numpy as np
import pandas as pd
import scipy.stats
from datetime import datetime, timedelta
from math import radians, sin, cos, asin, sqrt
import os
from collections import defaultdict

# Geographic distance calculation function (unit: kilometers)
def geodistance(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometers between two points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.
            Every value is passed through ``float`` first, so numeric strings
            are accepted.

    Returns:
        The distance in kilometers as a float.
    """
    lon1, lat1, lon2, lat2 = map(radians, [float(lon1), float(lat1), float(lon2), float(lat2)])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a
    # "math domain error". Clamp before taking the root.
    a = min(1.0, max(0.0, a))
    return 2 * asin(sqrt(a)) * 6371

# JSD calculation function
def js_divergence(p, q):
    """Return the Jensen-Shannon divergence between two histograms.

    Both inputs are normalized to sum to one before comparison, so raw counts
    may be passed. The result uses the default base of
    ``scipy.stats.entropy`` (natural logarithm).

    Args:
        p: Array-like of non-negative weights/counts.
        q: Array-like of non-negative weights/counts, same length as ``p``.

    Returns:
        The divergence as a float, or ``nan`` when either input has no mass
        (all zeros or empty), because no distribution can be formed from it.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    p_total = p.sum()
    q_total = q.sum()
    # A histogram without any mass cannot be normalized. Report "undefined"
    # explicitly instead of letting 0/0 propagate through scipy with warnings.
    if p_total <= 0 or q_total <= 0:
        return np.nan

    p = p / p_total
    q = q / q_total

    m = (p + q) / 2
    return 0.5 * scipy.stats.entropy(p, m) + 0.5 * scipy.stats.entropy(q, m)

# Trajectory analysis class
class TrajectoryAnalyzer:
    def __init__(self, grid_size=0.01, lat_range=(40.5, 40.9), lon_range=(-74.25, -73.7)):
        """Set up the spatial grid, the time binning and the category tables.

        Args:
            grid_size: Edge length of one grid cell, in the same units as the
                latitude/longitude values (degrees).
            lat_range: ``(min, max)`` latitude covered by the grid.
            lon_range: ``(min, max)`` longitude covered by the grid.
        """
        self.grid_size = grid_size
        self.lat_min, self.lat_max = lat_range
        self.lon_min, self.lon_max = lon_range
        # Cells per axis: truncated span / grid_size, plus one so that the
        # upper boundary itself falls into a cell.
        self.n_lat = int((self.lat_max - self.lat_min) / grid_size) + 1
        self.n_lon = int((self.lon_max - self.lon_min) / grid_size) + 1
        self.grid_total = self.n_lat * self.n_lon
        self.time_bins = 144  # 24 hours * 6 (10-minute intervals)

        # Keyword table used by normalize_category(): each keyword that occurs
        # as a whole space-delimited word in a venue category scores one point
        # for its main category.
        # NOTE: insertion order matters — on a tie the category listed first
        # wins. Some keywords appear under several categories ('bike',
        # 'office', 'home', 'garden'), and 'transport' is listed twice under
        # 'Travel & Transport', so it scores two points there.
        self.category_keywords = {
            'Food': [
                'restaurant', 'food', 'cafe', 'coffee', 'deli', 'pizza', 'sandwich', 
                'burger', 'bakery', 'diner', 'donut', 'sushi', 'bagel', 'cream',
                'bbq', 'seafood', 'thai', 'breakfast', 'dessert', 'japanese',
                'caribbean', 'korean', 'chicken', 'indian', 'latin', 'steak',
                'cuban', 'vegan', 'gastropub', 'ramen', 'noodle', 'burrito',
                'hot dog', 'cupcake', 'spanish', 'tea', 'taco', 'salad',
                'middle eastern', 'tapas', 'german', 'mediterranean', 'vietnamese',
                'soup', 'soul', 'wings', 'greek', 'falafel', 'snack', 'brewery',
                'brazilian', 'european', 'australian', 'molecular', 'dim sum',
                'cajun', 'creole', 'dumpling', 'malaysian', 'african', 'filipino',
                'mac & cheese', 'peruvian', 'argentinian', 'scandinavian', 'turkish',
                'ethiopian', 'moroccan', 'swiss', 'fish', 'portuguese', 'distillery',
                'gluten'
            ],
            
            'Travel & Transport': [
                'subway', 'station', 'transport', 'bus', 'road', 'airport', 'hotel',
                'ferry', 'rail', 'parking', 'taxi', 'rental', 'lounge', 'bike share',
                'train', 'commute', 'travel', 'walk', 'drive', 'truck', 'transport',
                'bike', 
            ],
            
            'Shop & Service': [
                'store', 'shop', 'clothing', 'drug', 'pharmacy', 'department',
                'salon', 'barbershop', 'electronic', 'gas', 'garage', 'mall',
                'laundry', 'book', 'convenience', 'automotive', 'furniture', 'home',
                'cosmetic', 'hardware', 'sporting', 'pet', 'office', 'craft',
                'flea', 'toy', 'game', 'smoke', 'candy', 'thrift', 'vintage',
                'bike', 'gift', 'tattoo', 'video', 'jewelry', 'record', 'tanning',
                'nail', 'phone', 'flower', 'bridal', 'hobby', 'camera', 'wash',
                'dealership', 'antique', 'garden', 'board', 'newsstand', 'motorcycle'
            ],
            
            'Nightlife Spot': [
                'bar', 'beer garden', 'nightlife'
            ],
            
            'Arts & Entertainment': [
                'theater', 'movie', 'music', 'entertainment', 'performing',
                'art', 'gallery', 'library', 'arcade', 'museum', 'concert',
                'historic', 'bowling', 'comedy', 'casino', 'pool hall',
                'winery', 'gaming', 'planetarium'
            ],
            
            'Professional & Workplace': [
                'office', 'building', 'medical', 'bank', 'government', 'church',
                'post', 'city', 'moving', 'synagogue', 'design', 'factory',
                'funeral', 'spiritual', 'work' , 'animal', 'temple', 'recycling',
                'financial', 'legal', 'shrine', 'workplace', 'job', 'embassy', 'consulate',
                'mosque', 'military', 'storage', 'internet', 'photography', 'service'
            ],
            
            'Outdoors & Recreation': [
                'gym', 'fitness', 'athletic', 'park', 'outdoors', 'bridge',
                'plaza', 'stadium', 'beach', 'playground', 'scenic', 'harbor',
                'marina', 'garden', 'campground', 'pool', 'cemetery', 'river',
                'zoo', 'racetrack', 'ski', 'castle', 'aquarium'
            ],
            
            'College & University': [
                'college', 'school', 'university', 'academic', 'student',
                'campus', 'fraternity', 'sorority', 'education'
            ],
            
            'Residence': [
                'home', 'residential', 'apartment', 'condo', 'neighborhood',
                'housing'
            ],
            
            'Event': [
                'event', 'convention', 'market', 'fair'
            ]
        }

        # Activity type ID mapping (main category name -> integer id).
        # 'Other' has no keyword list; normalize_category() returns it when no
        # keyword matches.
        self.activity_map = {
            'Travel & Transport': 0,
            'Food': 1, 
            'Shop & Service': 2,
            'Nightlife Spot': 3,
            'Arts & Entertainment': 4,
            'Professional & Workplace': 5,
            'Outdoors & Recreation': 6,
            'College & University': 7,
            'Residence': 8,
            'Event': 9,
            'Other': 10
        }

    def parse_real_time(self, time_str):
        """Parse real data time format (Tue Apr 03 18:00:09 +0000 2012)"""
        try:
            dt = datetime.strptime(time_str, '%a %b %d %H:%M:%S %z %Y')
            return dt
        except ValueError:
            print(f"Cannot parse real data time string: {time_str}")
            return None
    
    def parse_gen_time(self, time_str):
        """Parse generated data time format (2025/5/28 8:00)"""
        try:
            dt = datetime.strptime(time_str, '%Y/%m/%d %H:%M')
            return dt
        except ValueError:
            try:
                dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
                return dt
            except ValueError:
                print(f"Cannot parse generated data time string: {time_str}")
                return None
    
    def time_to_bin(self, dt):
        """Convert time to time interval index (10-minute intervals)"""
        total_minutes = dt.hour * 60 + dt.minute
        return min(int(total_minutes // 10), self.time_bins - 1)
    
    def latlon_to_grid(self, lat, lon):
        """Convert latitude and longitude to grid index"""
        if not (self.lat_min <= lat <= self.lat_max and self.lon_min <= lon <= self.lon_max):
            return -1  # Out of range
        
        lat_idx = int((lat - self.lat_min) / self.grid_size)
        lon_idx = int((lon - self.lon_min) / self.grid_size)
        return min(lat_idx * self.n_lon + lon_idx, self.grid_total - 1)

    def normalize_category(self, category):
        """Normalize venue category"""
        if not isinstance(category, str):
            return 'Unknown'
        
        category_lower = f" {category.lower()} "
        scores = {cat: 0 for cat in self.category_keywords.keys()}
        
        # Calculate matching score for each category
        for main_category, keywords in self.category_keywords.items():
            for keyword in keywords:
                if f" {keyword} " in category_lower:
                    scores[main_category] += 1
        
        # Return category with highest score
        max_score = max(scores.values())
        if max_score > 0:
            return max(scores.items(), key=lambda x: x[1])[0]
        
        return 'Other'
    
    def get_activity_id(self, category):
        """Get activity type ID"""
        normalized_category = self.normalize_category(category)
        return self.activity_map.get(normalized_category, 5)  # Default to 'Professional & Other Places'

    def analyze_real_trajectories(self, real_df):
        """Analyze real trajectory data"""
        traj_data = []
        
        # Group trajectories by user and date
        real_df['date'] = real_df['utcTimestamp'].apply(
            lambda x: self.parse_real_time(x).strftime('%Y-%m-%d') if self.parse_real_time(x) else None
        )
        
        # Remove records with invalid dates
        real_df = real_df.dropna(subset=['date'])
        
        for (user_id, date), group in real_df.groupby(['userId', 'date']):
            user_traj = []
            group = group.sort_values('utcTimestamp')
            
            for _, row in group.iterrows():
                dt = self.parse_real_time(row['utcTimestamp'])
                if dt is None:
                    continue
                
                grid_idx = self.latlon_to_grid(row['latitude'], row['longitude'])
                
                if grid_idx >= 0:
                    # Store actual time instead of time interval
                    act_id = self.get_activity_id(row['venueCategory'])
                    user_traj.append([dt, act_id, [row['latitude'], row['longitude']]])
            
            if len(user_traj) > 1:
                traj_data.append(sorted(user_traj, key=lambda x: x[0]))
        
        return traj_data

    def analyze_generated_trajectories(self, gen_df):
        """Analyze generated trajectory data"""
        traj_data = []
        
        # Ensure timestamp column format is correct
        gen_df['datetime'] = gen_df['timestamp'].apply(self.parse_gen_time)
        
        # Remove records with invalid dates
        gen_df = gen_df.dropna(subset=['datetime'])
        
        # Group trajectories by user and date
        gen_df['date'] = gen_df['datetime'].apply(lambda x: x.strftime('%Y-%m-%d'))
        
        for (user_id, date), group in gen_df.groupby(['userId', 'date']):
            user_traj = []
            group = group.sort_values('datetime')
            
            for idx, row in group.iterrows():
                dt = row['datetime']
                
                # Handle origin point
                origin_grid = self.latlon_to_grid(row['origin_lat'], row['origin_lon'])
                if origin_grid >= 0:
                    origin_act_id = self.get_activity_id(row['origin_category'])
                    user_traj.append([dt, origin_act_id, [row['origin_lat'], row['origin_lon']]])
                
                # Handle destination point
                dest_dt = dt + timedelta(minutes=10)  # Assume arrival time is 10 minutes after departure
                dest_grid = self.latlon_to_grid(row['destination_lat'], row['destination_lon'])
                if dest_grid >= 0:
                    dest_act_id = self.get_activity_id(row['destination_category'])
                    user_traj.append([dest_dt, dest_act_id, [row['destination_lat'], row['destination_lon']]])
                    
                # If this is the last row, add the final destination point
                if idx == group.index[-1]:
                    final_dt = dest_dt + timedelta(minutes=10)  # Add additional time for the final destination
                    if dest_grid >= 0:
                        user_traj.append([final_dt, dest_act_id, [row['destination_lat'], row['destination_lon']]])
            
            if len(user_traj) > 1:
                # Sort by time and remove duplicate points
                user_traj = sorted(user_traj, key=lambda x: x[0])
                # Remove points with same time and location
                filtered_traj = []
                for i, point in enumerate(user_traj):
                    if i == 0 or point[0] != user_traj[i-1][0] or point[2] != user_traj[i-1][2]:
                        filtered_traj.append(point)
                
                if len(filtered_traj) > 1:
                    traj_data.append(filtered_traj)
        
        return traj_data

    # def analyze_generated_trajectories(self, gen_df):
    #     """Analyze generated trajectory data"""
    #     traj_data = []
        
    #     # Ensure timestamp columns format is correct
    #     gen_df['arrival_time'] = gen_df['times_arr'].apply(self.parse_gen_time)
    #     gen_df['departure_time'] = gen_df['time_dep'].apply(self.parse_gen_time)
        
    #     # Remove records with invalid dates
    #     gen_df = gen_df.dropna(subset=['arrival_time', 'departure_time'])
        
    #     # Group trajectories by user and date
    #     gen_df['date'] = gen_df['arrival_time'].apply(lambda x: x.strftime('%Y-%m-%d'))
        
    #     for (user_id, date), group in gen_df.groupby(['userId', 'date']):
    #         user_traj = []
    #         group = group.sort_values('arrival_time')
            
    #         for idx, row in group.iterrows():
    #             # Handle point with arrival time
    #             origin_grid = self.latlon_to_grid(row['origin_lat'], row['origin_lon'])
    #             if origin_grid >= 0:
    #                 origin_act_id = self.get_activity_id(row['origin_category'])
    #                 # Add point with arrival time
    #                 user_traj.append([row['arrival_time'], origin_act_id, 
    #                                 [row['origin_lat'], row['origin_lon']]])
                    
    #                 # Add the same point with departure time
    #                 user_traj.append([row['departure_time'], origin_act_id, 
    #                                 [row['origin_lat'], row['origin_lon']]])
                
    #             # If this is the last row, add the final destination point
    #             if idx == group.index[-1]:
    #                 dest_grid = self.latlon_to_grid(row['destination_lat'], row['destination_lon'])
    #                 if dest_grid >= 0:
    #                     dest_act_id = self.get_activity_id(row['destination_category'])
    #                     user_traj.append([row['departure_time'], 
    #                                     dest_act_id, 
    #                                     [row['destination_lat'], row['destination_lon']]])
            
    #         if len(user_traj) > 1:
    #             # Sort by time and remove duplicate points
    #             user_traj = sorted(user_traj, key=lambda x: x[0])
                
    #             # Remove points with same time and location
    #             filtered_traj = []
    #             for i, point in enumerate(user_traj):
    #                 if i == 0 or point[0] != user_traj[i-1][0] or point[2] != user_traj[i-1][2]:
    #                     filtered_traj.append(point)
                
    #             if len(filtered_traj) > 1:
    #                 traj_data.append(filtered_traj)
        
    #     return traj_data
    
    def calculate_distance_distribution(self, trajectories):
        """Calculate distance distribution between consecutive points"""
        distances = []
        for traj in trajectories:
            for i in range(len(traj) - 1):
                curr_loc = traj[i][2]  # [lat, lon]
                next_loc = traj[i+1][2]  # [lat, lon]
                dist = geodistance(curr_loc[1], curr_loc[0], next_loc[1], next_loc[0])
                if dist > 0:  # Exclude same locations
                    distances.append(dist)
        return np.array(distances)
    
    def calculate_duration_distribution(self, trajectories):
        """Calculate stay duration distribution between consecutive points (minutes)"""
        durations = []
        for traj in trajectories:
            for i in range(len(traj) - 1):
                # For real data, time is already datetime object
                curr_time = traj[i][0]
                next_time = traj[i+1][0]
                
                # Calculate time difference (minutes)
                if isinstance(curr_time, datetime) and isinstance(next_time, datetime):
                    # If datetime objects, calculate directly
                    duration = (next_time - curr_time).total_seconds() / 60
                else:
                    # If still time intervals, convert to minutes
                    duration = (next_time - curr_time) * 10
                
                if duration > 0:  # Ignore non-positive values
                    durations.append(duration)
        
        return np.array(durations)
    
    def calculate_activity_distribution(self, trajectories):
        """Calculate activity type distribution"""
        # Initialize counters for all activity types
        act_counts = np.zeros(len(self.activity_map))
        
        for traj in trajectories:
            for point in traj:
                act_id = point[1]
                if 0 <= act_id < len(act_counts):
                    act_counts[act_id] += 1
        
        return act_counts
    
    def calculate_location_distribution(self, trajectories):
        """Calculate spatial location distribution"""
        loc_counts = np.zeros(self.grid_total)
        
        for traj in trajectories:
            for point in traj:
                lat, lon = point[2]
                grid_idx = self.latlon_to_grid(lat, lon)
                if grid_idx >= 0:
                    loc_counts[grid_idx] += 1
        
        return loc_counts
    
    def calculate_st_activity_distribution(self, trajectories):
        """Calculate spatio-temporal activity distribution (reference st_act_jsd in code)"""
        st_act_dict = {}
        indices = []
        
        # Build unified spatio-temporal activity dictionary
        for traj in trajectories:
            for point in traj:
                dt, act_id, _ = point
                time_bin = self.time_to_bin(dt)
                key = (time_bin, act_id)
                if key not in st_act_dict:
                    st_act_dict[key] = len(st_act_dict)
                indices.append(st_act_dict[key])
        
        return np.array(indices)
    
    def calculate_st_location_distribution(self, trajectories):
        """Calculate spatio-temporal location distribution (reference st_loc_jsd in code)"""
        st_loc_dict = {}
        indices = []
        
        # Build unified spatio-temporal location dictionary
        for traj in trajectories:
            for point in traj:
                dt, _, loc = point
                time_bin = self.time_to_bin(dt)
                grid_idx = self.latlon_to_grid(loc[0], loc[1])
                if grid_idx < 0:  # Skip invalid locations
                    continue
                key = (time_bin, grid_idx)
                if key not in st_loc_dict:
                    st_loc_dict[key] = len(st_loc_dict)
                indices.append(st_loc_dict[key])
        
        return np.array(indices)
    
    def calculate_jsd_with_bins(self, real_arr, gen_arr, min_val, max_val, bins, total_bins=None):
        """Unified JSD calculation logic (consistent with reference code)"""
        # Normalization
        if total_bins is not None and len(real_arr) > 0 and len(gen_arr) > 0:
            min_idx = min(np.min(real_arr), np.min(gen_arr))
            max_idx = max(np.max(real_arr), np.max(gen_arr))
            real_arr = (real_arr - min_idx) / (max_idx - min_idx + 1e-9)
            gen_arr = (gen_arr - min_idx) / (max_idx - min_idx + 1e-9)
            min_val, max_val = 0, 1
        
        # Bin statistics
        real_hist, _ = np.histogram(real_arr, bins=bins, range=(min_val, max_val))
        gen_hist, _ = np.histogram(gen_arr, bins=bins, range=(min_val, max_val))
        
        # Handle out-of-range values
        real_out = len(real_arr[real_arr > max_val])
        gen_out = len(gen_arr[gen_arr > max_val])
        
        real_hist = np.append(real_hist, real_out)
        gen_hist = np.append(gen_hist, gen_out)
        
        return js_divergence(real_hist, gen_hist)
    
    def arr_to_distribution(self, arr, min_val, max_val, bins):
        """Convert array to distribution"""
        if len(arr) == 0:
            return np.zeros(bins)
        
        # Handle out-of-range values
        in_range = arr[(arr >= min_val) & (arr <= max_val)]
        out_range = arr[arr > max_val]
        
        # Calculate distribution
        distribution, _ = np.histogram(in_range, bins=bins, range=(min_val, max_val))
        
        # Add out-of-range values to last bin
        if len(out_range) > 0:
            distribution = np.append(distribution, len(out_range))
        else:
            distribution = np.append(distribution, 0)  # Ensure consistent distribution length
        
        return distribution

    def calculate_jsd_metrics(self, real_trajectories, gen_trajectories):
        """Compare real and generated trajectories with four JSD scores.

        Returns:
            A dict with the keys 'SD' (step distance), 'SI' (step interval),
            'DARD' (daily activity routine distribution) and 'STVD'
            (spatial-temporal visits distribution). 'SI' is ``nan`` when
            either dataset has no positive time gaps.
        """
        # SD: step distances, 0-10 km in 10 bins (plus overflow).
        sd_real = self.calculate_distance_distribution(real_trajectories)
        sd_gen = self.calculate_distance_distribution(gen_trajectories)
        sd_score = self.calculate_jsd_with_bins(sd_real, sd_gen, 0, 10, 10)

        # SI: time gaps in minutes, 50 bins from 0 up to the larger of the
        # two 95th percentiles (plus overflow).
        si_real = self.calculate_duration_distribution(real_trajectories)
        si_gen = self.calculate_duration_distribution(gen_trajectories)
        if len(si_real) == 0 or len(si_gen) == 0:
            print("Warning: Insufficient duration data, cannot calculate SI metric")
            si_score = np.nan
        else:
            upper = max(np.percentile(si_real, 95), np.percentile(si_gen, 95))
            si_real_hist = self.arr_to_distribution(si_real, 0, upper, 50)
            si_gen_hist = self.arr_to_distribution(si_gen, 0, upper, 50)
            si_score = js_divergence(si_real_hist, si_gen_hist)

        # DARD: (time slot, activity) codes, rescaled to [0, 1], 1000 bins.
        act_real = self.calculate_st_activity_distribution(real_trajectories)
        act_gen = self.calculate_st_activity_distribution(gen_trajectories)
        dard_score = self.calculate_jsd_with_bins(act_real, act_gen, 0, 1, 1000, total_bins=True)

        # STVD: (time slot, grid cell) codes, rescaled to [0, 1], 400 bins.
        loc_real = self.calculate_st_location_distribution(real_trajectories)
        loc_gen = self.calculate_st_location_distribution(gen_trajectories)
        stvd_score = self.calculate_jsd_with_bins(loc_real, loc_gen, 0, 1, 400, total_bins=True)

        return {
            'SD': sd_score,
            'SI': si_score,
            'DARD': dard_score,
            'STVD': stvd_score,
        }
    
    def print_trajectory_stats(self, trajectories, name):
        """Print trajectory statistics"""
        if not trajectories:
            print(f"{name} dataset: No valid trajectories")
            return
        
        num_trajectories = len(trajectories)
        total_points = sum(len(traj) for traj in trajectories)
        avg_points_per_traj = total_points / num_trajectories
        
        # Calculate total distance
        total_distance = 0
        num_segments = 0
        for traj in trajectories:
            for i in range(len(traj) - 1):
                curr_loc = traj[i][2]  # [lat, lon]
                next_loc = traj[i+1][2]  # [lat, lon]
                dist = geodistance(curr_loc[1], curr_loc[0], next_loc[1], next_loc[0])
                total_distance += dist
                num_segments += 1
        
        avg_distance = total_distance / max(num_segments, 1)
        
        # Activity type statistics
        activity_counts = defaultdict(int)
        for traj in trajectories:
            for point in traj:
                act_id = point[1]
                activity_counts[act_id] += 1
        
        # Reverse mapping from activity ID to name
        act_id_to_name = {v: k for k, v in self.activity_map.items()}
        top_activities = sorted(activity_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        
        print(f"{name} Dataset Statistics:")
        print(f"  Number of trajectories: {num_trajectories}")
        print(f"  Total points: {total_points}")
        print(f"  Average points per trajectory: {avg_points_per_traj:.2f}")
        print(f"  Average movement distance: {avg_distance:.2f} km")
        print(f"  Most common activity types:")
        for act_id, count in top_activities:
            act_name = act_id_to_name.get(act_id, f"Unknown activity({act_id})")
            percentage = count / total_points * 100
            print(f"    - {act_name}: {count} times ({percentage:.1f}%)")
        print()

def main(real_traj_path, gen_traj_path):
    """Compare generated trajectories against real ones and print the results.

    Loads both CSV files, builds trajectories with ``TrajectoryAnalyzer``,
    prints per-dataset statistics and the four JSD metrics.

    Args:
        real_traj_path: Path of the real check-in CSV.
        gen_traj_path: Path of the generated-trajectory CSV.

    Returns:
        The metrics dict returned by
        ``TrajectoryAnalyzer.calculate_jsd_metrics``.
    """
    # Load data
    real_df = pd.read_csv(real_traj_path)
    gen_df = pd.read_csv(gen_traj_path)

    # Initialize analyzer (using New York range). grid_size=0.001 is finer
    # than the class default of 0.01.
    analyzer = TrajectoryAnalyzer(
        grid_size=0.001,
        lat_range=(40.5, 40.9),  # New York latitude range
        lon_range=(-74.25, -73.7)  # New York longitude range
    )

    # Analyze real trajectories
    real_trajectories = analyzer.analyze_real_trajectories(real_df)
    analyzer.print_trajectory_stats(real_trajectories, "Real")

    # Analyze generated trajectories
    gen_trajectories = analyzer.analyze_generated_trajectories(gen_df)
    analyzer.print_trajectory_stats(gen_trajectories, "Generated")

    # Calculate JSD metrics
    metrics = analyzer.calculate_jsd_metrics(real_trajectories, gen_trajectories)

    def _fmt(key):
        # A missing metric is shown as 'N/A'. The float format spec must not
        # be applied to that placeholder string (it would raise ValueError).
        value = metrics.get(key)
        return 'N/A' if value is None else f"{value:.3f}"

    # Print results
    print("\nTrajectory Consistency Analysis Results (Jensen-Shannon Divergence):")
    print(f"1. Step distance (SD): {_fmt('SD')}")
    print(f"2. Step interval (SI): {_fmt('SI')}")
    print(f"3. Daily activity routine distribution (DARD): {_fmt('DARD')}")
    print(f"4. Spatial-temporal visits distribution (STVD): {_fmt('STVD')}")

    return metrics


if __name__ == "__main__":
    # Input files: the real check-in CSV and the generated-trajectory CSV,
    # both loaded by main() with pandas.
    real_traj_path = './dataset/dataset_TSMC2014_NYC.csv'
    gen_traj_path = './output/generated_trajectories_CoT.csv'
    # Alternative generated-trajectory file (absolute local path), kept for reference:
    # gen_traj_path = r'D:\A_Research\A_doing_research\20250526_LLM_causal_inference\output_results\generated_trajectories_gpt4o.csv'
    
    # Run the comparison; the returned metrics dict is not used here.
    main(real_traj_path, gen_traj_path)

Real Dataset Statistics:
  Number of trajectories: 47322
  Total points: 171040
  Average points per trajectory: 3.61
  Average movement distance: 3.18 km
  Most common activity types:
    - Food: 41431 times (24.2%)
    - Shop & Service: 36634 times (21.4%)
    - Travel & Transport: 27987 times (16.4%)
    - Outdoors & Recreation: 18784 times (11.0%)
    - Professional & Workplace: 14170 times (8.3%)

Generated Dataset Statistics:
  Number of trajectories: 1117
  Total points: 20776
  Average points per trajectory: 18.60
  Average movement distance: 2.51 km
  Most common activity types:
    - Food: 6403 times (30.8%)
    - Shop & Service: 4760 times (22.9%)
    - Nightlife Spot: 2485 times (12.0%)
    - Outdoors & Recreation: 2067 times (9.9%)
    - Arts & Entertainment: 1212 times (5.8%)


Trajectory Consistency Analysis Results (Jensen-Shannon Divergence):
1. Step distance (SD): 0.039
2. Step interval (SI): 0.292
3. Daily activity routine distribution (DARD): 0.272
4. Spatial-tempor