In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from datetime import datetime
import math
from math import sin, cos, asin, sqrt, radians

class TrajectoryEvaluation:
    """Compare real and generated trajectories via Jensen-Shannon divergences.

    A trajectory is a list of points ``[slot, category, [lat, lon]]`` where
    ``slot`` is the 10-minute time-of-day index (0-143) of the visit.
    """

    def __init__(self):
        pass

    def geodistance(self, lng1, lat1, lng2, lat2):
        """Return the great-circle (haversine) distance in kilometres.

        Inputs are longitude/latitude in degrees (anything ``float()``
        accepts). The result is rounded to 3 decimals.
        """
        lng1, lat1, lng2, lat2 = map(
            radians, [float(lng1), float(lat1), float(lng2), float(lat2)]
        )
        dlon = lng2 - lng1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Rounding error can push `a` marginally outside [0, 1], which would
        # make sqrt/asin raise ValueError; clamp it to the valid domain.
        a = min(1.0, max(0.0, a))
        distance = 2 * asin(sqrt(a)) * 6371 * 1000  # metres, Earth radius 6371 km
        return round(distance / 1000, 3)

    def arr_to_distribution(self, arr, Min, Max, bins):
        """Histogram ``arr`` over ``[Min, Max]`` plus one overflow bin.

        Returns ``(counts, left_edges)``. ``counts`` has ``bins + 1`` entries:
        the regular histogram followed by the number of values ``> Max``.
        ``left_edges`` holds the left edge of each regular bin. Values below
        ``Min`` are not counted anywhere.
        """
        arr = np.asarray(arr)  # also accept plain lists
        distribution, base = np.histogram(
            arr[arr <= Max], bins=bins, range=(Min, Max)
        )
        overflow = np.array([np.count_nonzero(arr > Max)], dtype='int64')
        return np.hstack((distribution, overflow)), base[:-1]

    def get_js_divergence(self, p1, p2):
        """Return the Jensen-Shannon divergence (natural log) of two count vectors.

        ``p1`` and ``p2`` must have the same length. If either has no mass the
        divergence is undefined and NaN is returned.
        """
        p1 = np.asarray(p1, dtype=float)
        p2 = np.asarray(p2, dtype=float)
        if p1.sum() <= 0 or p2.sum() <= 0:
            # Explicit NaN instead of reaching it through 0/0 inside entropy().
            return float('nan')
        p1 = p1 / (p1.sum() + 1e-9)
        p2 = p2 / (p2.sum() + 1e-9)
        m = (p1 + p2) / 2
        return 0.5 * entropy(p1, m) + 0.5 * entropy(p2, m)

    def prepare_trajectory_data(self, df):
        """Build one chronologically ordered trajectory per user.

        ``df`` must have a ``userId`` column plus either the
        ``utcTimestamp``/``venueCategory``/``latitude``/``longitude`` columns
        or their ``timestamp``/``destination_category``/``destination_lat``/
        ``destination_lon`` fallbacks.

        Returns a list of trajectories in order of each user's first
        appearance in ``df``; every point is ``[slot, category, [lat, lon]]``.
        """
        columns = df.columns
        time_col = 'utcTimestamp' if 'utcTimestamp' in columns else 'timestamp'
        cat_col = 'venueCategory' if 'venueCategory' in columns else 'destination_category'
        lat_col = 'latitude' if 'latitude' in columns else 'destination_lat'
        lon_col = 'longitude' if 'longitude' in columns else 'destination_lon'

        # Parse timestamps once for the whole frame and sort on the parsed
        # values: sorting the raw column orders string timestamps
        # lexicographically, which is not chronological in general.
        parsed = pd.to_datetime(df[time_col])
        # 10-minute slot of the day (0-143).
        slots = (parsed.dt.hour * 60 + parsed.dt.minute) // 10
        work = df.assign(_parsed_time=parsed, _slot=slots)

        trajectories = []
        # sort=False keeps users in order of first appearance.
        for _, rows in work.groupby('userId', sort=False):
            rows = rows.sort_values('_parsed_time', kind='stable')
            trajectories.append([
                [slot, category, [lat, lon]]
                for slot, category, lat, lon in zip(
                    rows['_slot'].values,
                    rows[cat_col].values,
                    rows[lat_col].values,
                    rows[lon_col].values,
                )
            ])
        return trajectories

    def calculate_step_distance(self, trajectories):
        """Return the distances (km) between consecutive points of all trajectories."""
        distances = []
        for traj in trajectories:
            for current, following in zip(traj, traj[1:]):
                lat1, lon1 = current[2]
                lat2, lon2 = following[2]
                distances.append(self.geodistance(lon1, lat1, lon2, lat2))
        return np.array(distances)

    def calculate_step_interval(self, trajectories):
        """Return slot differences between consecutive points, in minutes.

        NOTE(review): slots are time-of-day only, so a step that crosses
        midnight yields a negative value; confirm that is intended.
        """
        intervals = []
        for traj in trajectories:
            for current, following in zip(traj, traj[1:]):
                intervals.append((following[0] - current[0]) * 10)  # slots -> minutes
        return np.array(intervals)

    def _encode_points(self, trajectories, key_of, vocab):
        """Map every point to an integer id through ``vocab`` (extended in place)."""
        if vocab is None:
            vocab = {}
        ids = []
        for traj in trajectories:
            for point in traj:
                # len(vocab) is evaluated before insertion, so new keys get
                # the next free id.
                ids.append(vocab.setdefault(key_of(point), len(vocab)))
        return np.array(ids, dtype='int64')

    def calculate_dard(self, trajectories, vocab=None):
        """Encode each point's (slot, category) pair as an integer id.

        Pass the same ``vocab`` dict for several trajectory sets to get ids
        that are comparable between them; it is extended in place. Without
        it, ids are assigned in first-seen order for this call only.
        """
        return self._encode_points(
            trajectories, lambda p: f"{p[0]}_{p[1]}", vocab
        )

    def calculate_stvd(self, trajectories, vocab=None):
        """Encode each point's (slot, lat, lon) triple as an integer id.

        ``vocab`` behaves as in :meth:`calculate_dard`.
        """
        return self._encode_points(
            trajectories, lambda p: f"{p[0]}_{p[2][0]}_{p[2][1]}", vocab
        )

    def _categorical_jsd(self, real_ids, gen_ids, size):
        """JSD between the id frequencies of two arrays sharing one vocabulary."""
        real_counts = np.bincount(real_ids, minlength=size)
        gen_counts = np.bincount(gen_ids, minlength=size)
        return self.get_js_divergence(real_counts, gen_counts)

    def evaluate(self, real_df, gen_df):
        """Compute all four divergences between real and generated data.

        Returns a dict with keys ``sd_jsd``, ``si_jsd``, ``dard_jsd`` and
        ``stvd_jsd``.
        """
        real_trajectories = self.prepare_trajectory_data(real_df)
        gen_trajectories = self.prepare_trajectory_data(gen_df)

        # 1. Step Distance (SD): 10 bins over 0-10 km plus overflow.
        real_sd = self.calculate_step_distance(real_trajectories)
        gen_sd = self.calculate_step_distance(gen_trajectories)
        sd_dist_real, _ = self.arr_to_distribution(real_sd, 0, 10, 10)
        sd_dist_gen, _ = self.arr_to_distribution(gen_sd, 0, 10, 10)
        sd_jsd = self.get_js_divergence(sd_dist_real, sd_dist_gen)

        # 2. Step Interval (SI).
        # NOTE(review): intervals are in minutes (multiples of 10) while the
        # histogram range is 0-12, so most steps land in the overflow bin;
        # confirm the intended unit/range.
        real_si = self.calculate_step_interval(real_trajectories)
        gen_si = self.calculate_step_interval(gen_trajectories)
        si_dist_real, _ = self.arr_to_distribution(real_si, 0, 12, 12)
        si_dist_gen, _ = self.arr_to_distribution(gen_si, 0, 12, 12)
        si_jsd = self.get_js_divergence(si_dist_real, si_dist_gen)

        # 3. Daily Activity Routine Distribution (DARD).
        # Both sets share one vocabulary so an id means the same
        # (slot, category) pair in each, and frequencies are counted per id
        # instead of being histogrammed over a fixed numeric range.
        dard_vocab = {}
        real_dard = self.calculate_dard(real_trajectories, dard_vocab)
        gen_dard = self.calculate_dard(gen_trajectories, dard_vocab)
        dard_jsd = self._categorical_jsd(real_dard, gen_dard, len(dard_vocab))

        # 4. Spatial-temporal Visits Distribution (STVD), same scheme.
        stvd_vocab = {}
        real_stvd = self.calculate_stvd(real_trajectories, stvd_vocab)
        gen_stvd = self.calculate_stvd(gen_trajectories, stvd_vocab)
        stvd_jsd = self._categorical_jsd(real_stvd, gen_stvd, len(stvd_vocab))

        return {
            'sd_jsd': sd_jsd,
            'si_jsd': si_jsd,
            'dard_jsd': dard_jsd,
            'stvd_jsd': stvd_jsd
        }

# Usage example: load the real and generated check-in tables, score the
# generated trajectories against the real ones, and print the four metrics.
evaluator = TrajectoryEvaluation()
real_df = pd.read_csv(
    r'D:\A_Research\A_doing_research\20250526_LLM_causal_inference\dataset\dataset_TSMC2014_NYC.csv'
)
gen_df = pd.read_csv('generated_trajectories.csv')
metrics = evaluator.evaluate(real_df, gen_df)

# One report line per metric, each formatted to four decimals.
print(
    f"Step Distance JSD: {metrics['sd_jsd']:.4f}",
    f"Step Interval JSD: {metrics['si_jsd']:.4f}",
    f"Daily Activity Routine Distribution JSD: {metrics['dard_jsd']:.4f}",
    f"Spatial-temporal Visits Distribution JSD: {metrics['stvd_jsd']:.4f}",
    sep="\n",
)

  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTimestamp' if 'utcTimestamp' in df.columns else 'timestamp'])
  timestamps = pd.to_datetime(user_traj['utcTi

Step Distance JSD: 0.0132
Step Interval JSD: 0.0835
Daily Activity Routine Distribution JSD: 0.0028
Spatial-temporal Visits Distribution JSD: 0.0034
