In [133]:
from tpm.data_model import *
from tpm.util.io import read_geolife
from tpm.util.dist import haversine_distance
from tpm.preprocessing import time_duplication_filter
from tpm.preprocessing import speed_filter_abs
import numpy as np
import pandas as pd
import folium
from datetime import timedelta

In [2]:
trajs = read_geolife('/mnt/hdd1/christian/data/geotracking/Geolife Trajectories 1.3/Data/004/Trajectory')


In [118]:
preprocessed = list()
for traj in trajs:
    traj_new = time_duplication_filter(traj)
    traj_new = speed_filter_abs(traj_new, 300, in_kmh=True)
    preprocessed.append(traj_new)


In [119]:
def staypoints_geolife(traj):
    time_thresh = 30*60
    dist_thresh = 200

    staypoints = list()
    i, i_max = 0, len(traj)
    while i < i_max:
        j = i+1
        token = 0
        while j < i_max:
            dist = haversine_distance(traj[i], traj[j])
            if dist > dist_thresh:
                delta_time = traj[j].datetime - traj[i].datetime
                if delta_time.total_seconds() > time_thresh:
                    mean_point = np.mean([[p.lat, p.lon] for p in traj[i:j+1]], axis=0)
                    arrival_time = traj[i].datetime
                    leave_time = traj[j].datetime
                    staypoints.append([mean_point, arrival_time, leave_time, i, j])
                    i = j
                    token = 1
                break
            j = j+1
        if not token == 1:
            i = i+1

    
    return staypoints

In [120]:
len(trajs)

395

In [134]:
def make_df(trajs):
    
    def haversine_distance(p1_lat, p1_lon, p2_lat, p2_lon):
        from math import asin
        lat_rad1 = radians(p1_lat)
        lon_rad1 = radians(p1_lon)
        lat_rad2 = radians(p2_lat)
        lon_rad2 = radians(p2_lon)
        return 2*R * asin(sqrt(sin((lat_rad2-lat_rad1)/2)**2 + cos(lat_rad1)*cos(lat_rad2)*(sin((lon_rad2-lon_rad1)/2)**2)))
    
    data = list()
    for traj in trajs:
        fp = traj[0]
        sps = staypoints_geolife(traj)
        lp = traj[-1]               
        
        if len(sps) > 1:
            data.append([fp.lat, fp.lon, fp.datetime, sps[0][0][0], sps[0][0][1], sps[0][1]])
            for i in range(1, len(sps)-1):
                data.append([sps[i][0][0], sps[i][0][1], sps[i][1], sps[i+1][0][0], sps[i+1][0][1], sps[i+1][2]])
            data.append([sps[-1][0][0], sps[-1][0][1], sps[-1][2], lp.lat, lp.lon, lp.datetime])
        else:
            data.append([fp.lat, fp.lon, fp.datetime, lp.lat, lp.lon, lp.datetime])
        
    filtered_data = list()
    for row in data:
        if haversine_distance(row[0],row[1],row[3],row[4]) > 50:
            filtered_data.append(row)

    
    df = pd.DataFrame(filtered_data, columns=['start_lat','start_lon','start_date','end_lat','end_lon','end_date'])
    df = df.set_index(pd.DatetimeIndex(df['start_date'])).sort_index()
    return df

In [135]:
df = make_df(preprocessed)
df


Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date
2008-10-24 02:58:52,39.999973,116.327148,2008-10-24 02:58:52,40.010681,116.321487,2008-10-24 03:08:48
2008-10-24 10:54:54,40.010948,116.321434,2008-10-24 10:54:54,40.000034,116.327583,2008-10-24 11:13:39
2008-10-24 18:27:39,39.999493,116.326843,2008-10-24 18:27:39,39.993038,116.327057,2008-10-24 18:31:29
2008-10-25 00:58:59,39.999756,116.326965,2008-10-25 00:58:59,40.006382,116.321548,2008-10-25 01:05:14
2008-10-25 13:57:55,40.006138,116.320770,2008-10-25 13:57:55,39.999893,116.327354,2008-10-25 19:38:21
2008-10-26 03:24:32,39.999149,116.327965,2008-10-26 03:24:32,40.011120,116.322037,2008-10-26 03:34:07
2008-10-26 15:48:37,40.010170,116.322128,2008-10-26 15:48:37,39.991360,116.320969,2008-10-26 16:20:43
2008-10-26 17:04:51,39.999165,116.327057,2008-10-26 17:04:51,39.968838,116.415939,2008-10-26 22:34:47
2008-10-26 22:34:47,39.968838,116.415939,2008-10-26 22:34:47,40.000736,116.327263,2008-10-26 23:24:02
2008-10-27 03:46:27,39.999844,116.327133,2008-10-27 03:46:27,40.010681,116.321976,2008-10-27 03:55:27


In [130]:
counter = 0
for i, row in df.iterrows():
    
        counter +=1
        
print(counter)

30


In [8]:
from sklearn.cluster import dbscan
from sklearn.cluster import k_means
from sklearn.neighbors import DistanceMetric
from tpm.data_model import R
from collections import Counter

In [127]:
from math import asin
def haversine_distance(p1_lat, p1_lon, p2_lat, p2_lon):
    lat_rad1 = radians(p1_lat)
    lon_rad1 = radians(p1_lon)
    lat_rad2 = radians(p2_lat)
    lon_rad2 = radians(p2_lon)
    return 2*R * asin(sqrt(sin((lat_rad2-lat_rad1)/2)**2 + cos(lat_rad1)*cos(lat_rad2)*(sin((lon_rad2-lon_rad1)/2)**2)))

In [99]:
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.radians(np.vstack([start_points, end_points]))
    
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points)*R

    clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)
    
    
    for _ in range(levels):
        init_eps = init_eps*0.3
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold*length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs])*R
                inner_clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    
                    clusters[idx] = "{}_{}".format(clusters[idx], inner_clusters[i])
    
    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])
    
    print(len(dict(Counter(clusters)).keys()))
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [97]:
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.radians(np.vstack([start_points, end_points]))
    
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points)*R

    clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)
    
    k_means()
    
    for _ in range(levels):
        init_eps = init_eps*0.3
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold*length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs])*R
                inner_clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    
                    clusters[idx] = "{}_{}".format(clusters[idx], inner_clusters[i])
    
    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])
    
    print(len(dict(Counter(clusters)).keys()))
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [12]:
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import cnames
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift

In [13]:
def agglomerative_cluster_into_spots(df, min_cluster, max_cluster):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    sil_scores = list()
    for i in range(min_cluster, max_cluster):
        ac = AgglomerativeClustering(n_clusters=i)
        pred = ac.fit_predict(points)
        sil_score = silhouette_score(points, pred)
        sil_scores.append(sil_score)

    
    n_cluster = np.argmax(sil_scores) + min_cluster
    ac = AgglomerativeClustering(n_clusters=n_cluster)
    clusters = ac.fit_predict(points)
    
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [14]:
def kmeans_cluster_into_spots(df, min_cluster, max_cluster):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    sil_scores = list()
    for i in range(min_cluster, max_cluster):
        ac = AgglomerativeClustering(n_clusters=i)
        pred = ac.fit_predict(points)
        sil_score = silhouette_score(points, pred)
        sil_scores.append(sil_score)

    
    n_cluster = np.argmax(sil_scores) + min_cluster
    ac = KMeans(n_clusters=n_cluster, n_jobs=-1)
    clusters = ac.fit_predict(points)
    
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [92]:
import numpy as np
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage
from sklearn.base import BaseEstimator
from sklearn.base import ClusterMixin
from sklearn.utils import check_array
from sklearn.neighbors import DistanceMetric

def hierarchical_clustering(
        X,
        max_dist=50,
        method='average',
        criterion='inconsistent',
):
    if len(X) < 1:
        return np.array([])
    elif len(X) == 1:
        return np.array([0])
    
    link = linkage(
            X,
            method=method,
        )
    
    labels = fcluster(
        link,
        t=max_dist,
        criterion=criterion,
    )
    return labels




In [83]:
from numpy import sin,cos,arctan2,sqrt,pi # import from numpy
# earth's mean radius = 6,371km
EARTHRADIUS = R

def getDistanceByHaversine(loc1, loc2):
    '''Haversine formula - give coordinates as a 2D numpy array of
    (lat_denter link description hereecimal,lon_decimal) pairs'''
    #      
    # "unpack" our numpy array, this extracts column wise arrays
    lat1 = loc1[0]
    lon1 = loc1[1]
    lat2 = loc2[0]
    lon2 = loc2[1]
    #
    # convert to radians ##### Completely identical
    lon1 = lon1 * pi / 180.0
    lon2 = lon2 * pi / 180.0
    lat1 = lat1 * pi / 180.0
    lat2 = lat2 * pi / 180.0
    #
    # haversine formula #### Same, but atan2 named arctan2 in numpy
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (sin(dlat/2))**2 + cos(lat1) * cos(lat2) * (sin(dlon/2.0))**2
    c = 2.0 * arctan2(sqrt(a), sqrt(1.0-a))
    km = EARTHRADIUS * c
    return km

In [93]:
def cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.vstack([start_points, end_points])
    
    from scipy.spatial.distance import pdist
    
    dist = pdist(points, lambda u, v: getDistanceByHaversine(u,v))
    
    print(dist)

    return hierarchical_clustering(dist, max_dist=1)
    
    
    
clusters = cluster_into_spots(df)

[ 1315.3022936     59.49908441    68.55183998 ...,  1142.95697109
    76.83470415  1216.38570993]


In [94]:
max(clusters)

849

In [165]:
from sklearn.preprocessing import StandardScaler
def meanshift_cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))
    points = StandardScaler().fit_transform(points)

    ac = MeanShift(n_jobs=-1, cluster_all=False)
    print('start clustering')
    clusters = ac.fit_predict(points)
    print('done clustering')
    
    start_clusters = list()
    end_clusters = list()
    
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])

        

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [16]:
n_cluster = np.argmax(sil_scores) +100
ac = AgglomerativeClustering(n_clusters=n_cluster)
pred = ac.fit_predict(points)
points_deg = np.rad2deg(points)
visualize_cluster(points_deg, pred)

NameError: name 'sil_scores' is not defined

In [17]:
Counter(pred)

NameError: name 'pred' is not defined

In [161]:
def visualize_cluster(df):
    colors = [hexc for hexc in cnames.values()]
    map_lat, map_lon = df.iloc[0].start_lat, df.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in df.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        start_c = row['start_cluster']
        marker = folium.CircleMarker(tup, color=colors[start_c], fill_color=colors[start_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)
        
        tup = (row['end_lat'], row['end_lon'])
        end_c = row['end_cluster']
        marker = folium.CircleMarker(tup, color=colors[end_c], fill_color=colors[end_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)

    return map_osm

In [143]:
def cluster_into_spots(df, init_eps=20, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.radians(np.vstack([start_points, end_points]))
    
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points)*R

    clusters = dbscan(dist, metric='precomputed', min_samples=2,eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)
    
    print(Counter(clusters))
    
    for _ in range(levels):
        init_eps = init_eps*0.3
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold*length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs])*R
                inner_clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    
                    clusters[idx] = "{}_{}".format(clusters[idx], inner_clusters[i])
    
    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])
    
    print(len(dict(Counter(clusters)).keys()))
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [166]:
df = cluster_into_spots(df)#, init_eps=50, levels=0, threshold=0.5)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

Counter({0: 721, 1: 329, -1: 313, 4: 43, 29: 27, 48: 26, 96: 24, 170: 24, 154: 23, 124: 20, 36: 11, 145: 10, 9: 8, 80: 8, 159: 8, 2: 7, 34: 7, 67: 7, 98: 6, 121: 6, 126: 6, 229: 6, 14: 5, 19: 5, 41: 5, 181: 5, 195: 5, 220: 5, 11: 4, 23: 4, 37: 4, 38: 4, 40: 4, 51: 4, 64: 4, 75: 4, 92: 4, 138: 4, 164: 4, 174: 4, 177: 4, 202: 4, 212: 4, 39: 3, 42: 3, 43: 3, 50: 3, 68: 3, 74: 3, 83: 3, 90: 3, 103: 3, 104: 3, 111: 3, 119: 3, 122: 3, 125: 3, 130: 3, 146: 3, 155: 3, 158: 3, 182: 3, 191: 3, 205: 3, 207: 3, 208: 3, 218: 3, 223: 3, 232: 3, 263: 3, 264: 3, 3: 2, 5: 2, 6: 2, 7: 2, 8: 2, 10: 2, 12: 2, 13: 2, 15: 2, 16: 2, 17: 2, 18: 2, 20: 2, 21: 2, 22: 2, 24: 2, 25: 2, 26: 2, 27: 2, 28: 2, 30: 2, 31: 2, 32: 2, 33: 2, 35: 2, 44: 2, 45: 2, 46: 2, 47: 2, 49: 2, 52: 2, 53: 2, 54: 2, 55: 2, 56: 2, 57: 2, 58: 2, 59: 2, 60: 2, 61: 2, 62: 2, 63: 2, 65: 2, 66: 2, 69: 2, 70: 2, 71: 2, 72: 2, 73: 2, 76: 2, 77: 2, 78: 2, 79: 2, 81: 2, 82: 2, 84: 2, 85: 2, 86: 2, 87: 2, 88: 2, 89: 2, 91: 2, 93: 2, 94: 2, 95: 

In [None]:
df = agglomerative_cluster_into_spots(df,100,120)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

In [158]:
df = kmeans_cluster_into_spots(df,50,120)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

892 1105


In [167]:
df = meanshift_cluster_into_spots(df)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

start clustering


Process ForkPoolWorker-168:
Process ForkPoolWorker-166:
Process ForkPoolWorker-165:
Process ForkPoolWorker-164:
Process ForkPoolWorker-162:
Process ForkPoolWorker-159:
Traceback (most recent call last):
Process ForkPoolWorker-158:
Process ForkPoolWorker-160:
Process ForkPoolWorker-157:
Process ForkPoolWorker-161:
Process ForkPoolWorker-167:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-163:
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/mt/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/christian/

KeyboardInterrupt: 

In [162]:
visualize_cluster(df)

In [106]:
df

Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date,start_cluster,end_cluster
2008-10-24 02:58:52,39.999973,116.327148,2008-10-24 02:58:52,40.010681,116.321487,2008-10-24 03:08:48,0,1
2008-10-24 10:54:54,40.010948,116.321434,2008-10-24 10:54:54,40.000034,116.327583,2008-10-24 11:13:39,1,0
2008-10-24 18:27:39,39.999493,116.326843,2008-10-24 18:27:39,39.993168,116.327019,2008-10-24 18:31:04,0,6
2008-10-24 20:13:38,39.999363,116.327042,2008-10-24 20:13:38,39.999546,116.327362,2008-10-24 21:09:19,0,0
2008-10-25 00:58:59,39.999756,116.326965,2008-10-25 00:58:59,40.006382,116.321548,2008-10-25 01:05:14,0,-1
2008-10-25 13:57:55,40.006138,116.320770,2008-10-25 13:57:55,39.999893,116.327354,2008-10-25 19:38:21,24,0
2008-10-26 03:24:32,39.999149,116.327965,2008-10-26 03:24:32,40.011120,116.322037,2008-10-26 03:34:07,-1,1
2008-10-26 15:48:37,40.010170,116.322128,2008-10-26 15:48:37,39.991417,116.321014,2008-10-26 16:20:33,1,-1
2008-10-26 17:04:36,39.998749,116.326988,2008-10-26 17:04:36,39.968327,116.415443,2008-10-26 22:36:27,0,-1
2008-10-26 22:36:27,39.968327,116.415443,2008-10-26 22:36:27,40.000736,116.327263,2008-10-26 23:24:02,-1,0


In [None]:
def visualize_rows(rows):
    map_lat, map_lon = rows.iloc[0].start_lat, rows.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in rows.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='green'), popup='{} {}'.format(i, row['start_cluster']))
        map_osm.add_children(marker)
        tup = (row['end_lat'], row['end_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='red'), popup='{} {}'.format(i, row['end_cluster']))
        map_osm.add_children(marker)

    return map_osm

In [None]:
visualize_rows(df.iloc[:-1])

In [22]:
ts = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
ts.days/7

39.714285714285715

In [163]:
duration = timedelta(days=7*33)
train_start = df['start_date'].iloc[0]
train_end = train_start + duration
train = df[train_start:train_end]
test = df[train_end:]

In [24]:
dates = list()
for i in test.index:
    if len(dates) > 1:
        if dates[-1] == (i.month, i.day):
            continue
    dates.append((i.month, i.day))

In [25]:
from masterthesis.models import BayesWeekdayEstimator
from masterthesis.models import FrequentistEstimator
import operator

In [164]:
bwe = FrequentistEstimator()

bwe = bwe.fit(train)

counter_no_prob = 0

for date in dates:
    month, day = date
    x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-{}-{} 19:45:21".format(month,day))]), columns=['lat', 'lon'])
    print(x)
    pred = bwe.predict_proba(x)
    
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    
    for i, row in df.loc[(df.index.month==month) & (df.index.day==day)].iterrows():
        print((row['start_cluster'], row['end_cluster']))
        if (row['start_cluster'], row['end_cluster']) in pred.keys():
            print(pred[(row['start_cluster'], row['end_cluster'])])
            for i, s_pred in enumerate(sorted_pred):
                if s_pred[0] == (row['start_cluster'], row['end_cluster']):
                    print('Ranked:', i+1, 'of total', len(pred), 'predictions')
        else:
            print("no prob")
            counter_no_prob += 1
            
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))

                           lat       lon
2008-06-12 19:45:21  49.475752  8.482531
(13, 22)
no prob
(22, 0)
0.13885778275475924
Ranked: 2 of total 227 predictions
(10, 0)
0.0011198208286674132
Ranked: 178 of total 227 predictions
                           lat       lon
2008-06-12 19:45:21  49.475752  8.482531
(13, 22)
no prob
(22, 0)
0.13885778275475924
Ranked: 2 of total 227 predictions
(10, 0)
0.0011198208286674132
Ranked: 178 of total 227 predictions
                           lat       lon
2008-06-13 19:45:21  49.475752  8.482531
(0, 17)
0.022396416573348264
Ranked: 7 of total 227 predictions
(17, 33)
no prob
(26, 26)
no prob
(26, 7)
no prob
(7, 33)
no prob
                           lat       lon
2008-06-14 19:45:21  49.475752  8.482531
(33, 17)
no prob
                           lat       lon
2008-06-15 19:45:21  49.475752  8.482531
(0, 0)
0.0593505039193729
Ranked: 3 of total 227 predictions
(22, 0)
0.13885778275475924
Ranked: 2 of total 227 predictions
                         

In [28]:
from masterthesis.preprocessing import DenseDepartureTimes
from masterthesis.models import BayesDepartureTimeEstimator
from sklearn.base import BaseEstimator
from sklearn.cluster import dbscan

In [29]:
class BayesDepartureTimeEstimator(BaseEstimator):
    def fit(self, X, y=None):
        self.data_ = X
        return self

    def partial_fit(self, X):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)
        start_end = [(start, end) for start, end in zip(self.data_['start_cluster'], self.data_['end_cluster'])]
        priors = {str(k): v / length for k, v in Counter(start_end).items()}

        dayofweek = x.index.dayofweek
        
        p_ba = [row['start_time_cluster'] for index, row in self.data_.iterrows() if
                index.dayofweek == dayofweek]
        p_ba = {k: v / len(p_ba) for k, v in Counter(p_ba).items()}
        
        res = {key: priors[key.split('_')[0]] * p_ba[key] / (1 / 7) for key in p_ba}

        return res
    
        
    def resolve_start_time_cluster(self, stc):
        stc_df = self.data_[self.data_['start_time_cluster'] == stc]
        
        return min(stc_df.index.time), max(stc_df.index.time)

In [30]:
from datetime import datetime
from datetime import date
from datetime import time

In [31]:
def _time_to_degree(time):
    return ((time.hour + (time.minute + (time.second / 60)) / 60) / 24) * 360

def _time_distance(t1, t2):
    circumference = 2 * np.pi
    return (np.abs(_time_to_degree(t1) - _time_to_degree(t2))) * (circumference / 360)

_time_distance(time(6,44,22), time(7))

0.068213284932111679

In [32]:
ddf = DenseDepartureTimes(0.03)
train_ddt = ddf.fit_transform(train.copy())
bdte = BayesDepartureTimeEstimator()
bdte = bdte.fit(train_ddt)

counter_no_prob = 0

for i, row in test.iterrows():
    x = pd.DataFrame(data=[[row['start_lat'], row['start_lon']]], index=[i], columns=['lat', 'lon'])
    pred = bdte.predict_proba(x)
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    for key in pred.keys():
        time_wa, time_wb = bdte.resolve_start_time_cluster(key)
        dummydate = date(1970, 1, 1)
        #time_wa = (datetime.combine(dummydate,time_wa)-timedelta(hours=1)).time()
        #time_wb = (datetime.combine(dummydate,time_wb)+timedelta(hours=1)).time()
        if len(x.between_time(time_wa, time_wb)) == 1:
            
            
            if str((row['start_cluster'], row['end_cluster'])) == key.split('_')[0]:
                print(x)
                print(key)
                print(time_wa, time_wb)
                print(row['start_cluster'], row['end_cluster'])
                for i, s_pred in enumerate(sorted_pred):
                    if s_pred[0] == key:
                        print('Ranked:', i+1, 'of total', len(pred), 'predictions')
                counter_no_prob += 1
        
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))

                           lat         lon
2009-06-16 03:02:30  39.999699  116.327202
(69, 13)_0
02:30:53 03:34:57
69 13
Ranked: 1 of total 123 predictions
                           lat         lon
2009-06-20 02:37:37  39.998798  116.326881
(69, 13)_0
02:30:53 03:34:57
69 13
Ranked: 1 of total 112 predictions
                           lat         lon
2009-06-22 02:56:22  39.997169  116.325554
(69, 13)_0
02:30:53 03:34:57
69 13
Ranked: 1 of total 115 predictions
                           lat         lon
2009-06-23 02:31:36  40.000118  116.326012
(69, 13)_0
02:30:53 03:34:57
69 13
Ranked: 1 of total 123 predictions
                           lat         lon
2009-06-30 02:33:35  39.999992  116.327415
(69, 13)_0
02:30:53 03:34:57
69 13
Ranked: 1 of total 123 predictions
                           lat         lon
2009-07-02 02:13:05  39.999962  116.326241
(69, 13)_5
01:52:08 02:18:37
69 13
Ranked: 6 of total 127 predictions
                           lat         lon
2009-07-06 02:56:25  