In [1]:
from tpm.data_model import *
from tpm.util.io import read_geolife
from tpm.util.dist import haversine_distance
from tpm.preprocessing import time_duplication_filter
from tpm.preprocessing import speed_filter_abs
import numpy as np
import pandas as pd
import folium
from datetime import timedelta

In [2]:
trajs = read_geolife('/mnt/hdd1/christian/data/geotracking/Geolife Trajectories 1.3/Data/001/Trajectory')


In [3]:
preprocessed = list()
for traj in trajs:
    traj_new = time_duplication_filter(traj)
    traj_new = speed_filter_abs(traj_new, 300, in_kmh=True)
    preprocessed.append(traj_new)


In [4]:
def staypoints_geolife(traj):
    time_thresh = 30*60
    dist_thresh = 250

    staypoints = list()
    i, i_max = 0, len(traj)
    while i < i_max:
        j = i+1
        token = 0
        while j < i_max:
            dist = haversine_distance(traj[i], traj[j])
            if dist > dist_thresh:
                delta_time = traj[j].datetime - traj[i].datetime
                if delta_time.total_seconds() > time_thresh:
                    mean_point = np.mean([[p.lat, p.lon] for p in traj[i:j+1]], axis=0)
                    arrival_time = traj[i].datetime
                    leave_time = traj[j].datetime
                    staypoints.append([mean_point, arrival_time, leave_time, i, j])
                    i = j
                    token = 1
                break
            j = j+1
        if not token == 1:
            i = i+1

    
    return staypoints

In [5]:
len(trajs)

71

In [5]:
def make_df(trajs):
    data = list()
    for traj in trajs:
        fp = traj[0]
        sps = staypoints_geolife(traj)
        lp = traj[-1]               
        
        if len(sps) > 1:
            data.append([fp.lat, fp.lon, fp.datetime, sps[0][0][0], sps[0][0][1], sps[0][1]])
            for i in range(1, len(sps)-1):
                data.append([sps[i][0][0], sps[i][0][1], sps[i][1], sps[i+1][0][0], sps[i+1][0][1], sps[i+1][2]])
            data.append([sps[-1][0][0], sps[-1][0][1], sps[-1][2], lp.lat, lp.lon, lp.datetime])
        else:
            data.append([fp.lat, fp.lon, fp.datetime, lp.lat, lp.lon, lp.datetime])
        
        
    df = pd.DataFrame(data, columns=['start_lat','start_lon','start_date','end_lat','end_lon','end_date'])
    df = df.set_index(pd.DatetimeIndex(df['start_date'])).sort_index()
    return df

In [6]:
df = make_df(preprocessed)
df


Unnamed: 0,start_lat,start_lon,start_date,end_lat,end_lon,end_date
2008-10-23 14:53:05,39.984093,116.319237,2008-10-23 14:53:05,39.978695,116.327423,2008-10-23 15:00:15
2008-10-23 20:49:08,40.015598,116.306198,2008-10-23 20:49:08,40.013802,116.306534,2008-10-23 21:04:28
2008-10-24 08:41:04,40.013866,116.306473,2008-10-24 08:41:04,39.978962,116.326286,2008-10-24 09:11:00
2008-10-24 10:54:12,39.980999,116.310104,2008-10-24 10:54:12,39.981224,116.309708,2008-10-24 12:21:33
2008-10-24 11:28:19,39.981224,116.309708,2008-10-24 11:28:19,39.978741,116.325806,2008-10-24 14:31:38
2008-10-24 12:54:19,39.978741,116.325806,2008-10-24 12:54:19,39.981331,116.310745,2008-10-24 15:26:28
2008-10-24 15:26:28,39.981331,116.310745,2008-10-24 15:26:28,39.977898,116.327065,2008-10-24 15:35:50
2008-10-25 08:44:05,40.013813,116.306480,2008-10-25 08:44:05,40.002087,116.170609,2008-10-25 12:39:53
2008-10-25 14:48:49,39.993202,116.145996,2008-10-25 14:48:49,39.989513,116.186981,2008-10-25 18:07:48
2008-10-25 17:23:24,39.989513,116.186981,2008-10-25 17:23:24,39.990730,116.193146,2008-10-25 19:07:22


In [11]:
from sklearn.cluster import dbscan
from sklearn.neighbors import DistanceMetric
from tpm.data_model import R
from collections import Counter

In [11]:
def haversine_distance(p1_lat, p1_lon, p2_lat, p2_lon):
    lat_rad1 = radians(p1_lat)
    lon_rad1 = radians(p1_lon)
    lat_rad2 = radians(p2_lat)
    lon_rad2 = radians(p2_lon)
    return 2*R * asin(sqrt(sin((lat_rad2-lat_rad1)/2)**2 + cos(lat_rad1)*cos(lat_rad2)*(sin((lon_rad2-lon_rad1)/2)**2)))

In [19]:
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)
    
    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])
    
    points = np.radians(np.vstack([start_points, end_points]))
    
    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points)*R

    clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)
    
    
    for _ in range(levels):
        init_eps = init_eps*0.6
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold*length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs])*R
                inner_clusters = dbscan(dist, metric='precomputed', min_samples=1,eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    
                    clusters[idx] = "{}_{}".format(clusters[idx], inner_clusters[i])
    
    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i%length+length])
    
    print(len(dict(Counter(clusters)).keys()))
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [7]:
import hdbscan
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [16]:
def cluster_into_spots(df):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points) * R
    
    dist = StandardScaler().fit_transform(dist)
    
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='precomputed')
    clusters = clusterer.fit_predict(dist)

    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i % length + length])

    print(Counter(start_clusters))        
    
    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df

In [20]:
df = cluster_into_spots(df, init_eps=300, levels=1, threshold=0.5)

counter = 0

for i in range(1,len(df)):
    ec = df['end_cluster'].iloc[i-1]
    sc = df['start_cluster'].iloc[i]
    if ec == sc:
        counter += 1
        
print(counter, len(df))

32
86 117


In [23]:
visualize_cluster(df)

In [22]:
from matplotlib.colors import cnames
def visualize_cluster(df):
    colors = [hexc for hexc in cnames.values()]
    map_lat, map_lon = df.iloc[0].start_lat, df.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in df.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        start_c = row['start_cluster']
        if not isinstance(start_c, int) and '_' in start_c:
            start_c = start_c.split('_')[1]
        start_c = int(start_c)
        marker = folium.CircleMarker(tup, color=colors[start_c], fill_color=colors[start_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)
        
        tup = (row['end_lat'], row['end_lon'])
        end_c = row['end_cluster']
        if not isinstance(end_c, int) and'_' in end_c:
            end_c = end_c.split('_')[1]
        end_c = int(end_c)
        marker = folium.CircleMarker(tup, color=colors[end_c], fill_color=colors[end_c], radius=50, fill_opacity=1)
        map_osm.add_children(marker)

    return map_osm

In [21]:
def visualize_rows(rows):
    map_lat, map_lon = rows.iloc[0].start_lat, rows.iloc[0].start_lon
    map_osm = folium.Map(location=[map_lat, map_lon])

    for i, row in rows.iterrows():
        tup = (row['start_lat'], row['start_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='green'), popup='{} {}'.format(i, row['start_cluster']))
        map_osm.add_children(marker)
        tup = (row['end_lat'], row['end_lon'])
        marker = folium.Marker(tup, icon=folium.Icon(color='red'), popup='{} {}'.format(i, row['end_cluster']))
        map_osm.add_children(marker)

    return map_osm

In [22]:
visualize_rows(df.iloc[:-1])

In [30]:
ts = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
ts.days/7

7.428571428571429

In [191]:
duration = timedelta(days=7*5)
train_start = df['start_date'].iloc[0]
train_end = train_start + duration
train = df[train_start:train_end]
test = df[train_end:]

In [28]:
dates = list()
for i in test.index:
    if len(dates) > 1:
        if dates[-1] == (i.month, i.day):
            continue
    dates.append((i.month, i.day))

In [67]:
from masterthesis.models import BayesWeekdayEstimator
from masterthesis.models import FrequentistEstimator
from sklearn.base import BaseEstimator, TransformerMixin
import operator

In [58]:
class FrequentistEstimator(BaseEstimator):
    def __init__(
        self,
        look_ahead = timedelta(hours=1)
    ):
        self.look_ahead = look_ahead
    
    def fit(self, X, y=None):
        self.data_ = [(date, start, end) for date, start, end in zip(X['start_date'], X['start_cluster'], X['end_cluster'])]
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], self.data_[j][2]])
                        idx_insert.append(i+1)
        
        print(len(self.data_))
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        print(len(self.data_))
        
        return self

    def partial_fit(self):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)    
        start_end = [(start, end) for _, start, end in self.data_]
        return {k: v / length for k, v in Counter(start_end).items()}

In [64]:
class BayesWeekdayEstimator(BaseEstimator):
    def __init__(
        self,
        look_ahead = timedelta(hours=1)
    ):
        self.look_ahead = look_ahead
        
    def fit(self, X, y=None):
        self.data_ = [(date, start, end) for date, start, end in zip(X['start_date'], X['start_cluster'], X['end_cluster'])]
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], self.data_[j][2]])
                        idx_insert.append(i+1)
        
        print(len(self.data_))
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        print(len(self.data_))
        
        return self
    def partial_fit(self, X):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)    
        start_end = [(start, end) for _, start, end in self.data_]
        priors = {k: v / length for k, v in Counter(start_end).items()}

        dayofweek = x.index.dayofweek

        counts_given_dayofweek = [(start, end) for date, start, end in self.data_ 
                                                    if date.weekday() == dayofweek]
        prob_given_dayofweek = {k: v / len(counts_given_dayofweek) for k, v in Counter(counts_given_dayofweek).items()}

        res = {key: priors[key] * prob_given_dayofweek[key] / (1 / 7) for key in prob_given_dayofweek}

        return res

In [71]:
class BayesDepartureTimeEstimator(BaseEstimator):    
     def __init__(
        self,
        look_ahead = timedelta(hours=1)
    ):
        self.look_ahead = look_ahead
    
    def fit(self, X, y=None):
        self.data_ = [(date, start, end, st_cluster) for date, start, end, st_cluster in zip(X['start_date'], X['start_cluster'], X['end_cluster'], X['start_time_cluster'])]
        
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], self.data_[j][2]])
                        idx_insert.append(i+1)
        
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        return self
    

    def partial_fit(self, X):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)
        start_end = [(start, end) for start, end in zip(self.data_['start_cluster'], self.data_['end_cluster'])]
        priors = {str(k): v / length for k, v in Counter(start_end).items()}

        dayofweek = x.index.dayofweek

        p_ba = [row['start_time_cluster'] for index, row in self.data_.iterrows() if
                index.dayofweek == dayofweek]
        p_ba = {k: v / len(p_ba) for k, v in Counter(p_ba).items()}

        res = {key: priors[key.split('_')[0]] * p_ba[key] / (1 / 7) for key in p_ba}

        return res

    def resolve_start_time_cluster(self, stc):
        return self.data_[self.data_['start_time_cluster'] == stc]
    

IndentationError: unindent does not match any outer indentation level (<ipython-input-71-2de14782114c>, line 8)

In [82]:
from sklearn.cluster import DBSCAN

In [169]:
class DenseDepartureTimes(BaseEstimator, TransformerMixin):
    def __init__(self, eps=0.5, look_ahead=timedelta(hours=1)):
        self.eps = eps
        self.look_ahead = look_ahead

    def fit(self, X, y=None, **fitparams):
        return self

    def transform(self, X):
        rows = list()
        
        if self.look_ahead:
            for i in range(len(X)):
                for j in range(i+1, len(X)):
                    if X['start_date'].iloc[j] - X['start_date'].iloc[i]< self.look_ahead:

                        row_cp = X.iloc[i].copy()
                        row_cp['end_lat'] = X['end_lat'].iloc[j]
                        row_cp['end_lon'] = X['end_lon'].iloc[j]
                        row_cp['end_date'] = X['end_date'].iloc[j]
                        row_cp['end_cluster'] = X['end_cluster'].iloc[j]
                        row_cp['start_date'] = X['start_date'].iloc[i] + timedelta(seconds=1)
                        rows.append(row_cp)

            X = X.append(rows, ignore_index=True)
            X = X.set_index(pd.DatetimeIndex(X['start_date'])).sort_index()
        
        
        dbscan = DBSCAN(eps=self.eps, min_samples=1, metric='precomputed')
        start_cluster_to_time = dict()
        for key, group in X.groupby(['start_cluster', 'end_cluster']):
            dist = self._calc_pdist_matrix(group)
            clusters = dbscan.fit_predict(dist)
            
            d = {timestamp: cluster for timestamp, cluster in zip(group.index, clusters)}
            start_cluster_to_time[key] = d
        
        print(start_cluster_to_time)
        
        start_time_cluster = list()
        for index, row in X.iterrows():
            cluster_pair = (row['start_cluster'], row['end_cluster'])
            print(cluster_pair)
            start_time_cluster.append("{}___{}".format(cluster_pair, start_cluster_to_time[cluster_pair][index]))

        X['start_time_cluster'] = start_time_cluster
        return X

    def _time_to_degree(self, time):
        return ((time.hour + (time.minute + (time.second / 60)) / 60) / 24) * 360

    def _time_distance(self, t1, t2):
        circumference = 2 * np.pi
        return (np.abs(self._time_to_degree(t1) - self._time_to_degree(t2))) * (circumference / 360)

    def _calc_pdist_matrix(self, group):
        matrix = list()

        for t1 in group.index.time:
            inner_dist = list()

            for t2 in group.index.time:
                inner_dist.append(self._time_distance(t1, t2))

            matrix.append(inner_dist)

        return matrix

In [173]:
train = DenseDepartureTimes().fit_transform(train)


{('3_0', 9): {Timestamp('2008-11-11 20:43:04'): 0}, ('1_0', 25): {Timestamp('2008-11-18 08:44:59'): 0}, (0, '3_0'): {Timestamp('2008-10-23 14:53:05'): 0}, ('1_0', 22): {Timestamp('2008-10-29 08:30:53'): 0}, ('3_1', '3_0'): {Timestamp('2008-11-18 09:38:44'): 0}, (5, 6): {Timestamp('2008-10-25 17:23:24'): 0}, ('1_0', 2): {Timestamp('2008-11-08 12:43:58'): 0}, (15, '1_0'): {Timestamp('2008-11-08 18:42:42'): 0}, (7, '3_0'): {Timestamp('2008-10-26 15:28:10'): 0}, (2, 10): {Timestamp('2008-10-30 14:28:16'): 0}, (17, '1_0'): {Timestamp('2008-11-17 22:13:49'): 0}, (16, 16): {Timestamp('2008-11-15 14:32:10'): 0}, ('3_0', 2): {Timestamp('2008-10-24 12:54:19'): 0}, ('3_0', 16): {Timestamp('2008-11-15 12:56:34'): 0}, (2, '3_0'): {Timestamp('2008-10-24 10:54:13'): 0, Timestamp('2008-10-24 15:26:28'): 1, Timestamp('2008-10-24 11:28:19'): 0}, (9, '3_0'): {Timestamp('2008-10-29 08:50:48'): 0}, ('1_0', '3_2'): {Timestamp('2008-10-28 08:30:29'): 0}, ('3_0', 10): {Timestamp('2008-11-04 14:48:59'): 0}, ('

In [175]:
class BayesDepartureTimeEstimator(BaseEstimator):
    def fit(self, X, y=None):
        self.data_ = X
        return self

    def partial_fit(self, X):
        # stack data to present data
        pass

    def predict_proba(self, x):
        length = len(self.data_)
        start_end = [(start, end) for start, end in zip(self.data_['start_cluster'], self.data_['end_cluster'])]
        priors = {str(k): v / length for k, v in Counter(start_end).items()}

        dayofweek = x.index.dayofweek

        p_ba = [row['start_time_cluster'] for index, row in self.data_.iterrows() if
                index.dayofweek == dayofweek]
        p_ba = {k: v / len(p_ba) for k, v in Counter(p_ba).items()}
    
        print(priors)
        res = {key: priors[key.split('___')[0]] * p_ba[key] / (1 / 7) for key in p_ba}

        return res

    def resolve_start_time_cluster(self, stc):
        return self.data_[self.data_['start_time_cluster'] == stc]

In [176]:
bwe = BayesDepartureTimeEstimator()

bwe = bwe.fit(train)
x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-06-13 19:45:21")]), columns=['lat', 'lon'])
bwe.predict_proba(x)

{"('3_0', '1_0')": 0.16091954022988506, "('3_0', 14)": 0.022988505747126436, "(6, '1_0')": 0.011494252873563218, '(12, 13)': 0.011494252873563218, "('3_0', 9)": 0.011494252873563218, "('3_0', 16)": 0.011494252873563218, "(16, '1_0')": 0.011494252873563218, "('1_0', 2)": 0.011494252873563218, "('1_0', '1_0')": 0.04597701149425287, '(2, 10)': 0.011494252873563218, '(5, 6)': 0.011494252873563218, "('1_0', 21)": 0.011494252873563218, '(2, 2)': 0.011494252873563218, "('3_0', '3_0')": 0.06896551724137931, "(8, '1_0')": 0.011494252873563218, '(10, 10)': 0.011494252873563218, "('1_0', 25)": 0.011494252873563218, "('1_0', 22)": 0.011494252873563218, "(7, '3_0')": 0.011494252873563218, "('3_0', 2)": 0.011494252873563218, "(15, '1_0')": 0.011494252873563218, "(9, '3_0')": 0.011494252873563218, "('1_0', 16)": 0.022988505747126436, "(17, '1_0')": 0.011494252873563218, "(11, '3_0')": 0.011494252873563218, "('3_1', '3_0')": 0.011494252873563218, '(4, 5)': 0.011494252873563218, "('1_0', 24)": 0.011494

{"('1_0', '3_0')___0": 0.45258620689655177,
 "('3_0', '1_0')___0": 0.07040229885057471,
 "('3_0', '3_0')___0": 0.03017241379310345,
 "('3_0', '3_0')___2": 0.03017241379310345,
 "('3_0', 14)___0": 0.020114942528735632,
 "('3_0', 2)___0": 0.005028735632183908,
 "(14, '3_0')___0": 0.005028735632183908,
 "(2, '3_0')___0": 0.03017241379310345,
 "(2, '3_0')___1": 0.015086206896551725,
 '(2, 2)___0': 0.005028735632183908}

In [31]:
bwe = FrequentistEstimator()

bwe = bwe.fit(train)

counter_no_prob = 0

ranks = list()

for date in dates:
    month, day = date
    x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-{}-{} 19:45:21".format(month,day))]), columns=['lat', 'lon'])
    print(x)
    pred = bwe.predict_proba(x)
    
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    
    for i, row in df.loc[(df.index.month==month) & (df.index.day==day)].iterrows():
        print((row['start_cluster'], row['end_cluster']))
        if (row['start_cluster'], row['end_cluster']) in pred.keys():
            print(pred[(row['start_cluster'], row['end_cluster'])])
            for i, s_pred in enumerate(sorted_pred):
                if s_pred[0] == (row['start_cluster'], row['end_cluster']):
                    print('Ranked:', i+1, 'of total', len(pred), 'predictions')
                    ranks.append(i+1)
        else:
            print("no prob")
            counter_no_prob += 1
            
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))
print(np.mean(ranks))

                           lat       lon
2008-12-01 19:45:21  49.475752  8.482531
('3_0', '1_0')
0.1728395061728395
Ranked: 2 of total 42 predictions
                           lat       lon
2008-12-02 19:45:21  49.475752  8.482531
('1_0', '3_0')
0.18518518518518517
Ranked: 1 of total 42 predictions
('3_0', '1_0')
0.1728395061728395
Ranked: 2 of total 42 predictions
                           lat       lon
2008-12-03 19:45:21  49.475752  8.482531
('1_0', '3_0')
0.18518518518518517
Ranked: 1 of total 42 predictions
('3_0', '1_0')
0.1728395061728395
Ranked: 2 of total 42 predictions
                           lat       lon
2008-12-04 19:45:21  49.475752  8.482531
('1_0', '3_0')
0.18518518518518517
Ranked: 1 of total 42 predictions
(10, '3_0')
0.037037037037037035
Ranked: 5 of total 42 predictions
('3_0', '1_0')
0.1728395061728395
Ranked: 2 of total 42 predictions
                           lat       lon
2008-12-05 19:45:21  49.475752  8.482531
('1_0', '3_0')
0.18518518518518517
Ranked: 1

In [65]:
bwe = BayesWeekdayEstimator()

bwe = bwe.fit(train)

counter_no_prob = 0

ranks = list()
for date in dates:
    month, day = date
    x = pd.DataFrame(data=[[49.475752, 8.482531]],index=pd.DatetimeIndex([pd.Timestamp("2008-{}-{} 19:45:21".format(month,day))]), columns=['lat', 'lon'])
    print(x)
    pred = bwe.predict_proba(x)
    
    sorted_pred = sorted(pred.items(), key=operator.itemgetter(1), reverse=True)
    
    for i, row in df.loc[(df.index.month==month) & (df.index.day==day)].iterrows():
        print((row['start_cluster'], row['end_cluster']))
        if (row['start_cluster'], row['end_cluster']) in pred.keys():
            print(pred[(row['start_cluster'], row['end_cluster'])])
            for i, s_pred in enumerate(sorted_pred):
                if s_pred[0] == (row['start_cluster'], row['end_cluster']):
                    print('Ranked:', i+1, 'of total', len(pred), 'predictions')
                    ranks.append(i+1)
                    
        else:
            print("no prob")
            counter_no_prob += 1
            
    
print('Number of no prob:', counter_no_prob)
print('Total predictions:', len(test))
print(np.mean(ranks))

                           lat       lon
2008-12-01 19:45:21  49.475752  8.482531
(3, 1)
0.26886145404663925
Ranked: 2 of total 6 predictions
                           lat       lon
2008-12-02 19:45:21  49.475752  8.482531
(1, 3)
0.36728395061728397
Ranked: 1 of total 6 predictions
(3, 1)
0.30246913580246915
Ranked: 2 of total 6 predictions
                           lat       lon
2008-12-03 19:45:21  49.475752  8.482531
(1, 3)
0.4006734006734006
Ranked: 1 of total 7 predictions
(3, 1)
0.32996632996632996
Ranked: 2 of total 7 predictions
                           lat       lon
2008-12-04 19:45:21  49.475752  8.482531
(1, 3)
0.36728395061728397
Ranked: 2 of total 6 predictions
(10, 3)
0.043209876543209874
Ranked: 3 of total 6 predictions
(3, 1)
0.40329218106995884
Ranked: 1 of total 6 predictions
                           lat       lon
2008-12-05 19:45:21  49.475752  8.482531
(1, 3)
0.5246913580246914
Ranked: 1 of total 8 predictions
(3, 1)
0.08641975308641975
Ranked: 3 of total 8 pr

In [183]:
from sklearn.base import ClassifierMixin
import datetime

In [285]:
class TimeSensitiveMostFrequentRoute(BaseEstimator, ClassifierMixin):
    def __init__(
            self,
            time_window = timedelta(hours=1),
            look_ahead = timedelta(hours=1)
    ):
        self.time_window = time_window
        self.dummydate = datetime.date(1970, 1, 1)
        self.look_ahead = look_ahead

    def fit(self, X, y=None):
        self.data_ = [(date, start, end) for date, start, end in zip(X['start_date'], X['start_cluster'], X['end_cluster'])]
        
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], self.data_[j][2]])
                        idx_insert.append(i+1)
        
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        self.data_ = pd.DataFrame(data=self.data_, columns=['start_date','start_cluster','end_cluster'])
        self.data_ = self.data_.set_index(pd.DatetimeIndex(self.data_['start_date'])).sort_index()
        
        return self

    def predict(self, X):
        
        res_list = list()
        for index, row in X.iterrows():
            res = self.data_[self.data_['start_cluster'] == row['start_cluster']]
            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)
            try:
                res = res['end_cluster'].value_counts().index[0]
            except:
                res = None
            res_list.append(res)

        return res_list
        
        
    def predict_proba(self, X):
        length = len(self.data_)
        res_list = list()
        for index, row in X.iterrows():
            res = self.data_[self.data_['start_cluster'] == row['start_cluster']]
            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)
            try:
                res = res['end_cluster'].value_counts() / length
            except:
                res = None
            res_list.append(res)

        return res_list
        
        

In [301]:
ts = TimeSensitiveMostFrequentRoute()
ts.fit(train)
preds = ts.predict_proba(test)
ranks = list()
for i in range(len(preds)):
    correct = False
    
    for j, pred in enumerate(preds[i].index):
        if pred == test['end_cluster'].iloc[i]:
            ranks.append((j + 1, len(preds[i])))
            correct = True
            break
    
    if not correct:
        ranks.append((None, len(preds[i])))
        
                    
ranks

[(1, 2),
 (1, 6),
 (None, 0),
 (1, 6),
 (1, 3),
 (1, 6),
 (1, 2),
 (1, 1),
 (1, 6),
 (1, 1),
 (None, 6),
 (None, 0),
 (None, 0),
 (1, 4),
 (None, 0),
 (None, 0),
 (None, 4),
 (1, 1),
 (1, 6),
 (1, 1),
 (1, 6),
 (None, 4),
 (None, 0),
 (None, 0),
 (1, 1),
 (1, 6),
 (None, 1),
 (1, 1),
 (1, 6),
 (1, 3),
 (1, 6),
 (1, 1),
 (None, 1),
 (None, 0),
 (None, 0),
 (1, 6)]

In [235]:
from math import *

In [277]:
class FuzzyLocationMixin(object):
    
    def _get_nearest_cluster(self, lat, lon):    
        min_dist = self.max_dist_nearest_cluster
        cluster = None
        for index, row in self.data_.iterrows():
            
            dist = haversine_distance(lat, lon, row['start_lat'], row['start_lon'])
            if dist < min_dist:
                cluster = row['start_cluster']
                min_dist = dist

        return cluster

In [242]:
def haversine_distance(p1_lat, p1_lon, p2_lat, p2_lon):
            lat_rad1 = radians(p1_lat)
            lon_rad1 = radians(p1_lon)
            lat_rad2 = radians(p2_lat)
            lon_rad2 = radians(p2_lon)
            return 2*R * asin(sqrt(sin((lat_rad2-lat_rad1)/2)**2 + cos(lat_rad1)*cos(lat_rad2)*(sin((lon_rad2-lon_rad1)/2)**2)))
    

In [303]:
class TimeAndLocSensitiveMostFrequentRoute(BaseEstimator, ClassifierMixin, FuzzyLocationMixin):
    def __init__(
            self,
            time_window = timedelta(hours=1),
            look_ahead = timedelta(hours=1),
            max_dist_nearest_cluster = 100,
            
    ):
        self.time_window = time_window
        self.dummydate = datetime.date(1970, 1, 1)
        self.look_ahead = look_ahead
        self.max_dist_nearest_cluster = max_dist_nearest_cluster
        

    def fit(self, X, y=None):
        self.data_ = [(date, start, lat, lon, end) for date, start, lat, lon, end in 
                      zip(X['start_date'], X['start_cluster'], X['start_lat'], X['start_lon'], X['end_cluster'])]
        
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], tup[2], tup[3], self.data_[j][4]])
                        idx_insert.append(i+1)
        
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        self.data_ = pd.DataFrame(data=self.data_, columns=['start_date','start_cluster','start_lat','start_lon','end_cluster'])
        self.data_ = self.data_.set_index(pd.DatetimeIndex(self.data_['start_date'])).sort_index()
        
        return self

    def predict(self, X):
        res_list = []

        for index, x in X.iterrows():
            lat = x['start_lat']
            lon = x['start_lon']
            nearest_cluster = self._get_nearest_cluster(lat, lon)
            if nearest_cluster == None:
                res_list.append(None)
                continue

            res = self.data_[self.data_['start_cluster'] == nearest_cluster]
            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)

            try:
                res = res['end_cluster'].value_counts().index[0]
            except:
                res = None
            
            res_list.append(res)

        return res_list
    
    def predict_proba(self, X):
        res_list = []
        length = len(self.data_)

        for index, x in X.iterrows():
            lat = x['start_lat']
            lon = x['start_lon']
            nearest_cluster = self._get_nearest_cluster(lat, lon)
            if nearest_cluster == None:
                res_list.append(None)
                continue

            res = self.data_[self.data_['start_cluster'] == nearest_cluster]
            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)

            try:
                res = res['end_cluster'].value_counts() / length
            except:
                res = []
            
            res_list.append(res)

        return res_list


In [306]:
ts = TimeAndLocSensitiveMostFrequentRoute()
ts.fit(train)
preds = ts.predict_proba(test)
ranks = list()
for i in range(len(preds)):
    correct = False
    if preds[i] is not None:
        for j, pred in enumerate(preds[i].index):
            if pred == test['end_cluster'].iloc[i]:
                ranks.append((j + 1, len(preds[i])))
                correct = True
                break
    
        if not correct:
            ranks.append((None, len(preds[i])))
    
    else:
        ranks.append((None, None))
        
                    
ranks

[(1, 2),
 (1, 6),
 (None, 0),
 (1, 6),
 (1, 3),
 (1, 6),
 (1, 2),
 (1, 1),
 (1, 6),
 (1, 1),
 (None, 6),
 (None, None),
 (None, None),
 (1, 4),
 (None, None),
 (None, None),
 (None, 4),
 (1, 1),
 (1, 6),
 (1, 1),
 (1, 6),
 (None, 4),
 (None, None),
 (None, None),
 (1, 1),
 (1, 6),
 (None, 1),
 (1, 1),
 (1, 6),
 (1, 3),
 (1, 6),
 (1, 1),
 (None, 1),
 (None, None),
 (None, 0),
 (1, 6)]

In [278]:
class TLWSensitiveMostFrequentRoute(BaseEstimator, ClassifierMixin, FuzzyLocationMixin):
    def __init__(
            self,
            time_window = timedelta(hours=1),
            look_ahead = timedelta(hours=1),
            max_dist_nearest_cluster = 100,
            
    ):
        self.time_window = time_window
        self.dummydate = datetime.date(1970, 1, 1)
        self.look_ahead = look_ahead
        self.max_dist_nearest_cluster = max_dist_nearest_cluster
        

    def fit(self, X, y=None):
        self.data_ = [(date, start, lat, lon, end) for date, start, lat, lon, end in 
                      zip(X['start_date'], X['start_cluster'], X['start_lat'], X['start_lon'], X['end_cluster'])]
        
        look_ahead_data = list()
        idx_insert = list()
        if self.look_ahead:
            for i, tup in enumerate(self.data_):
                for j in range(i+1, len(self.data_)):
                    if self.data_[j][0] - tup[0] < self.look_ahead:
                        look_ahead_data.append([tup[0], tup[1], tup[2], tup[3], self.data_[j][4]])
                        idx_insert.append(i+1)
        
        idx_insert = sorted(idx_insert, reverse=True)
        for i, idx in enumerate(idx_insert):
            self.data_.insert(idx, look_ahead_data[i])
        
        self.data_ = pd.DataFrame(data=self.data_, columns=['start_date','start_cluster','start_lat','start_lon','end_cluster'])
        self.data_ = self.data_.set_index(pd.DatetimeIndex(self.data_['start_date'])).sort_index()
        
        return self

    def predict(self, X):
        res_list = []

        for index, x in X.iterrows():
            lat = x['start_lat']
            lon = x['start_lon']
            nearest_cluster = self._get_nearest_cluster(lat, lon)
            if nearest_cluster == None:
                res_list.append(None)
                continue

            res = self.data_[self.data_['start_cluster'] == nearest_cluster]
            res = res[res.index.dayofweek == index.dayofweek]

            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)

            try:
                res = res['end_cluster'].value_counts().index[0]
            except:
                res = None
            res_list.append(res)

        return res_list
    
    def predict_proba(self, X):
        res_list = []
        length = len(self.data_)

        for index, x in X.iterrows():
            lat = x['start_lat']
            lon = x['start_lon']
            nearest_cluster = self._get_nearest_cluster(lat, lon)
            if nearest_cluster == None:
                res_list.append(None)
                continue

            res = self.data_[self.data_['start_cluster'] == nearest_cluster]
            res = res[res.index.dayofweek == index.dayofweek]

            dummydatetime = datetime.datetime.combine(self.dummydate, datetime.time(index.hour, index.minute))
            lowerbound = dummydatetime - self.time_window
            upperbound = dummydatetime + self.time_window
            lowerbound = '{}:{}'.format(lowerbound.hour, lowerbound.minute)
            upperbound = '{}:{}'.format(upperbound.hour, upperbound.minute)
            res = res.between_time(lowerbound, upperbound)

            try:
                res = res['end_cluster'].value_counts() / length
            except:
                res = None
            res_list.append(res)

        return res_list

        return res


In [307]:
ts = TLWSensitiveMostFrequentRoute()
ts.fit(train)
preds = ts.predict_proba(test)
ranks = list()
for i in range(len(preds)):
    correct = False
    if preds[i] is not None:
        for j, pred in enumerate(preds[i].index):
            if pred == test['end_cluster'].iloc[i]:
                ranks.append((j + 1, len(preds[i])))
                correct = True
                break
    
        if not correct:
            ranks.append((None, len(preds[i])))
    
    else:
        ranks.append((None, None))
        
                    
ranks

[(1, 1),
 (1, 3),
 (None, 0),
 (1, 2),
 (1, 2),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (None, 0),
 (None, 3),
 (None, None),
 (None, None),
 (None, 1),
 (None, None),
 (None, None),
 (None, 0),
 (None, 0),
 (1, 1),
 (1, 1),
 (1, 3),
 (None, 0),
 (None, None),
 (None, None),
 (1, 1),
 (1, 2),
 (None, 0),
 (None, 0),
 (1, 1),
 (None, 0),
 (1, 1),
 (None, 0),
 (None, 0),
 (None, None),
 (None, 0),
 (1, 1)]