# Comparison GeoLife vs GeoLife+

We analyze our augmented dataset (GeoLife+) and compare it to the original GeoLife slice it was built from.

In [None]:
import yaml
import pandas as pd
import skmob
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import seaborn as sns
from skmob.measures.individual import radius_of_gyration
from mpl_toolkits.axes_grid.inset_locator import inset_axes
from pylab import *
from skmob.measures.individual import number_of_visits

# Read the notebook configuration (paths, bounding box, time window) from conf.yaml.
with open("conf.yaml") as f:
    conf = yaml.load(f, Loader = yaml.FullLoader)

# Directory used both for saved figures and for reading the augmented CSV.
out_path = conf["out_path"]
# NOTE(review): `data_path` is not assigned here, yet later cells build file paths
# from it (data_path + "complete_with_tids.csv", data_path + "augmented_dataset.csv").
# Unless it is defined elsewhere before those cells run, they raise NameError.
#data_path = conf["data_path"]
# Directory containing the original GeoLife CSV.
geolife_data_path = conf["geolife_data_path"]

# Bounding box (latitude/longitude limits) used to restrict the points to Beijing.
beijing_lat_min = conf["beijing_lat_min"]
beijing_lat_max = conf["beijing_lat_max"]
beijing_lon_min = conf["beijing_lon_min"]
beijing_lon_max = conf["beijing_lon_max"]
# Directory where the GeoLife vs GeoLife+ metrics figure is written.
geo_life_analysis_path = conf["geo_life_analysis_path"]

# Period of interest (exclusive bounds in the filtering cell).
# NOTE: a later cell reassigns start_time/end_time to hard-coded values.
start_time = conf["start_time"]
end_time = conf["end_time"]

In [None]:
# Load the original GeoLife slice and its augmented counterpart (GeoLife+).
geo_life = pd.read_csv(geolife_data_path+"geo_life_full.csv")
geo_life_plus = pd.read_csv(out_path+"geolife_full_augmented.csv")


def _restrict_to_study_window(points):
    """Return the rows of `points` inside the Beijing box and the period of interest.

    Parameters
    ----------
    points : pandas.DataFrame
        Frame with at least the columns ``lat``, ``lon`` and ``date_time``.

    Returns
    -------
    pandas.DataFrame
        Rows whose coordinates fall within the configured bounding box
        (inclusive, via ``Series.between``) and whose ``date_time`` is strictly
        between ``start_time`` and ``end_time``.
    """
    in_beijing = (
        points['lat'].between(beijing_lat_min, beijing_lat_max)
        & points['lon'].between(beijing_lon_min, beijing_lon_max)
    )
    # NOTE(review): `date_time` is compared as loaded from the CSV (no explicit
    # parsing), so this presumably relies on ISO-formatted strings — confirm.
    in_period = (points.date_time > start_time) & (points.date_time < end_time)
    return points[in_beijing & in_period]


# Same restriction for both datasets so that they remain comparable.
geo_life = _restrict_to_study_window(geo_life)
geo_life_plus = _restrict_to_study_window(geo_life_plus)

# Trajectories.
# The timestamp column is called `date_time`; the previous `datetime='data_time'`
# was a typo, so skmob never mapped the column and a manual rename was needed.
t_geoLife = skmob.TrajDataFrame(
    geo_life, latitude='lat', longitude='lon', datetime='date_time', user_id='uid'
)
t_geoLife_plus = skmob.TrajDataFrame(
    geo_life_plus, latitude='lat', longitude='lon', datetime='date_time', user_id='uid'
)


In [None]:
# Number of rows left in the augmented dataset after the filtering above.
len(geo_life_plus)

# Radius of gyration and number of visits

In [None]:
# Individual mobility metrics computed with scikit-mobility for the original
# dataset; each result is later read through a column named after the function
# (`radius_of_gyration`, `number_of_visits`).
rg_geolife = radius_of_gyration(t_geoLife)
nvisits_geolife = number_of_visits(t_geoLife)

# Same metrics for the augmented dataset.
rg_geolife_plus = radius_of_gyration(t_geoLife_plus)
nvisits_geolife_plus = number_of_visits(t_geoLife_plus)



In [None]:
# Display the radius-of-gyration result for the augmented dataset.
rg_geolife_plus

In [None]:
# Summary statistics of radius of gyration and number of visits, for the paper
print("radius GeoLife",rg_geolife["radius_of_gyration"].describe())
print("radius GeoLife+",rg_geolife_plus["radius_of_gyration"].describe())
print("number visits GeoLife",nvisits_geolife["number_of_visits"].describe())
print("number visits GeoLife+",nvisits_geolife_plus["number_of_visits"].describe())

In [None]:
# Radius of gyration + number of visits, GeoLife (left) vs GeoLife+ (right).
sns.set(style="ticks",font_scale=1.4)

fig = plt.figure(constrained_layout=True, figsize=(10, 5))
# Use the explicit pyplot attribute instead of a name leaked by `from pylab import *`.
gs = plt.GridSpec(2, 2, figure=fig)


def _plot_metric_column(column, title, rg_df, nvisits_df):
    """Draw the r_g (top row) and n_v (bottom row) histograms for one dataset.

    Parameters
    ----------
    column : int
        Column of the 2x2 grid to draw into (0 or 1).
    title : str
        Title placed above the top panel.
    rg_df : pandas.DataFrame
        Frame with a ``radius_of_gyration`` column.
    nvisits_df : pandas.DataFrame
        Frame with a ``number_of_visits`` column.

    Returns
    -------
    tuple
        The two axes created, ``(ax_rg, ax_nv)``.
    """
    ax_rg = fig.add_subplot(gs[0, column])
    ax_rg.grid(alpha=1)
    sns.histplot(rg_df.radius_of_gyration, bins=100, kde=True, ax=ax_rg,
                 color="r", stat="probability")
    ax_rg.set_title(title)
    # The y label names the metric shown in the row; the x label is left blank.
    ax_rg.set_ylabel("$r_g$")
    ax_rg.set_xlabel("")
    ax_rg.set_xlim([0, 30])

    ax_nv = fig.add_subplot(gs[1, column])
    sns.histplot(nvisits_df.number_of_visits, ax=ax_nv, kde=True,
                 color="green", stat="probability")
    ax_nv.grid(alpha=1)
    ax_nv.set_ylabel("$n_v$")
    ax_nv.set_xlabel("")
    return ax_rg, ax_nv


ax1, ax2 = _plot_metric_column(0, "GeoLife", rg_geolife, nvisits_geolife)
ax3, ax4 = _plot_metric_column(1, "GeoLife+", rg_geolife_plus, nvisits_geolife_plus)

fig.savefig(geo_life_analysis_path+"geo_life_mobility_metrics.png", dpi=300)



# Numbers for table of the paper

In [None]:
# Print the same set of table figures first for GeoLife, then for GeoLife+.
for points_df, rg_stats, visits_stats in (
    (geo_life, rg_geolife, nvisits_geolife),
    (geo_life_plus, rg_geolife_plus, nvisits_geolife_plus),
):
    user_ids = points_df["uid"].unique()
    trajectory_ids = points_df["tid"].unique()
    trajectories_per_user = points_df.groupby("uid").tid.nunique()

    print("total traces:", len(points_df))
    print("total users:", len(user_ids))
    print("total traj:", len(trajectory_ids))
    print("average traj user", trajectories_per_user.mean())
    print(rg_stats.describe())
    print(visits_stats.describe())


We start by defining some helper functions and importing some modules.

In [None]:

#helper functions to plot and save figures with pyplot
def plot_df(x, y, name, title="", xlabel='Date', ylabel='Value', dpi=100):
    """Plot a single series as a blue line with round markers, save it and show it.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points to draw.
    name : str
        File name appended to the global ``out_path`` to build the save path.
    title, xlabel, ylabel : str
        Text for the axes title and labels.
    dpi : int
        Resolution of the figure.
    """
    figure, axes = plt.subplots(figsize=(16, 5), dpi=dpi)
    axes.plot(x, y, color='tab:blue', marker="o")
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    axes.grid()
    figure.savefig(out_path + name)
    plt.show()

def plot_df_comparison(x1, y1, x2, y2, name, title, legend, xlabel="date", ylabel="Value", dpi = 100):
    """Plot two series on the same axes, save the figure and show it.

    Parameters
    ----------
    x1, y1 : array-like
        First series, drawn in blue with round markers.
    x2, y2 : array-like
        Second series, drawn in red with cross markers.
    name : str
        File name appended to the global ``out_path`` to build the save path.
    title : str
        Axes title.
    legend : sequence of str
        Legend entries, in plotting order (first series, then second).
    xlabel, ylabel : str
        Axis labels.
    dpi : int
        Resolution of the figure.
    """
    plt.figure(figsize=(16,5), dpi=dpi)
    plt.plot(x1, y1, color='tab:blue', marker="o")
    plt.plot(x2, y2, color='tab:red', marker="x")
    plt.legend(legend)
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.grid()
    plt.savefig(out_path + name)
    # The call parentheses were missing, which made this line a no-op.
    plt.show()
    
def plot_bar_comparison(users_orig, users_aug, trajs_orig, trajs_aug):
    """Grouped bar chart (log y-scale) of user and trajectory counts, original vs augmented.

    Parameters
    ----------
    users_orig, users_aug : int
        Number of unique users in the original / augmented dataset.
    trajs_orig, trajs_aug : int
        Number of unique trajectories in the original / augmented dataset.

    The figure is saved as ``out_path + "bar_comparison_orig_vs_aug.png"`` and shown.
    """
    group_labels = ("Users", "Trajectories")
    values_original = (users_orig, trajs_orig)
    values_augmented = (users_aug, trajs_aug)

    index = np.arange(len(group_labels))
    bar_width = 0.35
    opacity = 0.8

    fig, ax = plt.subplots()
    ax.bar(index, values_original, bar_width,
           alpha=opacity, color='tab:blue', label='Original')
    ax.bar(index + bar_width, values_augmented, bar_width,
           alpha=opacity, color='tab:red', label='Augmented')

    ax.set_xlabel('Attribute')
    ax.set_ylabel('Quantity')
    ax.set_yscale("log")
    ax.set_title('Unique users and trajectories comparison')
    # Bars are centre-aligned, so the middle of each pair is half a bar width
    # to the right of the first bar (index + bar_width sat under the second bar).
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(group_labels)
    ax.legend()

    fig.savefig(out_path + "bar_comparison_orig_vs_aug.png")
    plt.show()


We retrieve our datasets from disk

In [None]:
cols = ["date_time", "lat", "lon", "tid", "uid"]
cols_aug = ["lat", "lon", "uid", "tid","date_time"]

# `data_path` is required below but its assignment is commented out in the
# configuration cell; read it from the same configuration key.
data_path = conf["data_path"]

# Restricting to June - August 2008.
# NOTE: these hard-coded bounds replace the start_time/end_time read from conf.yaml.
start_time = "2008-06-01 00:00:00"
end_time = "2008-08-31 23:59:00"

# `parse_dates=True` only parses the index, which left `date_time` as plain
# strings; name the column explicitly so the period filter compares timestamps.
df = pd.read_csv(data_path + "complete_with_tids.csv",
                 usecols=cols, parse_dates=["date_time"])
in_period = (df.date_time > start_time) & (df.date_time < end_time)
original = df[in_period].copy()
augmented = pd.read_csv(data_path + "augmented_dataset.csv",
                        usecols=cols_aug, parse_dates=["date_time"])

In [None]:
# Quick look at the loaded frames: content, schema, user counts and largest tid.
print(augmented)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
augmented.info()
print(augmented.uid.nunique())
print(original.uid.nunique())
print(original.tid.max())
print(augmented.tid.max())

In [None]:
# Make sure `date_time` holds real timestamps in both frames.
augmented["date_time"] = pd.to_datetime(augmented["date_time"])
original["date_time"] = pd.to_datetime(original["date_time"])

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of through print() (which added a stray "None" line).
augmented.info()
original.info()

# Useful stats
We plot some useful stats on our augmented dataset

In [None]:
# Unique users and trajectories in each dataset.
aug_uid = augmented['uid'].nunique()
orig_uid = original['uid'].nunique()
aug_tid = augmented['tid'].nunique()
orig_tid = original['tid'].nunique()

# NOTE(review): the labels say "june 2008" while `original` is filtered on a
# June 1 - August 31 window in the loading cell; confirm the intended wording.
print(aug_uid, " unique users for june 2008 (augmented dataset)", sep="")
print(orig_uid, " unique users for june 2008 (original dataset)", sep="")

print(aug_tid, " unique tids for june 2008 (augmented dataset)", sep="")
print(orig_tid, " unique tids for june 2008 (original dataset)", sep="")

# Mean number of GPS points per augmented user.
total_points = len(augmented)
avg_pts = total_points/aug_uid
print("average points per user in the augmented dataset: {:f}, on a total of {:d} points".format(avg_pts, total_points))

In [None]:
# Index both frames by timestamp so they can be grouped by day/week below.
# The guard makes the cell safe to re-run: an unconditional set_index raises
# KeyError once `date_time` has already been moved into the index.
if augmented.index.name != 'date_time':
    augmented = augmented.set_index('date_time')
if original.index.name != 'date_time':
    original = original.set_index('date_time')

# Bar comparison
We plot a bar comparison using a logarithmic scale to show how we augmented our dataset.

In [None]:
# Bar comparison of the number of unique users and trajectories
# (original vs augmented); the counts come from the "Useful stats" cell.
plot_bar_comparison(orig_uid, aug_uid, orig_tid, aug_tid)

In [None]:
# Daily number of GPS points in the original dataset.
daily_bins = pd.Grouper(freq='D')
original_daily = original.groupby(daily_bins).size()
plot_df(
    x=original_daily.index,
    y=original_daily,
    name="original_daily_linear.png",
    title="Number of gps points per day in our period (original dataset)",
)

In [None]:
# Daily number of GPS points in the augmented dataset.
daily_bins = pd.Grouper(freq='D')
augmented_daily = augmented.groupby(daily_bins).size()
plot_df(
    x=augmented_daily.index,
    y=augmented_daily,
    name="augmented_daily_linear.png",
    title="Number of gps points per day in our period (augmented dataset)",
)

# Linear graphs
We plot a linear graph for both daily and weekly aggregations

In [None]:
# Daily comparison: original vs augmented point counts on the same axes.
plot_df_comparison(
    x1=original_daily.index,
    y1=original_daily,
    x2=augmented_daily.index,
    y2=augmented_daily,
    name="augmented_vs_original_daily.png",
    # Fixed the "Dailiy" typo that ended up in the saved figure title.
    title="Daily comparison between the original and augmented datasets",
    legend=["Original", "Augmented"],
)

In [None]:
# Weekly number of GPS points in the original dataset.
weekly_bins = pd.Grouper(freq='W')
original_weekly = original.groupby(weekly_bins).size()
plot_df(
    x=original_weekly.index,
    y=original_weekly,
    name="original_weekly_linear.png",
    title="Number of gps points per week in our period (original dataset)",
)

In [None]:
# Weekly number of GPS points in the augmented dataset.
weekly_bins = pd.Grouper(freq='W')
augmented_weekly = augmented.groupby(weekly_bins).size()
plot_df(
    x=augmented_weekly.index,
    y=augmented_weekly,
    name="augmented_weekly_linear.png",
    title="Number of gps points per week in our period (augmented dataset)",
)

In [None]:
# Weekly comparison: original vs augmented point counts on the same axes.
plot_df_comparison(
    x1=original_weekly.index,
    y1=original_weekly,
    x2=augmented_weekly.index,
    y2=augmented_weekly,
    name="augmented_vs_original_weekly.png",
    title="Weekly comparison between the original and augmented dataset",
    legend=["Original", "Augmented"],
)

##### We load the augmented dataset again. Start here if you only want the graphs below and want to save some RAM.

In [None]:
cols_aug = ["lat", "lon", "uid", "tid","date_time"]

# `data_path` is required here but its assignment is commented out in the
# configuration cell; read it from the same configuration key.
data_path = conf["data_path"]

# The cells below read `augmented.index.week` / `.weekday` and call
# `augmented.reset_index()` expecting a `date_time` column, so the frame must
# be indexed by parsed timestamps. `parse_dates=True` alone left a RangeIndex
# and a string column.
augmented = pd.read_csv(
    data_path + "augmented_dataset.csv",
    usecols=cols_aug,
    parse_dates=["date_time"],
    index_col="date_time",
)

### Gps points per week
The first week looks a little odd because June 1, 2008 was a Sunday and ISO weeks start on Monday.

In [None]:
# Count GPS points per (weekday, ISO week).
# assumes `augmented` is indexed by a DatetimeIndex built from `date_time` — TODO confirm
# The keys are built in a small separate frame: `week = augmented` was only an
# alias, so adding columns to it silently modified `augmented` itself.
# `DatetimeIndex.week` is no longer available in pandas 2; `isocalendar().week`
# gives the same ISO week number.
calendar = pd.DataFrame({
    "day": augmented.index.weekday.to_numpy(),
    "week": augmented.index.isocalendar().week.to_numpy(),
})
# Rows = weekday (0 = Monday), columns = ISO week.
week = calendar.groupby(["day", "week"]).size().unstack(level=-1)

sns.set(style = "ticks", font_scale = 1.45)
fig = plt.figure(figsize = (15, 8))

plt.title("GPS points per week in our augmented dataset", fontsize = 14)
img = sns.heatmap(week, robust = True, cmap="Reds",linewidths =0.3,square = False, cbar= True, cbar_kws={"orientation": "horizontal"})
plt.savefig(out_path + "gps_points_per_week_augmented.jpg")

### Visits per time unit
(on a weekly basis)

In [None]:
import skmob
import datetime
from skmob.measures.collective import visits_per_time_unit

#print(augmented.reset_index())

# Build a trajectory frame from the augmented points, sorted by time.
# reset_index() is used so that `date_time` is available as a column
# (presumably it is the index at this point — confirm).
tdf = skmob.TrajDataFrame(augmented.reset_index(), longitude = "lon", datetime = "date_time").sort_values(by='datetime')
# Visits aggregated with a weekly ("W") time unit.
vtu_df = visits_per_time_unit(tdf, time_unit = "W")
print(vtu_df)

We need to get the number of users per week in order to plot a meaningful heatmap

In [None]:
# Number of distinct users per (weekday, ISO week).
# assumes `augmented` is indexed by a DatetimeIndex built from `date_time` — TODO confirm
# The keys are built in a small separate frame: `users = augmented` was only an
# alias, so adding columns to it silently modified `augmented` itself.
# `DatetimeIndex.week` is no longer available in pandas 2; `isocalendar().week`
# gives the same ISO week number.
calendar_uid = pd.DataFrame({
    "day": augmented.index.weekday.to_numpy(),
    "week": augmented.index.isocalendar().week.to_numpy(),
    "uid": augmented["uid"].to_numpy(),
})
# Rows = weekday (0 = Monday), columns = ISO week.
users = calendar_uid.groupby(["day", "week"])["uid"].nunique().unstack(level=-1)

print(users)

Now we plot the number of visits per week along with a heatmap of active users

In [None]:
import matplotlib.gridspec as gridspec
sns.set(style="whitegrid",font_scale=1.4)
fontsize = 10

fig = plt.figure(constrained_layout=False,figsize=(13, 5))
gs = gridspec.GridSpec(2, 4, figure=fig)

# Top panel: number of visits per week.
ax1 = fig.add_subplot(gs[0, 0:4])
ax1.plot(vtu_df.n_visits, marker='o', color="r")
plt.yticks(size=fontsize, rotation=30)  # acts on ax1, the current axes
ax1.set_xticks([])
ax1.set_ylabel("number of visits")
ax1.grid(alpha=0.2, color="black")

# Bottom panel: heatmap of active users per weekday and week.
ax2 = fig.add_subplot(gs[1, 0:4])
# Pass the target axes explicitly instead of relying on the current axes.
sns.heatmap(users, robust = True, cmap="Reds", linewidths=0.3, square=False,
            cbar=False, cbar_kws={"orientation": "horizontal"}, ax=ax2)

# Weekday 0 is Monday; the label for Friday was misspelled "Fry".
ax2.set_yticklabels(["Mon","Tue","Wed","Thu","Fri","Sat","Sun"])
plt.yticks(size=fontsize, rotation=30)  # acts on ax2, the current axes

ax2.set_xlabel('weeks', fontsize=fontsize)
ax2.set_ylabel("active users")
plt.savefig(out_path + "visits_week_active_users.jpg")

### Individual radius of gyration

In [None]:
from skmob.measures.individual import radius_of_gyration

# Trajectory frame built from the augmented points; reset_index() is used so
# that `date_time` is available as a column (presumably it is the index — confirm).
tdf = skmob.TrajDataFrame(augmented.reset_index(), longitude = "lon", user_id = "uid", datetime = "date_time")
# Radius of gyration computed by scikit-mobility on the trajectory frame.
rg_df = radius_of_gyration(tdf)

print(rg_df.head())

### Individual number of visits

In [None]:
from skmob.measures.individual import number_of_visits

# Trajectory frame built from the augmented points; reset_index() is used so
# that `date_time` is available as a column (presumably it is the index — confirm).
tdf = skmob.TrajDataFrame(augmented.reset_index(), longitude = "lon", user_id = "uid", datetime = "date_time")
# Number of visits computed by scikit-mobility on the trajectory frame.
num_v_df = number_of_visits(tdf)

print(num_v_df.head())

#### Plotting together radius of gyration and number of visits PDFs

In [None]:
# Radius of gyration + pdf visits
from skmob.measures.collective import visits_per_location
from skmob.measures.individual import radius_of_gyration
# `mpl_toolkits.axes_grid` is the deprecated location; `axes_grid1` is the
# maintained package providing the same `inset_axes`.
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.gridspec as gridspec


sns.set(style="ticks",font_scale=1.2)

fontsize = 10
fig = plt.figure(constrained_layout=False,figsize=(10, 5))
gs = gridspec.GridSpec(2, 1, figure=fig)

# PDF of the radius of gyration.
ax1 = fig.add_subplot(gs[0,0:])
ax1.grid(alpha=0.5)
# `sns.distplot` is deprecated; `histplot(stat="density")` is its replacement
# for a normalised histogram and is already used elsewhere in this notebook.
sns.histplot(rg_df["radius_of_gyration"], bins=50, kde=True, stat="density", ax=ax1)
plt.yticks(rotation=30)  # acts on ax1, the current axes
ax1.set_ylabel("PDF $r_g$")

# PDF of the number of visits, on log-log axes.
ax2 = fig.add_subplot(gs[1,0:])
sns.histplot(num_v_df.number_of_visits, stat="density", ax=ax2)
ax2.grid(alpha=0.5)
ax2.set_ylabel("PDF n. of visits")
ax2.loglog()
ax2.set_xlabel("")

plt.savefig(out_path + "Radius_PDF_visits.png", dpi  = 300)


### Frequency and recency

- #### Frequency

In [None]:
from skmob.measures.individual import frequency_rank

# Trajectory frame built from the augmented points; reset_index() is used so
# that `date_time` is available as a column (presumably it is the index — confirm).
tdf = skmob.TrajDataFrame(augmented.reset_index(), longitude = "lon", user_id = "uid", datetime = "date_time")
# Frequency rank computed by scikit-mobility; later merged on uid/lat/lng.
fr_df = frequency_rank(tdf)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
fr_df.info()

- #### Recency

In [None]:
from skmob.measures.individual import recency_rank

# Trajectory frame built from the augmented points; reset_index() is used so
# that `date_time` is available as a column (presumably it is the index — confirm).
tdf = skmob.TrajDataFrame(augmented.reset_index(), longitude = "lon", user_id = "uid", datetime = "date_time")
# Recency rank computed by scikit-mobility; later merged on uid/lat/lng.
rr_df = recency_rank(tdf)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
rr_df.info()

#### Frequency and Recency joint plot

In [None]:
sns.set(style="ticks",font_scale=1.2)

# Pair each (user, location) frequency rank with its recency rank.
merged_df = pd.merge(fr_df, rr_df, on=['uid', 'lat', 'lng'])
# Keep only the top-ranked locations on both axes to keep the KDE tractable.
top_ranked = (merged_df.frequency_rank <= 5000) & (merged_df.recency_rank <= 5000)
short_merged_df = merged_df[top_ranked]

# Current seaborn accepts x/y only as keywords; passing the two Series
# positionally fails there. Column names still provide the axis labels.
sns.jointplot(data=short_merged_df, x="frequency_rank", y="recency_rank", kind="kde")

plt.savefig(out_path + "rec_vs_fre_augmented.png", dpi  = 300,bbox_inches = 'tight')

#### Some more visualization
We visualize some of the trajectories and plot a datashader heatmap of the augmented dataset

In [None]:
#print(trajs)

# Trajectory frame restricted to the points of a single user (uid 1004).
tdf = skmob.TrajDataFrame(augmented[(augmented["uid"] == 1004)].reset_index(), longitude = "lon", datetime = "date_time")
print(tdf)


# Draw that user's trajectory on an interactive map.
# NOTE(review): the 'Stamen Toner' tile set may not be available in recent
# map-backend releases — confirm it still renders.
tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')

In [None]:
# Datashader imports used to rasterise the augmented points.
from colorcet import fire
from datashader import transfer_functions as tf
import datashader as ds

from datashader.utils import lnglat_to_meters as webm

#df_
#augmented_t = augmented[(augmented['lat'].between(39, 41.5)) & (augmented['lon'].between(115, 117.5))]
# Work on a deep copy so that the projected columns are not added to `augmented`.
df1 = augmented.copy(deep=True)


# Project longitude/latitude with datashader's lnglat_to_meters helper;
# the results are stored as the x and y columns.
df1.loc[:, 'x'], df1.loc[:, 'y'] = webm(augmented.lon,augmented.lat)

#print(df1.head())
#print(df1.info())

"""plot_width = 1000
plot_height = 1000
"""
# Aggregate the points on a default-sized canvas, shade with the `fire`
# colormap on a black background and export the image under out_path.
agg = ds.Canvas().points(df1, 'x', 'y')
img = tf.set_background(tf.shade(agg, cmap=fire),"black")
ds.utils.export_image(img=img,filename= out_path + 'beijing_dshader_augmented', fmt=".png", background="black")

In [None]:
import holoviews as hv
from holoviews.element.tiles import EsriImagery
from holoviews.operation.datashader import datashade
hv.extension('bokeh')

# Semi-transparent Esri imagery used as the background layer.
map_tiles  = EsriImagery().opts(alpha=0.5, width=800, height=800, bgcolor='black')
# Points taken from the projected x/y columns of df1, rasterised with datashader.
points     = hv.Points(df1, ['x', 'y'])
tracks = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=600, height=600)


# Overlay the rasterised tracks on the tiles, save the result and display it.
view = map_tiles * tracks
hv.save(view, out_path + "beijing_holoview.png", fmt="auto")
view