# Compute the coverage for a given scenario
A scenario defines the list of locations to consider for the coverage model. Scenarios are generated with notebook 5. Examples are: Mix, Subway, POis, Grid
- 1.Mix scenario


In [1]:
#! pip install pandas
#! pip install numpy
#! pip install matplotlib
#! pip install yaml
#! pip install scipy
#! pip install seaborn
#! pip install os
#! pip install math
#! pip install datetime

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import scipy.integrate as integrate
from scipy.stats import expon
import seaborn as sns
import os 
from datetime import datetime
import math



with open("conf.yaml") as f:
    conf = yaml.load(f, Loader = yaml.FullLoader)

out_path = conf["out_path"]
poi_path = conf["poi_path"]
geolife_data_path = conf["geolife_data_path"]
geolife_path = conf["geolife_path"]
scales_values = conf["scales_values"]
detour_radius = conf["detour_radius"] 

beijing_lat_min = conf["beijing_lat_min"]
beijing_lat_max = conf["beijing_lat_max"]
beijing_lon_min = conf["beijing_lon_min"]
beijing_lon_max = conf["beijing_lon_max"]

#used to parse the conf scales
def convert(s):
    try:
        return float(s)
    except ValueError:
        num, denom = s.split('/')
        return float(num) / float(denom)

scales = (list(map(convert, scales_values)))

# Check and read the start end period period for computing the coverage. 
if "coverage_start_period" and "coverage_end_period" in conf:
    coverage_start_period = conf["coverage_start_period"]
    coverage_end_period = conf["coverage_end_period"]
else:
    # coverage period overlaps with the start and end time of the data set
    coverage_start_period = conf["start_time"]
    coverage_end_period = conf["end_time"]

#start_time_obj = datetime.strptime(coverage_start_period, '%Y-%m-%d %H:%M:%S')
start_time_obj = datetime.strptime(coverage_start_period, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%d")

#end_time_obj = datetime.strptime(coverage_end_period, '%Y-%m-%d %H:%M:%S')
end_time_obj = datetime.strptime(coverage_end_period, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%d")
#a = str(start_time_obj.year)+"-"+str(start_time_obj.month)+"-"+str(start_time_obj.day)
#b = str(end_time_obj.year)+"-"+str(end_time_obj.month)+"-"+str(end_time_obj.day)
#suffix = a+"_"+b
suffix = start_time_obj+"_"+end_time_obj


print(conf)
print(suffix)

{'geolife_data_path': '../../dataset', 'geolife_path': '../../dataset/geo_life_full.csv', 'geo_life_analysis_path': '../output/GeoLife_analysis/', 'out_path': '../output/coverage/', 'poi_path': '../output/coverage/MIX.csv', 'start_time': '2008-07-01 00:00:00', 'end_time': '2009-12-31 23:59:59', 'beijing_lat_min': 39.54, 'beijing_lat_max': 40.3, 'beijing_lon_min': 115.75, 'beijing_lon_max': 117.13, 'speed_filter': 500, 'stop_radius_factor': 0.1, 'minutes_for_a_stop': 60, 'spatial_radius_km': 0.1, 'extended_medoids': 100, 'cluster_radius_km': 0.2, 'min_cluster_stop_sample': 4, 'interpolation_distance': 200, 'n_cars': 0, 'n_bikes': 0, 'n_pedestrians': 100, 'n_trajectory_user': 50, 'beijing_center_lon': 116.3912757, 'beijing_center_lat': 39.906217, 'distance_from_center': 50000, 'sw_lon': 115.7657, 'sw_lat': 39.6332, 'ne_lon': 116.7435, 'ne_lat': 40.1411, 'gripd_stepsize': 6561.68, 'scales_values': [50], 'detour_radius': 800, 'coverage_start_period': '2009-05-01 00:00:00', 'coverage_end_pe

In [3]:
# Load the dataset
cols = ["lat", "lon", "uid", "tid","date_time"]
dataset = pd.read_csv(geolife_path, usecols=cols, parse_dates = True)

#restricting to the coverage period, if specified
if "coverage_start_period" and "coverage_end_period" in conf:
    dataset = dataset[(dataset.date_time >= coverage_start_period) & (dataset.date_time <= coverage_end_period)]
    
#restricting mobility to beijing area
dataset = dataset[(dataset['lat'].between(beijing_lat_min, beijing_lat_max )) & (dataset['lon'].between(beijing_lon_min, beijing_lon_max))]

print("GeoLife augmented loaded with period:",suffix)
print(len(dataset))

GeoLife augmented loaded with period: 2009-05-01_2009-05-31
982306


In [4]:
 # Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = (np.sin(dlat/2)**2) + (np.cos(lat1)) * (np.cos(lat2)) * (np.sin(dlon/2)**2)
    c = 2 * np.arcsin(np.sqrt(a))
    #c = np.arctan2(np.sqrt(a), np.sqrt(1-a))
    #total_meters = METERS * c
    r = 6371000 #radiu * 1000 to return meters
    return (c * r)

# Coverage functions
def coverage(locations_id, distances, scale):
    coverage = pd.DataFrame()
    coverage["id_location"] = locations_id
    coverage["probability"] = calculate_coverage(locations_id, distances, scale)
    return coverage

def calculate_coverage(locations_id, distances, scale):
    coverages = []
    
    #initialize coverage inner function
    inner = lambda x: expon.pdf(x,scale=scale)
    print("Calculating coverage with scale:",scale)
    counter = 0
    for loc in locations_id:
        #print("Processing location,count, tot: ",loc,counter,len(locations_id))
        counter+=1
        dists_location = distances[distances.id_location == loc]
        #if the location has no points whatsoever, assign 0 to the coverage value and continue
        if (len(dists_location) == 0):
            coverages.append(0)
            #print("no points for location: ",loc)
            continue
        
        # the list of ids of the users with some points close to the location
        uids = dists_location.uid.unique()
        # users
        w = []
        for user in uids:
            #print(user)
            dists_location_user = dists_location[(dists_location.uid == user)]
            #print("user:",dists_location_user)
            if(len(dists_location_user) == 0):
                #print("WARNING: no user's points for: ",loc)
                continue
            # Trajectories
            tids = dists_location_user.tid.unique()
            k = []
            for traj in tids:
                dists_location_user_traj = dists_location_user[(dists_location_user.tid == traj)]
                #print("traj:",dists_location_user_traj)
                if(len(dists_location_user_traj) == 0):
                    #print("WARNIN: no user's traj for: ",loc)
                    continue
                # min distance
                min_distance  = dists_location_user_traj["distance"].iloc[0]
                #y = integrate.quad(inner, min_distance, np.inf)[0]
                ## Truncate of the Exponential Distribution
                y = integrate.quad(inner, min_distance, np.inf)[0]/(1-math.exp(-scale*detour_radius))

                #print("min_distance",min_distance)
                #print("1-pdf",y)
                k.append(1-y)
                #print("cumulato",k)
            m=1-np.prod(k)
            w.append(1-m)
        coverages.append(1-np.prod(w))

    #import pdb; pdb.set_trace()                    
    return coverages

#calculates coverages on multiple scale values and serializes them on disk
def coverage_multiple_scales(scales, locations_id, distances,scenario,locations):
    for scale in scales:
        # copute the coverage prob
        coverages = coverage(locations_id, distances, scale)
        #import pdb; pdb.set_trace()
        #merge with df_t dataframe with location positions on "location" id then serialize on disk
        merged = locations.merge(coverages, on="id_location")
        name =  "coverages_" +suffix+".csv"
        scale_dest = str(float(scale))
        merged.to_csv(os.path.join(out_path,"scenario",scenario,scale_dest, name))
        

## 1. Mix scenario
- Load the scenario with locations selected according to MIX scenario: POIs + Random points
- Filter locations according to the detour_radius
- Extract the IDS of the locations
- Invoke the coverage_multiple_scales function for generating the coverage probability  


In [5]:
locations = pd.read_csv(poi_path)
print("Locations found ", len(locations))

# filter POIs according to the Beijing area
locations = locations[(locations['lat'].between(beijing_lat_min, beijing_lat_max )) & (locations['lon'].between(beijing_lon_min, beijing_lon_max))]


# list of distances
dists = []
count = 0
counter = 0

for index, location in locations.iterrows():
    #print("Processing:",counter,len(locations))
    counter+=1
    #print("location:",index)
   
    # distance between each loacation and ALL the dataset locations (input is: lat, lon, np.array(), np.array())
 #   import pdb; pdb.set_trace()                    
    d_i_h = pd.DataFrame()
   # d_i_h["distance"] = haversine(location.lat, location.lon, dataset["lat"].values, dataset["lon"].values)
    d_i_h["uid"] = dataset["uid"]
    d_i_h["tid"] = dataset["tid"]
    d_i_h["lon"] = dataset["lon"]
    d_i_h["lat"] = dataset["lat"]
    d_i_h["distance"] = haversine(location.lat, location.lon, dataset.lat, dataset.lon)
   
   # d_i_h["index"] = range(1, len(d_i_h) + 1)
   # d_i_h["date_time"] = dataset["date_time"] 
   # print(poi_path)
   # print(location)
   # pd.set_option('display.max_rows', None)
    #print("before sorting \n",d_i_h[(d_i_h.tid == 18949) & (d_i_h.index==6018)])

    
    # filter distances w.r.t detour_radius
    d_i_h = d_i_h[d_i_h.distance <= detour_radius]
  
    if (len(d_i_h) > 0):
        d_i_h = d_i_h.groupby(by="tid").min()
        d_i_h.reset_index(inplace=True)
        d_i_h["id_location"] = location.id_location
        # TODO: aggiungere la posizione della distanza minima
        dists.append(d_i_h)
    else:
        #d_i_h = pd.DataFrame({"location":location.id_location, "uid":np.nan, "distance":np.nan}, index=[0])
        count += 1
     #   d_i_h = d_i_h.iloc[0:0]  
    
print("There are {:d} locations that have no points in detour radius' range".format(count)) 
subway_dists = pd.concat(dists)
subway_dists.to_csv(out_path+"subway_dists_dr_"+str(detour_radius)+"_"+suffix+".csv")
#print(d_i_h)

Locations found  1954
There are 193 locations that have no points in detour radius' range


In [6]:
# to be executed after distance processing
# list of locations
locations = pd.read_csv(poi_path)
locations_ids = locations.id_location.unique()
print("Number of ocations of interests: ", len(locations_ids))

# the distances from the locations
dist_name = out_path+"subway_dists_dr_"+str(detour_radius)+"_"+suffix+".csv"
print("Reading: ",dist_name)
subway_dists = pd.read_csv(dist_name)
subway_dists.drop(columns=["Unnamed: 0"],inplace=True)
                      
print("Locations of interests with points in detour radius range: ", len(subway_dists.id_location.unique()))
print(subway_dists)

Number of ocations of interests:  1954
Reading:  ../output/coverage/subway_dists_dr_800_2009-05-01_2009-05-31.csv
Locations of interests with points in detour radius range:  1722
         tid  uid         lon        lat    distance   id_location
0       5731   41  115.973697  39.639435  483.909737  2.391265e+09
1       5732   41  115.961088  39.638997  261.238860  2.391265e+09
2       8863   84  115.928728  39.682084  308.118320  4.643050e+09
3       5731   41  116.071088  39.716338  289.853397  5.315328e+09
4       5732   41  116.082860  39.716768  665.720593  5.315328e+09
...      ...  ...         ...        ...         ...           ...
53063  16977  155  116.332175  39.974827   86.243664  2.311466e+04
53064  16978  155  116.330922  39.974257  120.912107  2.311466e+04
53065  16979  155  116.329353  39.974843  270.243668  2.311466e+04
53066  18651  180  116.330222  39.976189  404.207409  2.311466e+04
53067  18652  180  116.330479  39.976229  376.226066  2.311466e+04

[53068 rows x 6 

## compute the coverage given the:
- scales
- ids of the locations
- distances
- name of the scenario

In [7]:
scenario_name = "mix" #used to get a meeaningful name for out .csv
coverage_multiple_scales(scales, locations_ids, subway_dists, scenario_name,locations)

Calculating coverage with scale: 50.0
