In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data

In [2]:
# Read Berlin Tapas data (takes up to 20s on my machine)
path = './VF_data/pandemos_tra_tapas_modell.csv'  # insert your path here
df = pd.read_csv(path)
# An activity ends `activity_duration_min` minutes after it starts
df['activity_end_min'] = df['activity_start_min'].add(df['activity_duration_min'])

In [3]:
# Show the loaded trajectories (pandas abbreviates long frames to the first/last rows)
df

Unnamed: 0,p_id,taz_id_start,loc_id_start,lon_start,lat_start,taz_id_end,loc_id_end,lon_end,lat_end,start_time_min,travel_time_sec,mode,activity,activity_start_min,activity_duration_min,activity_end_min
0,101357880,468,302652,13.159688,52.549478,467,-100792825,13.204950,52.560776,868,1316,2,10,890,145,1035
1,101357880,467,-100792825,13.204950,52.560776,466,302659,13.199601,52.565567,1035,924,0,740,1050,225,1275
2,101357880,466,302659,13.199601,52.565567,467,-100792825,13.204950,52.560776,1275,924,0,10,1290,120,1410
3,100020055,87,-100020055,13.397226,52.536221,728,187669,13.406449,52.463404,329,3655,5,212,390,900,1290
4,100020055,728,187669,13.406449,52.463404,87,-100020055,13.397226,52.536221,1290,3726,5,10,1352,98,1450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578532,100141239,12,-100091850,13.336447,52.532269,915,-1,13.567840,52.457026,1201,2645,3,724,1245,105,1350
13578533,102787471,950,-101582536,13.715186,52.427953,915,-1,13.567840,52.457026,1204,2460,3,724,1245,105,1350
13578534,103202694,1094,-101808736,13.478268,52.509100,915,-1,13.567840,52.457026,1217,1681,3,724,1245,105,1350
13578535,101243442,437,-100726467,13.321122,52.481955,915,-1,13.567840,52.457026,1206,2345,6,724,1245,105,1350


# Select hood

In [4]:
def select_hood(df, centroid, width):
    """Return all trajectories of agents whose household lies in a lon-lat box.

    The box is centred on ``centroid`` and spans ``width`` along both the
    longitude and the latitude axis (bounds inclusive).  An agent is selected
    when at least one of its trips starts inside the box from a location with
    ``loc_id_start < -1``; per the original author's note, that is how a trip
    starting from the agent's own household is marked.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectories with at least the columns ``lon_start``, ``lat_start``,
        ``loc_id_start`` and ``p_id``.
    centroid : sequence
        ``centroid[0]`` is the longitude and ``centroid[1]`` the latitude of
        the box centre.
    width : float
        Edge length of the box, in the same units as the lon/lat columns.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` whose ``p_id`` belongs to a selected agent.

    Side effect: prints how many agents were selected.
    """
    half_width = .5 * width
    centre_lon, centre_lat = centroid[0], centroid[1]

    # One boolean mask per criterion, combined below
    inside_lon = df.lon_start.between(centre_lon - half_width, centre_lon + half_width)
    inside_lat = df.lat_start.between(centre_lat - half_width, centre_lat + half_width)
    from_household = df.loc_id_start < -1

    # Agents with at least one household trip starting inside the box
    nodes_box = df.loc[inside_lon & inside_lat & from_household, 'p_id'].unique()
    print('Number of nodes seleceted: ', len(nodes_box))

    # Keep every trajectory of those agents, wherever it starts
    return df[df.p_id.isin(nodes_box)]

# Box centre as (longitude, latitude) -- select_hood reads index 0 as lon and index 1 as lat
centroid = (13.199601, 52.565567)
# Keep all trajectories of agents whose household lies in a box of width .004 around the centroid
df_hood = select_hood(df, centroid, .004)
df_hood

Number of nodes seleceted:  982


Unnamed: 0,p_id,taz_id_start,loc_id_start,lon_start,lat_start,taz_id_end,loc_id_end,lon_end,lat_end,start_time_min,travel_time_sec,mode,activity,activity_start_min,activity_duration_min,activity_end_min
22426,101383825,466,-100800470,13.200512,52.563988,467,302657,13.203437,52.564121,895,451,0,522,902,25,927
22427,101383825,467,302657,13.203437,52.564121,476,198546,13.210325,52.558965,927,1124,0,50,946,30,976
22428,101383825,476,198546,13.210325,52.558965,467,22951,13.202489,52.559971,976,839,0,50,990,35,1025
22429,101383825,467,22951,13.202489,52.559971,466,-100800470,13.200512,52.563988,1025,735,0,10,1037,5,1042
22430,101383825,466,-100800470,13.200512,52.563988,520,278132,13.260552,52.537342,1042,3129,5,211,1094,5,1099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13564245,101363338,466,-100794644,13.199065,52.566347,915,-1,13.567840,52.457026,1189,3330,6,724,1245,105,1350
13574707,101363339,466,-100794644,13.199065,52.566347,915,-1,13.567840,52.457026,1189,3330,3,724,1245,105,1350
13574903,101349313,466,-100788567,13.198975,52.566591,915,-1,13.567840,52.457026,1189,3332,3,724,1245,105,1350
13575359,101303632,466,-100755316,13.198975,52.566591,915,-1,13.567840,52.457026,1189,3332,3,724,1245,105,1350


# Get contacts

In [5]:
def get_location_contacts(location):
    """Return every pairwise contact between the activities at one location.

    Two activities are in contact when their time intervals overlap by a
    strictly positive amount; intervals that merely touch (one ends exactly
    when the other starts) do not count.  Each unordered pair of rows is
    reported once, with the earlier row of ``location`` as ``p_A``.

    Parameters
    ----------
    location : pandas.DataFrame
        All activities at a single location, with the columns ``p_id``,
        ``activity_start_min``, ``activity_end_min`` and ``loc_id_end``.
        The ``loc_id_end`` of the first row is used as the location id for
        every reported contact.

    Returns
    -------
    pandas.DataFrame
        Columns ``p_A``, ``p_B``, ``start_of_contact``, ``end_of_contact``
        and ``loc_id`` (int32), one row per contact.  The frame is empty, but
        still has these columns, when there are no contacts or ``location``
        has no rows.
    """
    if len(location) == 0:
        # No rows means no location id to read and no contacts to report
        empty_int = np.array([], dtype='int')
        return pd.DataFrame({'p_A': empty_int, 'p_B': empty_int,
                             'start_of_contact': empty_int,
                             'end_of_contact': empty_int,
                             'loc_id': np.array([], dtype='int32')})

    start_times = location.activity_start_min.values
    end_times = location.activity_end_min.values
    event_ids = location.p_id.values
    loc_id_end = location.loc_id_end.values[0]

    # Pairwise overlap window: latest start and earliest end of each pair
    overlap_start = np.maximum.outer(start_times, start_times)
    overlap_end = np.minimum.outer(end_times, end_times)

    # A pair is in contact when its window has positive length.  Comparing
    # the bounds directly avoids casting durations to a narrow integer type,
    # which would wrap around for very long overlaps.  Keeping only the
    # strict upper triangle drops self-pairs and mirrored duplicates.
    in_contact = np.triu(overlap_end > overlap_start, 1)

    # Extract contact rows, cols
    rows, cols = np.where(in_contact)
    p_A = event_ids[rows].astype('int')

    # Save contacts to new DataFrame
    contact_data = {'p_A': p_A, 'p_B': event_ids[cols].astype('int'),
                    'start_of_contact': overlap_start[rows, cols].astype('int'),
                    'end_of_contact': overlap_end[rows, cols].astype('int'),
                    'loc_id': np.repeat(loc_id_end, len(p_A)).astype('int32')}

    return pd.DataFrame(contact_data)

In [6]:
def get_all_contacts(df_hood, min_contact_duration=None):
    """Collect the contacts at every destination location of ``df_hood``.

    Rows are grouped by ``loc_id_end``; each group is handed to
    ``get_location_contacts`` and the per-location results are stacked into
    one frame with a fresh 0..n-1 index.

    The groups are iterated explicitly rather than through
    ``groupby(...).apply``: ``get_location_contacts`` reads the grouping
    column ``loc_id_end``, and recent pandas releases no longer pass the
    grouping column to the function given to ``apply``.

    Parameters
    ----------
    df_hood : pandas.DataFrame
        Trajectories with the columns ``get_location_contacts`` needs,
        including ``loc_id_end``.
    min_contact_duration : number, optional
        When truthy, contacts shorter than this are dropped (the index of the
        remaining rows is not renumbered).

    Returns
    -------
    pandas.DataFrame
        The contact columns produced by ``get_location_contacts`` plus
        ``contact_duration`` (``end_of_contact - start_of_contact``).  An
        input without any rows yields an empty frame with those columns.
    """
    # Get contacts per location, in sorted order of loc_id_end
    contact_frames = [get_location_contacts(location)
                      for _, location in df_hood.groupby('loc_id_end')]

    if contact_frames:
        df_contacts = pd.concat(contact_frames, ignore_index=True)
    else:
        # No locations at all: return the expected schema instead of failing
        # on the column accesses below
        df_contacts = pd.DataFrame({
            name: np.array([], dtype='int64')
            for name in ('p_A', 'p_B', 'start_of_contact', 'end_of_contact', 'loc_id')
        })

    # Calculate contact durations
    df_contacts['contact_duration'] = df_contacts.end_of_contact - df_contacts.start_of_contact

    # (optional) drop contacts below specified contact duration
    if min_contact_duration:
        df_contacts = df_contacts[df_contacts.contact_duration >= min_contact_duration]

    return df_contacts

# All contacts in the selected neighbourhood; no minimum contact duration is applied
contacts = get_all_contacts(df_hood)

# A small example

In [7]:
# Restrict the neighbourhood to three destination locations, ordered by
# location and then by activity start, so the contacts are easy to follow
example_loc_ids = [22951, -100800470, 198546]
at_example_locs = df_hood.loc_id_end.isin(example_loc_ids)
small_hood = df_hood[at_example_locs].sort_values(['loc_id_end', 'activity_start_min'])
small_hood[['p_id', 'activity_start_min', 'activity_end_min', 'loc_id_end']]

Unnamed: 0,p_id,activity_start_min,activity_end_min,loc_id_end
23084,101383823,793,1173,-100800470
22429,101383825,1037,1042,-100800470
22431,101383825,1150,1190,-100800470
22790,101383822,1280,1400,-100800470
8192065,101380824,427,432,22951
22428,101383825,990,1025,22951
3274454,101300410,1065,1350,22951
13106672,101358781,288,295,198546
9158712,101360796,630,631,198546
5641662,101314977,654,689,198546


In [8]:
# Contacts among the activities of the small example only
small_hood_contacts = get_all_contacts(small_hood)
small_hood_contacts

Unnamed: 0,p_A,p_B,start_of_contact,end_of_contact,loc_id,contact_duration
0,101383823,101383825,1037,1042,-100800470,5
1,101383823,101383825,1150,1173,-100800470,23
2,101314977,101375646,675,680,198546,5
3,101314977,101355670,687,689,198546,2
4,101358781,101314559,864,884,198546,20
5,101314559,101383825,946,976,198546,30
6,101314559,101332706,956,961,198546,5
7,101314559,101338166,989,1009,198546,20
8,101314559,101318342,994,1004,198546,10
9,101383825,101332706,956,961,198546,5


# Generate dynamic network with tacoma <br>
http://rocs.hu-berlin.de/~tacoma/temporal_networks/temporal_network_classes.html

In [9]:
import tacoma as tc

In [10]:
# Normalize node ids for tacoma and easy indexing:
# every p_id in the neighbourhood gets a consecutive integer 0..N-1
unique_nodes = df_hood.p_id.unique()
node_int_dict = dict(zip(unique_nodes, np.arange(len(unique_nodes))))
# NOTE(review): this overwrites the id columns of `contacts` in place, so the
# cell is meant to run exactly once per freshly computed `contacts`
contacts['p_A'] = contacts['p_A'].map(node_int_dict)
contacts['p_B'] = contacts['p_B'].map(node_int_dict)
contacts

Unnamed: 0,p_A,p_B,start_of_contact,end_of_contact,loc_id,contact_duration
0,266,267,716,816,-100803017,100
1,266,267,882,1190,-100803017,308
2,266,269,1016,1076,-100803017,60
3,266,269,1150,1190,-100803017,40
4,266,270,696,891,-100803017,195
...,...,...,...,...,...,...
1711,640,683,519,634,303529,115
1712,222,768,883,957,303535,74
1713,352,710,1012,1013,303930,1
1714,362,710,914,1004,303930,90


In [11]:
# Initialize tacoma temporal network (edge_changes representation)
tn = tc.edge_changes()
tn.N = len(unique_nodes)
# Time axis: one entry per integer time step between the first contact start
# and the last contact end (both inclusive)
tmax, tmin = contacts.end_of_contact.max(), contacts.start_of_contact.min()
Nt = tmax - tmin + 1
tn.t = list(range(tmin, tmax + 1))
tn.tmax = tmax + 1
# NOTE(review): the contact times are derived from the *_min columns, yet the
# unit label is '20s' -- confirm this is the intended label
tn.time_unit = '20s'

# Make edges: one (initially empty) edge list per time step; the offset
# `time - tmin` maps a contact time onto its position in tn.t
edges_in, edges_out = [[] for _ in range(Nt)], [[] for _ in range(Nt)]
for _, c in contacts.iterrows():
    # The edge is switched on at the start of the contact ...
    edges_in[c.start_of_contact - tmin].append([c.p_A, c.p_B])
    # ... and switched off at its end
    edges_out[c.end_of_contact - tmin].append([c.p_A, c.p_B])

tn.edges_in = edges_in
tn.edges_out = edges_out

# Check for errors (the recorded output of this cell is 0)
tc.verify(tn)

0

Tacoma has many interesting tools. For small networks, it is even possible to create an interactive plot. However, it can be tricky to exit the visualization.

In [None]:
# Open tacoma's interactive visualization of the temporal network built above.
# NOTE(review): frame_dt=1 presumably sets the time advanced per frame -- confirm in the tacoma docs
from tacoma.interactive import visualize
visualize(tn, frame_dt=1) 