# Data preparation

[SocioPatterns.org/Datasets](http://www.sociopatterns.org/datasets/)

In [1]:
import os
# Root folder of the local SocioPatterns download (relative path).
# Alternative kept for reference: os.path.expanduser(r'~\Desktop\MScThesis\SocioPatterns')
base_dir = '../../../SocioPatterns'

# One sub-folder per dataset used in this notebook.
work_contacts_dir, primary_school_dir, hospital_ward_dir = (
    os.path.join(base_dir, dataset_folder)
    for dataset_folder in (
        'Contacts in a workplace',
        'Primary school temporal',
        'Hospital ward',
    )
)

# Folder that receives the exported rules, config and metadata files.
out_dir = os.path.expanduser('.')

In [2]:
# This workbook needs pathpy2 installed (it is imported below as `pathpy`).
# List the matching packages of the active conda environment to check the version.
%conda list pathpy

# packages in environment at C:\Users\mstud\Anaconda3\envs\pathpy2:
#
# Name                    Version                   Build  Channel
pathpy2                   2.2.0                    pypi_0    pypi

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pathpy as pp
import numpy as np
import scipy.sparse.linalg as sla
from collections import defaultdict

In [4]:
def iter_transitions(hon, include_subpaths:bool=True):
    """Yield the stored entries of a HigherOrderNetwork's transition matrix.

    Every yielded item is ``(start, next_node, prob)``:

    * ``start``     -- tuple of node names of the entry's index-array node
    * ``next_node`` -- last node name of the entry's ``indptr``-segment node
    * ``prob``      -- the value stored in the matrix for that entry

    The matrix returned by ``hon.transition_matrix(include_subpaths)`` is walked
    through its ``indptr`` / ``indices`` / ``data`` arrays, one ``indptr`` segment per
    value in ``range(shape[0])`` (presumably a CSR matrix -- TODO confirm against the
    pathpy version in use).
    """
    matrix = hon.transition_matrix(include_subpaths)
    if hon.order == 1:
        # first-order names are taken as they are, wrapped into 1-tuples
        labels = [(name,) for name in hon.nodes.keys()]
    else:
        # higher-order names are split into their parts on the separator
        labels = [tuple(name.split(hon.separator)) for name in hon.nodes.keys()]
    segments = zip(matrix.indptr[:-1], matrix.indptr[1:])
    for segment, (first, stop) in enumerate(segments):
        for other, prob in zip(matrix.indices[first:stop], matrix.data[first:stop]):
            yield labels[other], labels[segment][-1], prob

In [5]:
# orig: Calc_q.ipynb
def calc_q(net, C, weighted=False, **kwargs):
    """Score how well the node classes in ``C`` match the links of ``net``.

    Parameters
    ----------
    net : object with a ``nodes`` mapping (keyed by node name) and an
        ``adjacency_matrix(weighted=..., **kwargs)`` method returning a compressed
        sparse matrix (``indptr`` / ``indices`` / ``data``).
    C : mapping from node name to class label.
    weighted : forwarded to ``net.adjacency_matrix``.
    **kwargs : forwarded unchanged to ``net.adjacency_matrix``.

    Returns
    -------
    (q, q_max)
        ``q`` is the share of the total weight that links two nodes of the same class,
        minus ``q_exp``, the sum over classes of (row weight * column weight) divided
        by the squared total weight. ``q_max`` is ``1 - q_exp``.

    The ``indptr`` segments are treated as columns (CSC layout, as noted in the
    original code).
    """
    names = list(net.nodes.keys())
    adj = net.adjacency_matrix(weighted=weighted, **kwargs)
    total_weight = 0
    row_class_weight = defaultdict(float)
    col_class_weight = defaultdict(float)
    same_class_weight = 0
    for col in range(adj.shape[1]):
        col_class = C[names[col]]
        first, stop = adj.indptr[col], adj.indptr[col + 1]
        for row, weight in zip(adj.indices[first:stop], adj.data[first:stop]):
            row_class = C[names[row]]
            total_weight += weight
            row_class_weight[row_class] += weight
            col_class_weight[col_class] += weight
            if col_class == row_class:
                same_class_weight += weight
    # expected same-class share if row and column classes were independent
    expected = sum(
        weight * col_class_weight[label] for label, weight in row_class_weight.items()
    ) / (total_weight**2)
    return (same_class_weight / total_weight - expected, 1 - expected)

In [6]:
def export_rules(filename, paths, config, metadata, max_order:int=1, replace_space=False, include_subpaths=True):
    """Export the transition probabilities (rules) up to the estimated order.

    Two files are written:

    * ``filename`` -- for the first-order layer, one ``=> node value`` line per node
      holding the normalised leading eigenvector of the transition matrix (the
      stationary distribution); then, for every order from 1 up to the estimated
      order, one ``n1 n2 ... => next_node prob`` line per transition.
    * ``<filename without extension>.config`` -- tab-separated ``key<TAB>value`` lines:
      the notebook path, every item of ``config``, the export settings and the
      q / q_max statistics of each exported order.

    Parameters
    ----------
    filename : path of the rules file to write.
    paths : paths object handed to ``pp.MultiOrderModel`` and ``estimate_order``.
    config : dict of settings to record. Must contain an integer ``'delta'``; three
        delta steps are reported as one minute (i.e. 20 seconds per step).
    metadata : dict mapping a first-order node name to its category.
    max_order : maximum order passed to ``pp.MultiOrderModel``.
    replace_space : if true, spaces in node names are written as underscores.
    include_subpaths : forwarded to the transition / adjacency matrix computations.
    """
    mog = pp.MultiOrderModel(paths, max_order)
    estimated_order = mog.estimate_order(paths)
    print('Estimated order:', estimated_order)
    stats = dict()
    with open(filename,'w') as f:
        for order in range(1,estimated_order+1):
            hon = mog.layers[order]
            if order == 1:
                print('Exporting stationary distribution')
                probs = hon.transition_matrix(include_subpaths)
                # eigenvector of the largest-magnitude eigenvalue, scaled to sum to 1
                _,ev = sla.eigs(probs, k=1, which='LM')
                ev = np.abs(ev).flatten()
                ev_sum = ev.sum()
                stat_dist = {n:v/ev_sum for n,v in zip(hon.nodes.keys(), ev)}
                for n,v in sorted(stat_dist.items()):
                    # float(): %r of a NumPy scalar is 'np.float64(...)' on NumPy >= 2,
                    # which would corrupt the exported file
                    f.write('=> %s %r\n' % (n, float(v)))
            print('Exporting rules for order',order)
            for start,next_node,prob in sorted(iter_transitions(hon, include_subpaths)):
                if replace_space:
                    next_node = next_node.replace(' ','_')
                    start = tuple( n.replace(' ','_') for n in start)
                f.write(''.join(n + ' ' for n in start))
                f.write('=> %s %r\n' % (next_node, float(prob)))
            # calc q, q_max
            # The category of a (higher-order) node is that of its last first-order node.
            # Split on the layer's own separator (as iter_transitions does) instead of a
            # hard-coded ','.
            node2cat = { n:metadata[n.split(hon.separator)[-1]] for n in hon.nodes }
            q, q_max = calc_q(hon, node2cat, weighted=True, include_subpaths=include_subpaths)
            q_ratio = q/q_max
            print(f'order {hon.order}: q={q}, q_max={q_max}, q/q_max={q_ratio}')
            stats.update({ f'q[{hon.order}]': q, f'q_max[{hon.order}]': q_max, f'q/q_max[{hon.order}]': q_ratio })
    config_filename = os.path.splitext(filename)[0]+'.config'
    with open(config_filename,'w') as f:
        f.write('code\t%s\n' % os.path.join(os.getcwd(), 'SocioPatterns.ipynb'))
        for k,v in config.items():
            f.write('%s\t%s\n' % (k,v))
        # delta counts 20-second steps: 3 steps per minute, the remainder in seconds
        delta_min,delta_sec = config['delta'] // 3, config['delta'] % 3 *20
        delta_time = f'{delta_min}:{delta_sec:02d}'
        # human-readable form: whole minutes, seconds only, or m:ss when both are non-zero
        delta_text = '%d min' % delta_min if delta_sec==0 else '%d sec' % delta_sec if delta_min==0 else delta_time
        config_loc = dict(max_order=max_order, estimated_order=estimated_order, replace_space=replace_space, include_subpaths=include_subpaths,
                         delta_time=delta_time, delta_text=delta_text)
        for k,v in config_loc.items():
            f.write('%s\t%s\n' % (k,v))
        for k,v in stats.items():
            f.write('%s\t%s\n' % (k,v))

# Contacts in workplace
Run-time notes: with delta=90 the run finished in less than 6 hrs;
with delta=180 it did not finish within 24 hrs.

In [7]:
# Read the workplace contact list ("t i j" per line) into a temporal network.
filename = os.path.join(work_contacts_dir, 'tij_InVS.dat')
c_workplace = {'source': filename}  # settings later passed to export_rules as config
t_workplace = pp.TemporalNetwork()
with open(filename, 'r') as contacts:
    for record in contacts:
        t, i, j = record.split()
        step = int(t) // 20  # raw timestamp -> 20-unit time step
        # each contact is stored in both directions
        for source, target in ((i, j), (j, i)):
            t_workplace.add_edge(source, target, step)
print(t_workplace.summary())

Nodes:			92
Time-stamped links:	19654
Links/Nodes:		213.6304347826087
Observation period:	[1441, 50822]
Observation length:	 49381 
Time stamps:		 7104 
Avg. inter-event dt:	 6.952132901590877
Min/Max inter-event dt:	 1/11134


In [8]:
# Copy the workplace metadata line by line to the output folder and, on the way,
# collect the node -> category mapping in m_workplace.
metadata_src = os.path.join(work_contacts_dir, 'metadata_InVS13.txt')
metadata_dst = os.path.join(out_dir, 'metadata_workplace.csv')
m_workplace = {}
with open(metadata_src, 'r') as f, open(metadata_dst, 'w') as g:
    for line in f:
        node, category = line.split()
        m_workplace[node] = category
        g.write(line)

In [9]:
# delta is counted in the 20-unit time steps created when the contacts were loaded.
# Alternative settings: delta=15, delta=30, delta=60, delta=90.
extraction_param = {'delta': 3}
c_workplace.update(extraction_param)  # record the setting for the .config export
p_workplace = pp.path_extraction.paths_from_temporal_network_dag(
    t_workplace,
    **extraction_param,
)

2020-11-24 23:38:14 [Severity.INFO]	Constructing time-unfolded DAG ...
2020-11-24 23:38:15 [Severity.INFO]	finished.
Directed Acyclic Graph
Nodes:		40046
Roots:		6090
Leaves:		21223
Links:		58962
Acyclic:	None

2020-11-24 23:38:15 [Severity.INFO]	Generating causal trees for 6090 root nodes ...
2020-11-24 23:38:16 [Severity.INFO]	Analyzing tree 609/6090 ...
2020-11-24 23:38:16 [Severity.INFO]	Analyzing tree 1218/6090 ...
2020-11-24 23:38:17 [Severity.INFO]	Analyzing tree 1827/6090 ...
2020-11-24 23:38:17 [Severity.INFO]	Analyzing tree 2436/6090 ...
2020-11-24 23:38:17 [Severity.INFO]	Analyzing tree 3045/6090 ...
2020-11-24 23:38:18 [Severity.INFO]	Analyzing tree 3654/6090 ...
2020-11-24 23:38:18 [Severity.INFO]	Analyzing tree 4263/6090 ...
2020-11-24 23:38:23 [Severity.INFO]	Analyzing tree 4872/6090 ...
2020-11-24 23:38:24 [Severity.INFO]	Analyzing tree 5481/6090 ...
2020-11-24 23:38:24 [Severity.INFO]	Analyzing tree 6090/6090 ...
2020-11-24 23:38:24 [Severity.INFO]	finished.


In [10]:
# Build the multi-order model (max_order=4) for the workplace paths and write the rules
# file 'workplace_<delta>.csv' plus its .config companion; %time reports the duration.
%time export_rules(os.path.join(out_dir, 'workplace_%d.csv' % extraction_param['delta']), p_workplace, c_workplace, m_workplace, max_order=4) # estimated_order=2

2020-11-24 23:38:24 [Severity.INFO]	Generating 0-th order layer ...
2020-11-24 23:38:24 [Severity.INFO]	Generating 1-th order layer ...
2020-11-24 23:38:24 [Severity.INFO]	Generating 2-th order layer ...
2020-11-24 23:38:24 [Severity.INFO]	Generating 3-th order layer ...
2020-11-24 23:38:25 [Severity.INFO]	Generating 4-th order layer ...
2020-11-24 23:38:33 [Severity.INFO]	finished.
2020-11-24 23:38:37 [Severity.INFO]	Likelihood ratio test for K_opt = 2, x = 99053.05568314757
2020-11-24 23:38:37 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 29924
2020-11-24 23:38:37 [Severity.INFO]	Likelihood ratio test, p = 0.0
2020-11-24 23:38:40 [Severity.INFO]	Likelihood ratio test for K_opt = 3, x = 10965.6320084834
2020-11-24 23:38:40 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 591247
2020-11-24 23:38:40 [Severity.INFO]	Likelihood ratio test, p = 1.0
2020-11-24 23:38:45 [Severity.INFO]	Likelihood ratio test for K_opt = 4, x = 5464.151524489163
2020-11-24 23:38:45 [Severity.INFO]	Likelihoo

In [11]:
# Drop the reference to the extracted workplace paths so their memory can be reclaimed.
del p_workplace

# Primary school
Unable to extract paths for delta=2 (on a PC with 16GB RAM)

In [12]:
# Read the primary school contacts ("t i j Ci Cj" per line) into a temporal network
# and collect each node's class label from the same records.
filename = os.path.join(primary_school_dir, 'primaryschool.csv')
c_primaryschool = {'source': filename}  # settings later passed to export_rules as config
t_primaryschool = pp.TemporalNetwork()
m_primaryschool = {}  # metadata: node -> class label
with open(filename, 'r') as contacts:
    for record in contacts:
        t, i, j, Ci, Cj = record.split()
        step = int(t) // 20  # raw timestamp -> 20-unit time step
        # each contact is stored in both directions
        for source, target in ((i, j), (j, i)):
            t_primaryschool.add_edge(source, target, step)
        m_primaryschool[i] = Ci
        m_primaryschool[j] = Cj
print(t_primaryschool.summary())

Nodes:			242
Time-stamped links:	251546
Links/Nodes:		1039.4462809917356
Observation period:	[1561, 7406]
Observation length:	 5845 
Time stamps:		 3100 
Avg. inter-event dt:	 1.8860922878347854
Min/Max inter-event dt:	 1/2747


In [13]:
# Export the metadata: one "node<TAB>class" line per node, sorted by node name.
with open(os.path.join(out_dir, 'metadata_primaryschool.csv'), 'w') as g:
    g.writelines('%s\t%s\n' % entry for entry in sorted(m_primaryschool.items()))

In [14]:
# Resource notes from earlier runs:
#   delta=1 took 2 hours
#   delta=2 uses > 50GB RAM
extraction_param = {'delta': 1}
c_primaryschool.update(extraction_param)  # record the setting for the .config export
p_primaryschool = pp.path_extraction.paths_from_temporal_network_dag(
    t_primaryschool,
    **extraction_param,
)

2020-11-24 23:38:54 [Severity.INFO]	Constructing time-unfolded DAG ...
2020-11-24 23:38:58 [Severity.INFO]	finished.
Directed Acyclic Graph
Nodes:		240014
Roots:		65218
Leaves:		65218
Links:		251546
Acyclic:	None

2020-11-24 23:38:58 [Severity.INFO]	Generating causal trees for 65218 root nodes ...
2020-11-25 02:08:45 [Severity.INFO]	finished.


In [15]:
%time export_rules(os.path.join(out_dir, 'primaryschool_%d.csv') % extraction_param['delta'], p_primaryschool, c_primaryschool, m_primaryschool, max_order=3)

2020-11-25 02:08:45 [Severity.INFO]	Generating 0-th order layer ...
2020-11-25 02:08:45 [Severity.INFO]	Generating 1-th order layer ...
2020-11-25 02:08:50 [Severity.INFO]	Generating 2-th order layer ...
2020-11-25 02:08:59 [Severity.INFO]	Generating 3-th order layer ...
2020-11-25 02:09:49 [Severity.INFO]	finished.
2020-11-25 02:46:00 [Severity.INFO]	Likelihood ratio test for K_opt = 2, x = 79586108.4486756
2020-11-25 02:46:00 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 1303716
2020-11-25 02:46:00 [Severity.INFO]	Likelihood ratio test, p = 0.0
2020-11-25 03:23:25 [Severity.INFO]	Likelihood ratio test for K_opt = 3, x = 27330624.334839195
2020-11-25 03:23:25 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 103903081
2020-11-25 03:23:25 [Severity.INFO]	Likelihood ratio test, p = 1.0
Estimated order: 2
Exporting stationary distribution
Exporting rules for order 1
order 1: q=0.44758433082597404, q_max=0.8637274056458426, q/q_max=0.5182009137377062
Exporting rules for order 2
order 2:

In [17]:
# Drop the reference to the extracted primary school paths so their memory can be reclaimed.
del p_primaryschool

# Hospital ward
delta=3 took 12 hrs to generate paths and 5 hrs to export rules

In [18]:
# Read the hospital ward contacts ("t i j Si Sj" per line) into a temporal network
# and collect each node's label (Si / Sj) from the same records.
filename = os.path.join(hospital_ward_dir, 'detailed_list_of_contacts_Hospital.dat')
c_hospitalward = {'source': filename}  # settings later passed to export_rules as config
t_hospitalward = pp.TemporalNetwork()
m_hospitalward = {}  # metadata: node -> label
with open(filename, 'r') as contacts:
    for record in contacts:
        t, i, j, Si, Sj = record.split()
        step = int(t) // 20  # raw timestamp -> 20-unit time step
        # each contact is stored in both directions
        for source, target in ((i, j), (j, i)):
            t_hospitalward.add_edge(source, target, step)
        m_hospitalward[i] = Si
        m_hospitalward[j] = Sj
print(t_hospitalward.summary())

Nodes:			75
Time-stamped links:	64848
Links/Nodes:		864.64
Observation period:	[7, 17382]
Observation length:	 17375 
Time stamps:		 9453 
Avg. inter-event dt:	 1.838235294117647
Min/Max inter-event dt:	 1/1349


In [19]:
# Export the metadata: one "node<TAB>label" line per node, sorted by node name.
with open(os.path.join(out_dir, 'metadata_hospital.csv'), 'w') as g:
    g.writelines('%s\t%s\n' % entry for entry in sorted(m_hospitalward.items()))

In [20]:
# delta is counted in the 20-unit time steps created when the contacts were loaded.
# Alternative settings: delta=2, delta=3.
extraction_param = {'delta': 1}
c_hospitalward.update(extraction_param)  # record the setting for the .config export
p_hospitalward = pp.path_extraction.paths_from_temporal_network_dag(
    t_hospitalward,
    **extraction_param,
)

2020-11-25 03:23:43 [Severity.INFO]	Constructing time-unfolded DAG ...
2020-11-25 03:23:44 [Severity.INFO]	finished.
Directed Acyclic Graph
Nodes:		66319
Roots:		15674
Leaves:		15674
Links:		64848
Acyclic:	None

2020-11-25 03:23:44 [Severity.INFO]	Generating causal trees for 15674 root nodes ...
2020-11-25 03:27:03 [Severity.INFO]	finished.


In [21]:
# Build the multi-order model (max_order=4) for the hospital ward paths and write the
# rules file 'hospital_<delta>.csv' plus its .config companion; %time reports the duration.
%time export_rules(os.path.join(out_dir, 'hospital_%d.csv' % extraction_param['delta']), p_hospitalward, c_hospitalward, m_hospitalward, max_order=4)

2020-11-25 03:27:03 [Severity.INFO]	Generating 0-th order layer ...
2020-11-25 03:27:03 [Severity.INFO]	Generating 1-th order layer ...
2020-11-25 03:27:03 [Severity.INFO]	Generating 2-th order layer ...
2020-11-25 03:27:04 [Severity.INFO]	Generating 3-th order layer ...
2020-11-25 03:27:07 [Severity.INFO]	Generating 4-th order layer ...
2020-11-25 03:28:20 [Severity.INFO]	finished.
2020-11-25 03:29:06 [Severity.INFO]	Likelihood ratio test for K_opt = 2, x = 1539106.6271252963
2020-11-25 03:29:06 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 85573
2020-11-25 03:29:06 [Severity.INFO]	Likelihood ratio test, p = 0.0
2020-11-25 03:29:51 [Severity.INFO]	Likelihood ratio test for K_opt = 3, x = 493542.75202822825
2020-11-25 03:29:51 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 3145815
2020-11-25 03:29:51 [Severity.INFO]	Likelihood ratio test, p = 1.0
2020-11-25 03:30:45 [Severity.INFO]	Likelihood ratio test for K_opt = 4, x = 319552.03792457865
2020-11-25 03:30:45 [Severity.INFO]	Like

In [22]:
# Drop the reference to the extracted hospital ward paths so their memory can be reclaimed.
del p_hospitalward