In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import datetime
from datetime import datetime, timedelta
from tools import *
import os

# Function Definitions

In [2]:
def downsample_dataset(contacts, co_locations, dataset):
    """Coarsen the ``time`` column of both frames into rounded bin indices.

    For ``dataset == "primary_school"`` times are divided by 600 and the
    co-location times are first shifted back by 3600; every other dataset is
    divided by 3600 with no shift.  Quotients are rounded with ``np.round``.
    (The raw times are presumably seconds, i.e. 10-minute and 1-hour bins --
    TODO confirm against the input files.)

    Both frames have their ``time`` column overwritten in place and are also
    returned as ``(contacts, co_locations)``.
    """
    is_primary_school = dataset == "primary_school"
    bin_width = 600 if is_primary_school else 3600
    co_location_shift = 3600 if is_primary_school else 0

    contacts["time"] = contacts["time"].map(
        lambda stamp: np.round(stamp / bin_width)
    )
    co_locations["time"] = co_locations["time"].map(
        lambda stamp: np.round((stamp - co_location_shift) / bin_width)
    )
    return contacts, co_locations

def relabel_dataset(contacts, co_locations, labels):
    """Re-index node ids and community labels as consecutive integers.

    Expected columns:
      - contacts and co_locations: ['time', 'from', 'to']
      - labels: ['node', 'community']

    Node ids are numbered 0, 1, ... in order of first appearance in
    ``labels["node"]``; community labels are numbered the same way from
    ``labels["community"]``.

    ``contacts`` and ``co_locations`` have their 'from'/'to' columns
    overwritten in place (and are returned); the columns of the passed
    ``labels`` frame are overwritten in place as well, while the returned
    ``labels`` is a new frame with a fresh 0..n-1 index.

    Raises:
        KeyError: if contacts or co_locations reference a node that does not
            occur in ``labels["node"]``.
    """
    # apply node mapping
    node_mapping = {node: i for i, node in enumerate(labels["node"].unique())}
    # Series.map is applied column by column because DataFrame.applymap is
    # deprecated in recent pandas releases; the lambda (rather than passing
    # the dict itself) keeps the KeyError on unknown nodes instead of
    # silently producing NaN.
    for frame in (contacts, co_locations):
        for column in ("from", "to"):
            frame[column] = frame[column].map(lambda node: node_mapping[node])
    labels["node"] = labels["node"].map(lambda node: node_mapping[node])

    # apply label mapping
    label_mapping = {label: i for i, label in enumerate(labels["community"].unique())}
    labels["community"] = labels["community"].map(lambda label: label_mapping[label])

    # drop=True discards the old index directly, whether or not it is named
    labels = labels.reset_index(drop=True)
    return contacts, co_locations, labels

def filter_dataset(contacts, co_locations, labels):
    """Restrict the three frames to a mutually consistent set of nodes/times.

    Steps, in order:
      1. drop contacts that involve a node missing from ``labels``;
      2. drop co-locations that involve a node missing from the (filtered)
         contacts;
      3. drop co-locations whose time does not occur in the contacts;
      4. drop labels of nodes that no longer occur in the contacts.

    Expected columns: ['time', 'from', 'to'] for contacts/co_locations and
    ['node', ...] for labels.  Rows are removed by index label, so kept rows
    retain their original index.  Diagnostic sets are printed along the way.

    Returns:
        (contacts, co_locations, labels) after filtering.

    Raises:
        AssertionError: if the node sets of contacts and labels still differ
            after filtering.
    """
    def _node_set(frame):
        # every node id appearing as either endpoint
        return set(frame["from"]).union(frame["to"])

    nodes_labels = set(labels["node"])

    # filter contacts based on nodes in labels
    excluded_nodes = _node_set(contacts).difference(nodes_labels)
    print("excluded nodes: ", excluded_nodes)
    if excluded_nodes:
        touches_excluded = contacts["from"].isin(excluded_nodes) | contacts["to"].isin(excluded_nodes)
        contacts = contacts.drop(index=contacts.index[touches_excluded])

    # The node set must be taken AFTER the contacts were filtered: removing
    # rows can eliminate further nodes (e.g. one whose only partner was
    # unlabelled), and the steps below have to see that.
    nodes_contacts = _node_set(contacts)

    # filter co_locations based on nodes in contacts
    excluded_nodes = _node_set(co_locations).difference(nodes_contacts)
    print("excluded nodes: ", excluded_nodes)
    if excluded_nodes:
        touches_excluded = co_locations["from"].isin(excluded_nodes) | co_locations["to"].isin(excluded_nodes)
        co_locations = co_locations.drop(index=co_locations.index[touches_excluded])

    # filter co_locations based on times in contacts
    excluded_times = set(co_locations["time"]).difference(contacts["time"])
    print("excluded times: ", excluded_times)
    if excluded_times:
        at_excluded_time = co_locations["time"].isin(excluded_times)
        co_locations = co_locations.drop(index=co_locations.index[at_excluded_time])

    print("times in contacts but not in co_locations: ", set(contacts["time"]).difference(co_locations["time"]))

    # filter labels based on nodes in contacts
    strangers = nodes_labels.difference(nodes_contacts)
    if strangers:
        is_stranger = labels["node"].isin(strangers)
        labels = labels.drop(index=labels.index[is_stranger])

    nodes_labels = set(labels["node"])
    assert nodes_contacts == nodes_labels
    # Co-locations are only guaranteed to be a subset of the contact nodes;
    # equality with the co-location node set is deliberately not asserted.
    return contacts, co_locations, labels

def create_data(contacts, T=None, N=None):
    """Build the symmetric contact-count tensor of shape (T, N, N).

    Time slices follow the order in which distinct values first appear in
    ``contacts["time"]``.  For every slice, entry (i, j) is the larger of the
    number of rows i->j and the number of rows j->i at that time; the
    diagonal is set to zero.

    Args:
        contacts: frame with columns ['time', 'from', 'to']; 'from'/'to' must
            hold integer node ids usable as array indices.
        T: number of time slices.  Defaults to the number of distinct values
            in ``contacts["time"]``; rows belonging to a time index >= T are
            ignored.
        N: number of nodes.  Defaults to the largest node id in
            'from'/'to' plus one.

    Returns:
        float ndarray of shape (T, N, N).
    """
    times = contacts["time"].unique()
    if T is None:
        T = times.size
    if N is None:
        N = int(max(contacts["from"].max(), contacts["to"].max())) + 1 if len(contacts) else 0

    # position of every row's time stamp on the time axis (-1 if unknown)
    t_idx = pd.Index(times).get_indexer(contacts["time"])
    keep = (t_idx >= 0) & (t_idx < T) & contacts["time"].notna().to_numpy()
    src = contacts["from"].to_numpy()[keep].astype(np.intp)
    dst = contacts["to"].to_numpy()[keep].astype(np.intp)

    data = np.zeros((T, N, N))
    # np.add.at accumulates repeated (t, i, j) triples, i.e. counts rows
    np.add.at(data, (t_idx[keep], src, dst), 1)

    # make the data symmetric and set diagonal to zero
    data = np.maximum(data, data.transpose(0, 2, 1))
    diag = np.arange(N)
    data[:, diag, diag] = 0

    return data

def create_Z(co_locations, data, times=None):
    """Derive the two exposure variables from co-locations and contact data.

    Args:
        co_locations: frame with columns ['time', 'from', 'to']; node ids must
            be valid indices into ``data``.
        data: array of shape (T, N, N) holding the contact counts.
        times: ordered, distinct time stamps defining the time axis of
            ``data`` (index t <-> ``times[t]``).  Co-location rows whose time
            is not among the first T entries are ignored.  When omitted, the
            function falls back to the notebook-level ``contacts`` frame
            (``contacts["time"].unique()``) for backward compatibility.

    Returns:
        (Z_heaviside, Z_markov)
          - Z_heaviside: integer (N, N) array with, per pair, the first time
            index at which a co-location or a contact is observed (T if
            neither ever is); symmetric (pairwise minimum), zero diagonal.
          - Z_markov: boolean (T, N, N) array, True where a co-location or a
            contact is observed at that time; symmetric, False diagonal.
    """
    T, N = data.shape[0], data.shape[1]
    print(f"T: {T}, N: {N}")
    if times is None:
        # legacy behaviour: time axis taken from the global contacts frame
        times = contacts["time"].unique()

    # position of every co-location row on the time axis (-1 if unknown)
    t_idx = pd.Index(times).unique().get_indexer(co_locations["time"])
    keep = (t_idx >= 0) & (t_idx < T) & co_locations["time"].notna().to_numpy()
    src = co_locations["from"].to_numpy()[keep].astype(np.intp)
    dst = co_locations["to"].to_numpy()[keep].astype(np.intp)

    exposed = np.zeros((T, N, N), dtype=bool)
    exposed[t_idx[keep], src, dst] = True

    # create Z for markov inference;
    # ensure that exposure is always 1 if an interaction is observed
    Z_markov = exposed | (data > 0)

    # make Z symmetric and set diagonal to zero
    Z_markov = Z_markov | Z_markov.transpose(0, 2, 1)
    diag = np.arange(N)
    Z_markov[:, diag, diag] = False

    # create Z for heaviside inference: an all-True sentinel slice appended
    # at index T makes argmax return T for pairs that are never observed
    sentinel = np.ones((1, N, N), dtype=bool)
    first_exposure = np.concatenate([exposed, sentinel], axis=0).argmax(axis=0)

    # ensure that exposure happens before or at the same time of the first interaction
    first_interaction = np.concatenate([data != 0, sentinel], axis=0).argmax(axis=0)
    Z_heaviside = np.minimum(first_exposure, first_interaction)

    # make Z symmetric and set diagonal to zero
    Z_heaviside = np.minimum(Z_heaviside, Z_heaviside.T)
    np.fill_diagonal(Z_heaviside, 0)

    return Z_heaviside, Z_markov


In [3]:
# Scratch inputs for checking np.maximum: a 3x3 integer grid 0..8 and a 3x3 array of ones.
x = np.arange(9).reshape(3, 3)
y = np.ones(shape=(3, 3))

In [4]:
# Element-wise maximum of the integer grid `x` and the all-ones float array `y`;
# the displayed result is a float array (same operation create_Z applies to its exposure tensors).
np.maximum(x,y)

array([[1., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

# Preprocessing all datasets

In [5]:
# Root folder containing one sub-directory per dataset
# (each with contacts.csv, communities.csv and co-locations.csv).
in_folder = "../../data/input/sociopattern/"
# os.listdir returns entries in arbitrary order and also lists plain files
# (e.g. hidden OS files); keep only directories and sort them so the
# processing order is reproducible.
datasets = sorted(
    entry for entry in os.listdir(in_folder)
    if os.path.isdir(os.path.join(in_folder, entry))
)

In [6]:
# For every dataset folder: load the raw csv files, align and coarsen the
# time stamps, make nodes/times consistent, relabel to consecutive ids, then
# build and store the data tensor, the parameters and the mask.
for dataset in datasets:
    print_bold(dataset)
    dataset_folder = in_folder + dataset + "/"

    contacts = pd.read_csv(dataset_folder + 'contacts.csv', names=["time", "from", "to"], sep=' ')
    labels = pd.read_csv(dataset_folder + 'communities.csv', names=["node", "community"], sep='\t')
    co_locations = pd.read_csv(dataset_folder + 'co-locations.csv', names=["time", "from", "to"], sep=' ')

    # print initial times
    contacts_initial_time = contacts["time"].iloc[0]
    co_locations_initial_time = co_locations["time"].iloc[0]
    print(f"initial time contacts: {contacts_initial_time}, intial time co_locations: {co_locations_initial_time}")

    # For the highschool dataset the contacts carry absolute Unix timestamps
    # while the co-location times are offsets in seconds; the offsets are
    # moved onto the same clock by counting them from 2013-12-02 00:00.
    # NOTE(review): the datetime below is naive, so .timestamp() interprets
    # it in the local timezone of the machine running the notebook and the
    # resulting alignment is machine dependent -- confirm the intended
    # timezone and pin it explicitly.
    if dataset == "highschool":
        co_locations["time"] = co_locations["time"].map(
            lambda x: (datetime(year=2013, month=12, day=2) + timedelta(seconds=x)).timestamp()
        )

    # downsample in time to get a reasonable number of timesteps
    contacts, co_locations = downsample_dataset(contacts, co_locations, dataset)

    contacts, co_locations, labels = filter_dataset(contacts, co_locations, labels)
    # all three results must be rebound: `labels` is returned as a new frame
    contacts, co_locations, labels = relabel_dataset(contacts, co_locations, labels)

    # create latent variables and data tensor
    # (T and N stay module-level names; create_data may read them)
    T = contacts["time"].unique().size
    N = len(labels.index)
    K = len(set(labels["community"]))
    data = create_data(contacts)
    Z_heaviside, Z_markov = create_Z(co_locations, data)

    # one-hot community membership: row i has a 1 in the column of node i's
    # community (labels has a 0..N-1 index after relabelling)
    u = np.zeros((N, K))
    u[np.arange(N), labels["community"].to_numpy()] = 1
    v = u.copy()

    # store results
    write_data(dataset_folder, "data", data)
    np.savez(dataset_folder + "params", u=u, v=v, Z_heaviside=Z_heaviside, Z_markov=Z_markov)

    # create and store masks for five-fold cross-validation
    mask = extract_mask(T, N)
    np.savez(dataset_folder + "mask", mask=mask)
    

[1mworkplace01[0m
initial time contacts: 28820, intial time co_locations: 28820
excluded nodes:  set()
excluded nodes:  {89, 374, 782}
excluded times:  {259.0, 272.0, 19.0, 20.0, 283.0, 284.0, 43.0, 44.0, 56.0, 187.0, 188.0, 68.0, 200.0, 80.0, 212.0, 92.0, 104.0, 236.0, 115.0, 116.0}
times in contacts but not in co_locations:  set()
T: 109, N: 92
Adjacency matrix saved in: ../../data/input/sociopattern/workplace01/data.csv
[1mhighschool[0m
initial time contacts: 1385982020, intial time co_locations: 29960
excluded nodes:  set()
excluded nodes:  {2}
excluded times:  {384992.0, 385025.0, 384993.0, 384994.0, 385097.0, 385001.0, 385073.0, 385049.0, 384991.0}
times in contacts but not in co_locations:  set()
T: 46, N: 327
Adjacency matrix saved in: ../../data/input/sociopattern/highschool/data.csv
[1mhospital[0m
initial time contacts: 140, intial time co_locations: 0
excluded nodes:  set()
excluded nodes:  {1632, 1513, 1580, 1518, 1590, 1594}
excluded times:  {34.0, 35.0, 36.0, 37.0, 