# Create Hazard Tables
This `Python` notebook creates hazard tables to use in cox regression. It creates one hazard table per block (so there is a matching data json file). 

In [1]:
%pylab inline
from helpers import retrace
import json
import copy
import pandas as pd
from datetime import datetime
import glob

Populating the interactive namespace from numpy and matplotlib


In [2]:
def fast_n_triangle_paths(M, edge):
    """ Fast check for triangle closing rule"""
    try:
        from_neighbors = set(M[edge[0]])  # if concept 0 not in network, false
        to_neighbors = set(M[edge[1]])  # if concept 1 not in network, false
        return len(from_neighbors & to_neighbors)  # closes number of existing paths
    except:
        return 0


def instantaneous_hazard_factors(game, g, t):
    """
    Given the state of the game, what are the factors influencing the hazard of adopting all beliefs?

    Returns a row for the hazard factors of all individuals for adopting all beliefs 
    given the current state of the game. Gets big fast.
    """
    prompt_nodes = set(game['data.clues']['tclue_1_2']['nodes'])

    rows = []
    for player_id, player_data in game['players'].items():

        pos = player_data['data.position']
        M = g.nodes()[player_id]['M']  # promising leads (memory)
        F = g.nodes()[player_id]['F']  # dead ends (forgetory)

        for clue_id, clue_data in game['data.clues'].items():
            if clue_id[0] == pos[0]: # only collect treatment clues for treatment players & vice versa
                
                nodes = set(clue_data['nodes'])
                row = {
                    'start': t,
                    
                    'exposure_id': '%s_%s'%(player_id, clue_id),  # to group all rows corresponding to same exposure 
                    'player_id': player_id,  # player random effect
                    'game_id': game['_id'],  # game random effect
    
                    'is_treatment_condition': pos.startswith('t'),
                    'is_spoke': len(nodes.intersection(prompt_nodes)) == 1,
                    'is_link_or_spur': len(nodes.intersection(prompt_nodes)) == 0,
                    'is_prompt': nodes == prompt_nodes,
                    'is_in_leads': M.has_edge(*nodes),
                    'is_in_deads': F.has_edge(*nodes),
                    
                    'n_exposures': sum([g.nodes()[nid]['M'].has_edge(*nodes) for nid in g.neighbors(player_id)]),  # number of neighbors exposing

                    # number of beliefs already adopted
                    'n_existing_leads': M.number_of_edges(),
                    
                    # dummies for the time block
                    'in_startup':    t<30,   # reading newly available clues
                    'in_peak':   30<=t<180,  # most active time
                    'in_tail':  180<=t<420,  # less active time
                    'in_close': 420<=t,      # last minute, running out of time
                    
                    # number of connections by any clue to any of the rim nodes
                    'n_rim_connections': sum([v for k,v in M.degree(nodes-prompt_nodes)]),  # includes the current clue, if it exists
                    
                    # number of triangle paths
                    'n_triangle_paths': fast_n_triangle_paths(M, clue_data['nodes']),
                    
                    # number of beliefs that the player has that are also in the exposers' leads
                    'n_shared_edges': len({
                        edge for nid in g.neighbors(player_id) 
                        if g.nodes()[nid]['M'].has_edge(*nodes) 
                        for edge in g.nodes()[nid]['M'].edges()
                    }.intersection({edge for edge in M.edges()})), 
                }
                row['is_link'] = row['is_link_or_spur'] & row['is_treatment_condition']
                row['is_spur'] = row['is_link_or_spur'] & ~row['is_treatment_condition']                
                rows.append(row)

    return rows


def process_exposure_group(group, t_last):
    """
    Groups represent player-clue combinations, or unique "exposure" possibilities
    Takes the hazard table and creates a table that lifelines can use.
    1. Condenses multiple rows (by dropping duplicates)
    2. Treats start and end times
    3. Identifies adoption events
    """
    
    # discard player-clue groups where the player is never exposed to the clue
    if max(group['n_exposures']) == 0: 
        return pd.DataFrame()
    
    # check that group is sorted
    group.sort_values(['start'], inplace=True)
    
    # drop consecutive duplicate rows (ie, nothing changes w.r.t. the adoption factors)
    match_on_cols = set(group.columns) - {'start'}
    keep_rows = (group[match_on_cols].shift() != group[match_on_cols]).any(axis=1)
    group = group.loc[keep_rows]

    # identify exposures where the player is exposed at start
    # the player may react differently to these than others
    group['is_exposed_t0'] = (group[group['start']<3]['n_exposures'] > 0).any()
    
    # identify clues the player holds at start
    group['is_held_t0'] = (group[group['start']<3]['is_in_leads']).any()

    # add "stop" column
    group['stop'] = group['start'].shift(-1)
    group.loc[group.index[-1], 'stop'] = t_last
    
    # identify "adopt" events 
    # ie. the row period ends with an adoption change
    group['adopt_event'] = group['is_in_leads'] < group.shift(-1)['is_in_leads']
    group.loc[group.index[-1], 'adopt_event'] = False

    # identify "forget" events
    group['forget_event'] = group['is_in_leads'] > group.shift(-1)['is_in_leads']
    group.loc[group.index[-1], 'forget_event'] = False

    return group

In [3]:
def process_block(block_file):
    with open(block_file, 'r') as f:
        block = json.load(f)

    block_collector = []
    for name, game in block.items():
        # game level constant calculations
        t_final = datetime.strptime(game['finishedAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
        t_start = datetime.strptime(game['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
        total_time = (t_final-t_start).total_seconds()
        
        # compute instantaneous hazard factors at each state change
        hazard_factors_collector = []
        for (active_player_id, g, t) in retrace(game):
            hazard_factors_collector += instantaneous_hazard_factors(game, g, t)
        hazard_factors = pd.DataFrame(hazard_factors_collector)
        
        # compute condensed hazard table
        hazard_table_collector = []
        for i, (eid, group) in enumerate(hazard_factors.groupby('exposure_id')):
            hazard_table_collector.append(process_exposure_group(group, total_time))
        hazard_table = pd.concat(hazard_table_collector)
        hazard_table['is_caveman_game'] = 'caveman' in game['data.gameSetupId']
        
        block_collector.append(hazard_table)
    
    # assemble all games in block into single hazard table
    block_hazard_table = pd.concat(block_collector)
    
    # force boolean types to numeric
    block_hazard_table *= 1
    
    # write to file
    block_hazard_table_file = block_file.replace('.json', '_hazards.csv')
    block_hazard_table.to_csv(block_hazard_table_file)

    

In [4]:
output_dir = "../results-anonymized/experiment/"
files = glob.glob(output_dir+'block_*.json')
files

['../results-anonymized/experiment/block_9.json',
 '../results-anonymized/experiment/block_16.json',
 '../results-anonymized/experiment/block_20.json',
 '../results-anonymized/experiment/block_5.json',
 '../results-anonymized/experiment/block_4.json',
 '../results-anonymized/experiment/block_21.json',
 '../results-anonymized/experiment/block_17.json',
 '../results-anonymized/experiment/block_8.json',
 '../results-anonymized/experiment/block_26.json',
 '../results-anonymized/experiment/block_3.json',
 '../results-anonymized/experiment/block_10.json',
 '../results-anonymized/experiment/block_11.json',
 '../results-anonymized/experiment/block_2.json',
 '../results-anonymized/experiment/block_27.json',
 '../results-anonymized/experiment/block_1.json',
 '../results-anonymized/experiment/block_24.json',
 '../results-anonymized/experiment/block_12.json',
 '../results-anonymized/experiment/block_28.json',
 '../results-anonymized/experiment/block_29.json',
 '../results-anonymized/experiment/blo

In [6]:
from os import path

for file in files:
    if not path.exists(file.replace('.json', '_hazards.csv')):
        process_block(file)
        print(file+" complete")
    else:
        print(file+" already processed")

../results-anonymized/experiment/block_9.json already processed
../results-anonymized/experiment/block_16.json already processed
../results-anonymized/experiment/block_20.json already processed
../results-anonymized/experiment/block_5.json already processed
../results-anonymized/experiment/block_4.json already processed
../results-anonymized/experiment/block_21.json already processed
../results-anonymized/experiment/block_17.json already processed
../results-anonymized/experiment/block_8.json already processed
['Bennet', "in their early 20's"] no longer in source 363Wtc9W7pHpCXn2n
['Mitchell', 'a white Toyota Avalon'] no longer in source pxdEnxkB8c6zGXbot
['a blue long sleeve shirt', 'a journalist uncovering a story'] no longer in source ktfhRNJT5GdPwZ6bk


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A val

['a white Toyota Avalon', 'a broken axle'] no longer in source 7Z9tEGZ4mfgQEH5uR
../results-anonymized/experiment/block_26.json complete
../results-anonymized/experiment/block_3.json already processed
../results-anonymized/experiment/block_10.json already processed
../results-anonymized/experiment/block_11.json already processed
../results-anonymized/experiment/block_2.json already processed
../results-anonymized/experiment/block_27.json complete
../results-anonymized/experiment/block_1.json already processed
['Mills', 'the bracelet'] no longer in source XYDJ6XdbJrK3SoQXC
['Roberts', 'a blue Chevrolet Corvette'] no longer in source 4nBtzav8tLCvp4nWd
../results-anonymized/experiment/block_24.json complete
../results-anonymized/experiment/block_12.json already processed
['Bennet', 'a bear'] no longer in source Y7Pt2rrYEaPeDWTS3
['a short man', 'a broken arm'] no longer in source GWFShkhDJWncK4bQo
['a partially-bald man', 'the Dalhoff Estate'] no longer in source QHZcFHi4iKZp65jFr
['a set

In [6]:
!say "analysis complete"