# Create Hazard Tables
This notebook creates hazard tables to use in cox regression. It creates one hazard table per block (so there is a matching data json file). 

In [1]:
%pylab inline
from helpers import retrace
import json
import copy
import pandas as pd
from datetime import datetime
import glob

Populating the interactive namespace from numpy and matplotlib


In [6]:
def fast_n_triangle_paths(M, edge):
    """ Fast check for triangle closing rule"""
    try:
        from_neighbors = set(M[edge[0]])  # if concept 0 not in network, false
        to_neighbors = set(M[edge[1]])  # if concept 1 not in network, false
        return len(from_neighbors & to_neighbors)  # closes number of existing paths
    except:
        return 0


def instantaneous_hazard_factors(game, g, t):
    """
    Given the state of the game, what are the factors influencing the hazard of adopting all beliefs?

    Returns a row for the hazard factors of all individuals for adopting all beliefs 
    given the current state of the game. Gets big fast.
    """
    prompt_nodes = set(game['data.clues']['tclue_1_2']['nodes'])

    rows = []
    for player_id, player_data in game['players'].items():

        pos = player_data['data.position']
        M = g.nodes()[player_id]['M']  # promising leads (memory)
        F = g.nodes()[player_id]['F']  # dead ends (forgetory)

        for clue_id, clue_data in game['data.clues'].items():
            if clue_id[0] == pos[0]: # only collect treatment clues for treatment players & vice versa
                
                nodes = set(clue_data['nodes'])
                row = {
                    'start': t,
                    
                    'exposure_id': '%s_%s'%(player_id, clue_id),  # to group all rows corresponding to same exposure 
                    'player_id': player_id,  # player random effect
                    'game_id': game['_id'],  # game random effect
    
                    'is_treatment_condition': pos.startswith('t'),
                    'is_spoke': len(nodes.intersection(prompt_nodes)) == 1,
                    'is_link_or_spur': len(nodes.intersection(prompt_nodes)) == 0,
                    'is_prompt': nodes == prompt_nodes,
                    'is_in_leads': M.has_edge(*nodes),
                    'is_in_deads': F.has_edge(*nodes),
                    
                    'n_exposures': sum([g.nodes()[nid]['M'].has_edge(*nodes) for nid in g.neighbors(player_id)]),  # number of neighbors exposing

                    # number of beliefs already adopted
                    'n_existing_leads': M.number_of_edges(),
                    
#                    'n_fresh_candidates': ... # expensive, and not very influential 
                    # dummies for the time block
                    'in_startup':    t<30,   # reading newly available clues
                    'in_peak':   30<=t<180,  # most active time
                    'in_tail':  180<=t<420,  # less active time
                    'in_close': 420<=t,      # last minute, running out of time
                    
                    # number of connections by any clue to any of the rim nodes
                    'n_rim_connections': sum([v for k,v in M.degree(nodes-prompt_nodes)]),  # includes the current clue, if it exists
                    
                    # number of triangle paths
                    'n_triangle_paths': fast_n_triangle_paths(M, clue_data['nodes']),
                    
                    # number of beliefs that the player has that are also in the exposers' leads
                    'n_edges_shared_with_exposers': len({
                        edge for nid in g.neighbors(player_id) 
                        if g.nodes()[nid]['M'].has_edge(*nodes) 
                        for edge in g.nodes()[nid]['M'].edges()
                    }.intersection({edge for edge in M.edges()})), 
#                    'n_spoke_connections': ... # todo. 
#                    'n_link_or_spur_connections': 
                }
                row['is_link'] = row['is_link_or_spur'] & row['is_treatment_condition']
                row['is_spur'] = row['is_link_or_spur'] & ~row['is_treatment_condition']                
                rows.append(row)

    return rows


def process_exposure_group(group, t_last):
    """
    Groups represent player-clue combinations, or unique "exposure" possibilities
    Takes the hazard table and creates a table that lifelines can use.
    1. Condenses multiple rows (by dropping duplicates)
    2. Treats start and end times
    3. Identifies adoption events
    """
    
    # discard player-clue groups where the player is never exposed to the clue
    if max(group['n_exposures']) == 0: 
        return pd.DataFrame()
    
    # check that group is sorted
    group.sort_values(['start'], inplace=True)
    
    # drop consecutive duplicate rows (ie, nothing changes w.r.t. the adoption factors)
    match_on_cols = set(group.columns) - {'start'}
    keep_rows = (group[match_on_cols].shift() != group[match_on_cols]).any(axis=1)
    group = group.loc[keep_rows]

    # identify exposures where the player is exposed at start
    # the player may react differently to these than others
    group['is_exposed_t0'] = (group[group['start']<3]['n_exposures'] > 0).any()
    
    # identify clues the player holds at start
    group['is_held_t0'] = (group[group['start']<3]['is_in_leads']).any()

    # add "stop" column
    group['stop'] = group['start'].shift(-1)
    group.loc[group.index[-1], 'stop'] = t_last
    
    # identify "adopt" events 
    # ie. the row period ends with an adoption change
    group['adopt_event'] = group['is_in_leads'] < group.shift(-1)['is_in_leads']
    group.loc[group.index[-1], 'adopt_event'] = False

    # identify "forget" events
    group['forget_event'] = group['is_in_leads'] > group.shift(-1)['is_in_leads']
    group.loc[group.index[-1], 'forget_event'] = False

    return group

In [7]:
def process_block(block_file):
    with open(block_file, 'r') as f:
        block = json.load(f)

    block_collector = []
    for name, game in block.items():
        # game level constant calculations
        t_final = datetime.strptime(game['finishedAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
        t_start = datetime.strptime(game['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
        total_time = (t_final-t_start).total_seconds()
        
        # compute instantaneous hazard factors at each state change
        hazard_factors_collector = []
        for (active_player_id, g, t) in retrace(game):
            hazard_factors_collector += instantaneous_hazard_factors(game, g, t)
        hazard_factors = pd.DataFrame(hazard_factors_collector)
        
        # compute condensed hazard table
        hazard_table_collector = []
        for i, (eid, group) in enumerate(hazard_factors.groupby('exposure_id')):
            hazard_table_collector.append(process_exposure_group(group, total_time))
        hazard_table = pd.concat(hazard_table_collector)
        hazard_table['is_caveman_game'] = 'caveman' in game['data.gameSetupId']
        
        block_collector.append(hazard_table)
    
    # assemble all games in block into single hazard table
    block_hazard_table = pd.concat(block_collector)
    
    # force boolean types to numeric
    block_hazard_table *= 1
    
    # write to file
    block_hazard_table_file = block_file.replace('.json', '_hazards.csv')
    block_hazard_table.to_csv(block_hazard_table_file)

    

In [8]:
output_dir = "../results-anonymized/pilot/"
files = glob.glob(output_dir+'block_*_pilot.json')
files

['../results-anonymized/pilot/block_20200505_pilot.json',
 '../results-anonymized/pilot/block_20200507_pilot.json',
 '../results-anonymized/pilot/block_20200624_pilot.json',
 '../results-anonymized/pilot/block_20200626_pilot.json',
 '../results-anonymized/pilot/block_20200506_pilot.json']

In [9]:
for file in files:
    process_block(file)
    print(file+" complete")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value i

../results-anonymized/pilot/block_20200505_pilot.json complete
['a blue denim jacket', 'DeRolfe Jewelers'] no longer in source YcXuMSvBGNS8TfwyQ
['Mitchell', 'a blue Honda Fit'] no longer in source 7ZiYuKHheZ6umhAcG
['a blue Honda Fit', 'a broken grill'] no longer in source Ds9n4eazBrNuMFsGq
['the necklace', 'a blowtorch'] no longer in source Ds9n4eazBrNuMFsGq
../results-anonymized/pilot/block_20200507_pilot.json complete
../results-anonymized/pilot/block_20200624_pilot.json complete
../results-anonymized/pilot/block_20200626_pilot.json complete
['Bennet', 'a pipe cutter'] no longer in source NBRMZjA7bcAyFxKDm
../results-anonymized/pilot/block_20200506_pilot.json complete


In [10]:
!say "analysis complete"

In [6]:
%debug

> [0;32m/Users/jameshoughton/Google Drive/MIT PhD/Factionalism_Research/detective-game-interdependent-diffusion/analysis/helpers.py[0m(64)[0;36mretrace[0;34m()[0m
[0;32m     62 [0;31m    [0;31m# trace game[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     63 [0;31m    [0;31m#t_start = datetime.strptime(game['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 64 [0;31m    [0mround[0m [0;34m=[0m [0;34m[[0m[0mr[0m [0;32mfor[0m [0mr[0m [0;32min[0m [0mgame[0m[0;34m[[0m[0;34m'rounds'[0m[0;34m][0m [0;32mif[0m [0mr[0m[0;34m[[0m[0;34m'name'[0m[0;34m][0m [0;34m==[0m [0;34m'response'[0m[0;34m][0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     65 [0;31m    [0mt_start[0m [0;34m=[0m [0mdatetime[0m[0;34m.[0m[0mstrptime[0m[0;34m([0m[0mround[0m[0;34m[[0m[0;34m'startTimeAt'[0m[0;34m][0m[0;34m,[0m [0;34m'%Y-%m-%dT%H:%M:%S.%fZ'[0m[0;34m)[0m[0;34m[0