# Postprocess and anonymize Empirica export

This `Python` notebook:
- combines the various pieces of information that Empirica exports into a usable structure
- removes Mechanical Turk IDs and URL parameters to anonymize the data
- removes survey questions about fair pay, enough time, and general feedback, as well as the `strength` survey field
- combines games into blocks that are analyzed together (you have to specify which games go in which blocks)
- exports each block to a file in the `results-anonymized` folder

In [1]:
import json
from os import path

## Load Data

In [6]:
# source_dir: folder holding the raw JSONL export (still contains identifying data).
# output_dir: folder that receives the anonymized per-block JSON files.
# Keep the trailing slash on both paths: file names are appended to them in later cells.
# Alternative export, kept here to make switching datasets easy:
#source_dir = "../results-sensitive/pilot/20200626/jsonl/"
source_dir = "../results-sensitive/pilot/20200507/jsonl/"
output_dir = "../results-anonymized/pilot/"


In [7]:
# Top-level fields stripped from every player record to anonymize it
# (per the notebook intro: the Mechanical Turk ID and the URL parameters).
IDENTIFYING_FIELDS = ('id', 'urlParams')
# Exit-survey answers that are dropped before the data is shared.
REMOVED_SURVEY_FIELDS = ('strength', 'fair', 'feedback', 'time')

players = []
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open(path.join(source_dir, 'players.jsonl'), 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file

        player_data = json.loads(line)

        # remove identifying information; the default keeps a record that
        # lacks one of the fields from aborting the whole run with a KeyError
        for field in IDENTIFYING_FIELDS:
            player_data.pop(field, None)

        # Not every player has a survey, and a survey may be only partly filled in.
        survey = player_data.get('data.survey')
        if isinstance(survey, dict):
            for field in REMOVED_SURVEY_FIELDS:
                survey.pop(field, None)

        # Skip players whose exit status is 'gameFull'; players without an
        # exit status are kept.
        if player_data.get('exitStatus') != 'gameFull':
            players.append(player_data)

def load_jsonl(file_name, directory=None):
    """Read a JSON-lines file and return its records as a list of parsed objects.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``directory`` (e.g. ``'games.jsonl'``).
    directory : str, optional
        Folder to read from. Defaults to the notebook-level ``source_dir``.

    Returns
    -------
    list
        One parsed JSON value per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if directory is None:
        directory = source_dir
    records = []
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(path.join(directory, file_name), 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # skip blank lines such as a trailing newline
                records.append(json.loads(line))
    return records


games = load_jsonl('games.jsonl')

treatments = load_jsonl('treatments.jsonl')

logs = load_jsonl('player-logs.jsonl')
for entry in logs:
    # 'jsonData' holds a JSON document serialized as a string; expose the
    # decoded form under 'data' (the raw string is left in place as well).
    entry['data'] = json.loads(entry['jsonData'])

stages = load_jsonl('stages.jsonl')
        
# match games, players, treatments, and log info
def _group_by_game(records):
    """Bucket records by their 'gameId', keeping the original order inside each bucket."""
    grouped = {}
    for record in records:
        grouped.setdefault(record['gameId'], []).append(record)
    return grouped


# Index logs and stages once instead of rescanning the full lists for every game.
logs_by_game = _group_by_game(logs)
stages_by_game = _group_by_game(stages)

loaded_games = []
for game in games:
    # assumes 'playerIds' is a list of hashable ids — TODO confirm against the export
    member_ids = set(game['playerIds'])
    # Iterate `players` (not `member_ids`) so the key order matches the players file.
    game['players'] = {pl['_id']: pl for pl in players if pl['_id'] in member_ids}

    # First treatment with a matching id wins; fail with a message that names
    # the game instead of an anonymous "list index out of range".
    treatment = next((t for t in treatments if t['_id'] == game['treatmentId']), None)
    if treatment is None:
        raise LookupError(
            "No treatment with _id " + repr(game['treatmentId'])
            + " found for game " + repr(game['_id'])
        )
    game['gameSetupId'] = treatment['name']

    # A game without any log entries / stages gets an empty list.
    game['log'] = logs_by_game.get(game['_id'], [])
    game['stages'] = stages_by_game.get(game['_id'], [])

    loaded_games.append(game)

# The printed index is what the `blocks` mapping further down refers to.
for i, game in enumerate(loaded_games):
    print(i, game['createdAt'], game['gameSetupId'])

0 2020-05-04T20:25:48.663Z testBots
1 2020-05-05T18:10:44.030Z panel_1_matched_pair_exp_design6_matched_20200319_134546
2 2020-05-05T18:11:30.507Z panel_1_matched_pair_caveman_exp_design6_matched_20200319_134546
3 2020-05-06T18:13:27.831Z panel_2_matched_pair_exp_design6_matched_20200319_134546
4 2020-05-06T18:19:54.834Z panel_2_matched_pair_caveman_exp_design6_matched_20200319_134546
5 2020-05-07T16:21:41.220Z testBots
6 2020-05-07T17:20:47.582Z panel_3_matched_pair_caveman_exp_design6_matched_20200319_134546
7 2020-05-07T17:25:08.619Z panel_3_matched_pair_exp_design6_matched_20200319_134546


## Construct Blocks
Each block contains a set of games that should be processed together. Games are referenced by the index printed in the listing above.

In [8]:
# adjust these to process new data
# Each entry maps an output block name to the positions in `loaded_games`
# of the games that belong to that block.
blocks = {
    "block_20200505_pilot": [1, 2],
    "block_20200506_pilot": [3, 4],
    "block_20200507_pilot": [7, 6],
}

# Alternative mapping, kept for reference:
# blocks = {
#     "block_20200624_pilot": [1],
#     "block_20200626_pilot": [2],
# }

# Warn about every block whose output file is already on disk; nothing is
# written or deleted here.
for block_name in blocks:
    outfile_name = ''.join((output_dir, block_name, '.json'))
    if not path.exists(outfile_name):
        continue
    print(outfile_name + " already exists. Be careful if you don't want to overwrite")

../results-anonymized/pilot/block_20200505_pilot.json already exists. Be careful if you don't want to overwrite
../results-anonymized/pilot/block_20200506_pilot.json already exists. Be careful if you don't want to overwrite
../results-anonymized/pilot/block_20200507_pilot.json already exists. Be careful if you don't want to overwrite


In [9]:
# Write one JSON file per block, keyed by each game's 'gameSetupId'.
for block_name, games_list in blocks.items():
    obj = {}
    for i in games_list:
        game = loaded_games[i]
        setup_id = game['gameSetupId']
        # Two games with the same setup name would collapse into a single key
        # and one of them would silently vanish from the export; stop before
        # anything is written for this block.
        if setup_id in obj:
            raise ValueError(
                "Block " + repr(block_name) + " contains more than one game with gameSetupId "
                + repr(setup_id) + " (indices " + str(games_list) + "); "
                "they cannot share one output file keyed by gameSetupId"
            )
        obj[setup_id] = game

    outfile_name = output_dir + block_name + '.json'
    with open(outfile_name, 'w') as f:
        print("Writing " + str(games_list) + " to " + outfile_name)
        json.dump(obj, f)

Writing [1, 2] to ../results-anonymized/pilot/block_20200505_pilot.json
Writing [3, 4] to ../results-anonymized/pilot/block_20200506_pilot.json
Writing [7, 6] to ../results-anonymized/pilot/block_20200507_pilot.json
