In [1]:
import json

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Click-tracking element ids to keep: the Clicktracking export is filtered with
# clicks.element.isin(valid) when the data is loaded.
valid = [
    "word-input",
    "get-team-letters",
    "word-tiles",
    "team-letters",
    # NOTE(review): the trailing comma sits INSIDE the string below, unlike every
    # sibling id — confirm this matches the logged element id and is not a typo.
    "anagrams-words-word-7-anagrams-14,",
    "word-input-group",
    "anagrams-words-word-8-anagrams-16",
    "copy-letters-div",
    "anagrams-words-word-20-anagrams-40",
    "anagrams-words-word-47-anagrams-62",
    "anagrams-words-word-55-anagrams-66",
    # NOTE(review): bare prefix with no word/anagram suffix — confirm intentional.
    "anagrams-words-",
    "anagrams-words-word-61-anagrams-72",
    "anagrams-words-word-62-anagrams-74",
    "anagrams-words-word-69-anagrams-82",
    "anagrams-words-word-71-anagrams-86",
    "your-letters",
    "anagrams-words-word-73-anagrams-90",
    "countdown-number",
    "anagrams-words-word-74-anagrams-92",
    "anagrams-words-word-75-anagrams-94",
    "anagrams-words-word-81-anagrams-96",
    "main_container",
    "anagrams-words-word-83-anagrams-98",
    "anagrams-words-word-88-anagrams-102"
]

In [18]:
dirname = '2017-11-24'
experiment_dir = '../data/experiment/{}/'.format(dirname)


def read_experiment_csv(filename, **read_csv_kwargs):
    """Read one CSV export from the experiment folder selected by `dirname`.

    Extra keyword arguments are forwarded unchanged to `pd.read_csv`.
    """
    return pd.read_csv(experiment_dir + filename, **read_csv_kwargs)


# Experiment Level Results
summary = read_experiment_csv('CompletedSessionSummary.csv')
kValues = read_experiment_csv('k-file.csv', index_col=0)

# Click log, restricted to the element ids listed in `valid`.
clicks = read_experiment_csv('Clicktracking.csv')
clicks = clicks[clicks.element.isin(valid)]
demographic = read_experiment_csv('demographic.csv')
demographic_detailed = read_experiment_csv('demographic_detailed.csv')
time_spent = read_experiment_csv('TimeSpent.csv')
ruse = read_experiment_csv('ruse.csv')


# Anagrams Game
anagrams = read_experiment_csv('anagrams.csv')
instructions_anagrams = read_experiment_csv('instructions_anagrams.csv')
letter_transactions = read_experiment_csv('Letter transactions.csv')
neighbors = read_experiment_csv('Neighbors.csv')
team_words = read_experiment_csv('Team words.csv')
user_letters = read_experiment_csv('User letters.csv')


# Public Goods Game
public_goods = read_experiment_csv('public_goods.csv')
instructions_pgg = read_experiment_csv('instructions_pgg.csv')

In [4]:
# Lookup table: participant code -> session code, built from the demographic export.
partToSess = demographic.set_index('participant.code')['session.code'].to_dict()

## Measurable Inputs; need to show all available from study as well.

In [5]:
# Session-level feature table: indexed by session.code, one column per aggregated
# measure. It is rebuilt from scratch here and extended by later cells.
df = pd.DataFrame()

# DIFI Score
# 9999 is replaced with NaN before aggregating (presumably a missing-value
# sentinel — TODO confirm) so it cannot enter the mean/variance.
# The value column is selected explicitly (double brackets keep a one-column
# DataFrame) so the string participant.code column never reaches mean()/var();
# pandas >= 2.0 raises TypeError instead of silently dropping such columns.
difi_linear_pregame = instructions_anagrams[['session.code', 'participant.code', 'player.distanceScale_before']]
difi_linear_pregame = difi_linear_pregame.replace(9999, np.nan)
difi_linear_pregame_by_session = difi_linear_pregame.groupby('session.code')[['player.distanceScale_before']]
difi_linear_pregame_mean = difi_linear_pregame_by_session.mean()
difi_linear_pregame_mean.columns = ['difi_pregame_mean']
difi_linear_pregame_var = difi_linear_pregame_by_session.var()
difi_linear_pregame_var.columns = ['difi_pregame_var']

df = pd.concat([df, difi_linear_pregame_mean, difi_linear_pregame_var], axis=1)

difi_overlap_pregame = instructions_anagrams[['session.code', 'participant.code', 'player.overlapScale_before']]
difi_overlap_pregame = difi_overlap_pregame.replace(9999, np.nan)
difi_overlap_pregame_by_session = difi_overlap_pregame.groupby('session.code')[['player.overlapScale_before']]
difi_overlap_pregame_mean = difi_overlap_pregame_by_session.mean()
difi_overlap_pregame_mean.columns = ['difi_overlap_pregame_mean']
difi_overlap_pregame_var = difi_overlap_pregame_by_session.var()
difi_overlap_pregame_var.columns = ['difi_overlap_pregame_var']

df = pd.concat([df, difi_overlap_pregame_mean, difi_overlap_pregame_var], axis=1)

# Request Diversity
# Number of letter-transaction rows per participant, then mean/variance of those
# counts within each session.
requestDiversity = pd.DataFrame(letter_transactions[['player__participant__code', 'timestamp']], copy=True)
requestDiversity.columns = ['participant.code', 'timestamp']
# Column-wise lookup; still raises KeyError for a participant missing from partToSess.
requestDiversity['session.code'] = requestDiversity['participant.code'].map(lambda code: partToSess[code])
requests_per_participant = requestDiversity.groupby(['session.code', 'participant.code']).count()
requestDiversity_mean = requests_per_participant.groupby('session.code').mean()
requestDiversity_mean.columns = ['requestDiversity_mean']
requestDiversity_var = requests_per_participant.groupby('session.code').var()
requestDiversity_var.columns = ['requestDiversity_var']

df = pd.concat([df, requestDiversity_mean, requestDiversity_var], axis=1)

# Participation Rate
# Gap between consecutive click timestamps within a session.
# Rows are differenced in their existing order; presumably the export is already
# time-ordered — TODO confirm.
# .copy() makes this an independent frame so the column edits below do not act on
# a slice of `clicks`.
participationRate = clicks[['participant__code', 'element', 'timestamp']].copy()
participationRate.columns = ['participant.code', 'element', 'timestamp']
participationRate['session.code'] = participationRate['participant.code'].map(lambda code: partToSess[code])
# GroupBy.diff keeps the original row index, so the result aligns on assignment.
participationRate['diff'] = participationRate.groupby('session.code')['timestamp'].diff()
participationRate_mean = participationRate.groupby('session.code')['diff'].mean()
participationRate_mean.name = 'participationRate_mean'
participationRate_var = participationRate.groupby('session.code')['diff'].var()
participationRate_var.name = 'participationRate_var'

df = pd.concat([df, participationRate_mean, participationRate_var], axis=1)

# Correct Response Rate
# Gap between consecutive team-word timestamps within a session.
correctResponses = pd.DataFrame(team_words[['player__participant__session__code', 'player__participant__code', 'timestamp']], copy=True)
correctResponses.columns = ['session.code', 'participant.code', 'timestamp']
correctResponses['diff'] = correctResponses.groupby('session.code')['timestamp'].diff()
correctResponseRate_mean = correctResponses.groupby('session.code')['diff'].mean()
correctResponseRate_mean.name = 'correctResponseRate_mean'
correctResponseRate_var = correctResponses.groupby('session.code')['diff'].var()
correctResponseRate_var.name = 'correctResponseRate_var'

df = pd.concat([df, correctResponseRate_mean, correctResponseRate_var], axis=1)

# Letter Request Rate
# Gap between consecutive letter-transaction timestamps within a session.
letterRequestRate = pd.DataFrame(letter_transactions[['player__participant__code', 'timestamp']], copy=True)
letterRequestRate.columns = ['participant.code', 'timestamp']
letterRequestRate['session.code'] = letterRequestRate['participant.code'].map(lambda code: partToSess[code])
letterRequestRate['diff'] = letterRequestRate.groupby('session.code')['timestamp'].diff()
letterRequestRate_mean = letterRequestRate.groupby('session.code')['diff'].mean()
letterRequestRate_mean.name = 'letterRequestRate_mean'
letterRequestRate_var = letterRequestRate.groupby('session.code')['diff'].var()
letterRequestRate_var.name = 'letterRequestRate_var'

df = pd.concat([df, letterRequestRate_mean, letterRequestRate_var], axis=1)

# Participation Times
# TODO: not implemented yet; placeholders kept from the original notebook.
#participationTimes_mean = 0
#participationTimes_var = 0

## Measurable Outputs; need to show all available from study as well.

In [6]:
# NOTE: every section below appends columns to the existing `df`; re-running this
# cell without first re-running the cell that rebuilds `df` adds the same columns again.

# DIFI Score
# 9999 is replaced with NaN before aggregating (presumably a missing-value
# sentinel — TODO confirm) so it cannot enter the mean/variance.
# The value column is selected explicitly (double brackets keep a one-column
# DataFrame) so the string participant.code column never reaches mean()/var();
# pandas >= 2.0 raises TypeError instead of silently dropping such columns.
difi_linear_postgame = public_goods[['session.code', 'participant.code', 'player.distanceScale']]
difi_linear_postgame = difi_linear_postgame.replace(9999, np.nan)
difi_linear_postgame_by_session = difi_linear_postgame.groupby('session.code')[['player.distanceScale']]
difi_linear_postgame_mean = difi_linear_postgame_by_session.mean()
difi_linear_postgame_mean.columns = ['difi_linear_postgame_mean']
difi_linear_postgame_var = difi_linear_postgame_by_session.var()
difi_linear_postgame_var.columns = ['difi_linear_postgame_var']

df = pd.concat([df, difi_linear_postgame_mean, difi_linear_postgame_var], axis=1)

difi_overlap_postgame = public_goods[['session.code', 'participant.code', 'player.overlapScale']]
difi_overlap_postgame = difi_overlap_postgame.replace(9999, np.nan)
difi_overlap_postgame_by_session = difi_overlap_postgame.groupby('session.code')[['player.overlapScale']]
difi_overlap_postgame_mean = difi_overlap_postgame_by_session.mean()
difi_overlap_postgame_mean.columns = ['difi_overlap_postgame_mean']
difi_overlap_postgame_var = difi_overlap_postgame_by_session.var()
difi_overlap_postgame_var.columns = ['difi_overlap_postgame_var']

df = pd.concat([df, difi_overlap_postgame_mean, difi_overlap_postgame_var], axis=1)

# Response Times
# Seconds spent on the 'Contribute' page, aggregated per session.
# .copy() makes this an independent frame so the column edits below do not act on
# a slice of `time_spent`.
responseTimes = time_spent.loc[time_spent.page_name == 'Contribute', ['participant__code', 'seconds_on_page']].copy()
responseTimes.columns = ['participant.code', 'seconds_on_page']
# Column-wise lookup; still raises KeyError for a participant missing from partToSess.
responseTimes['session.code'] = responseTimes['participant.code'].map(lambda code: partToSess[code])
responseTimes_by_session = responseTimes.groupby('session.code')[['seconds_on_page']]
responseTimes_mean = responseTimes_by_session.mean()
responseTimes_mean.columns = ['responseTimes_mean']
responseTimes_var = responseTimes_by_session.var()
responseTimes_var.columns = ['responseTimes_var']

df = pd.concat([df, responseTimes_mean, responseTimes_var], axis=1)

# Public Goods Contribution
# NOTE(review): unlike the DIFI columns, 9999 is not replaced here — confirm that
# player.contribution never carries that value.
pggContribution = public_goods[['session.code', 'participant.code', 'player.contribution']]
pggContribution_by_session = pggContribution.groupby('session.code')[['player.contribution']]
pggContribution_mean = pggContribution_by_session.mean()
pggContribution_mean.columns = ['pggContribution_mean']
pggContribution_var = pggContribution_by_session.var()
pggContribution_var.columns = ['pggContribution_var']

df = pd.concat([df, pggContribution_mean, pggContribution_var], axis=1)

In [7]:
# A session is kept when it has a positive total cost, a duration below 100
# (in whatever unit the summary export uses), and at least one consenting participant.
is_valid_session = (
    (summary.total_cost > 0)
    & (summary.duration < 100)
    & (summary.n_part_consented > 0)
)
valid_idx = summary.loc[is_valid_session, 'session_code']
# Restrict the feature table to those sessions (df is indexed by session code).
df = df.loc[df.index.isin(valid_idx)]

In [8]:
# Pairwise correlations between all session-level feature columns in df.
df.corr()

Unnamed: 0,difi_pregame_mean,difi_pregame_var,difi_overlap_pregame_mean,difi_overlap_pregame_var,requestDiversity_mean,requestDiversity_var,participationRate_mean,participationRate_var,correctResponseRate_mean,correctResponseRate_var,letterRequestRate_mean,letterRequestRate_var,difi_linear_postgame_mean,difi_linear_postgame_var,difi_overlap_postgame_mean,difi_overlap_postgame_var,responseTimes_mean,responseTimes_var,pggContribution_mean,pggContribution_var
difi_pregame_mean,1.0,-0.220234,0.942876,0.257345,0.037869,0.101908,-0.057033,-0.057146,0.160666,0.123472,-0.071607,-0.043931,0.25332,-0.067432,0.302934,0.001408,0.060428,0.057402,-0.091597,0.008721
difi_pregame_var,-0.220234,1.0,0.026328,0.553471,-0.095232,-0.024936,-0.086362,-0.0864,-0.201861,-0.147825,0.046748,-0.016096,-0.109018,0.418995,0.020351,0.345581,-0.065015,0.075395,0.062016,-0.07279
difi_overlap_pregame_mean,0.942876,0.026328,1.0,0.285659,0.003607,0.060742,-0.090677,-0.090807,0.154878,0.089981,-0.073281,-0.079591,0.218868,0.047632,0.299101,0.099744,0.02241,0.04549,-0.111064,-0.011273
difi_overlap_pregame_var,0.257345,0.553471,0.285659,1.0,-0.10946,0.0179,0.083842,0.083983,-0.206878,-0.117336,0.042044,0.065397,0.155078,0.061504,0.228693,0.095819,-0.056322,0.067304,-0.093769,-0.154643
requestDiversity_mean,0.037869,-0.095232,0.003607,-0.10946,1.0,0.691368,-0.380095,-0.126203,-0.406052,-0.357249,-0.549157,-0.503767,0.307357,-0.320008,0.167685,0.001668,0.130098,0.162332,0.112294,0.045676
requestDiversity_var,0.101908,-0.024936,0.060742,0.0179,0.691368,1.0,-0.299421,-0.168508,-0.052854,0.011064,-0.290412,-0.336563,0.361052,-0.033245,0.296619,0.313054,0.037863,0.073046,0.00562,0.295446
participationRate_mean,-0.057033,-0.086362,-0.090677,0.083842,-0.380095,-0.299421,1.0,0.999998,0.703468,0.71743,0.423895,0.217484,0.082781,0.230964,0.099184,0.148101,-0.05842,-0.05285,-0.241436,-0.132653
participationRate_var,-0.057146,-0.0864,-0.090807,0.083983,-0.126203,-0.168508,0.999998,1.0,-0.11108,-0.050761,-0.143021,-0.091986,0.083698,0.091374,0.100036,0.131835,-0.058657,-0.052786,-0.241427,-0.133053
correctResponseRate_mean,0.160666,-0.201861,0.154878,-0.206878,-0.406052,-0.052854,0.703468,-0.11108,1.0,0.848135,0.720535,0.521196,-0.576858,0.352944,-0.478532,0.267535,-0.012823,-0.019703,0.006558,0.478339
correctResponseRate_var,0.123472,-0.147825,0.089981,-0.117336,-0.357249,0.011064,0.71743,-0.050761,0.848135,1.0,0.643254,0.483322,-0.626876,0.293453,-0.487472,0.232341,-0.118062,-0.12252,0.090389,0.554531


In [9]:
# Save the session-level feature table; the file name carries the data folder name (dirname).
df.to_csv('../data/output/grp-by-sessionId_{}.csv'.format(dirname))

In [10]:
# Display the session-level feature table (indexed by session.code).
df

Unnamed: 0_level_0,difi_pregame_mean,difi_pregame_var,difi_overlap_pregame_mean,difi_overlap_pregame_var,requestDiversity_mean,requestDiversity_var,participationRate_mean,participationRate_var,correctResponseRate_mean,correctResponseRate_var,letterRequestRate_mean,letterRequestRate_var,difi_linear_postgame_mean,difi_linear_postgame_var,difi_overlap_postgame_mean,difi_overlap_postgame_var,responseTimes_mean,responseTimes_var,pggContribution_mean,pggContribution_var
session.code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0j9gkpml,31.200000,5922.700000,40.000000,3000.000000,,,,,,,,,37.800000,5299.700000,43.600000,2548.800000,52.833333,829.366667,38.333333,806.666667
0mv7j8p9,61.333333,4981.066667,52.000000,2779.600000,6.600000,18.800000,4210.091954,2.509035e+07,8.805405,88.445003,9.093762,61.728022,90.500000,3459.666667,75.250000,2450.250000,54.250000,589.583333,46.250000,2022.916667
0v7aydut,39.875000,3014.383333,38.000000,1484.400000,,,,,,,,,51.812500,2573.495833,49.437500,1630.129167,42.500000,526.133333,45.375000,1035.583333
20pmmmna,54.500000,5795.714286,53.750000,1749.071429,,,,,,,,,57.625000,6289.125000,56.375000,2226.839286,62.375000,394.839286,60.625000,1274.553571
239cmkfa,59.750000,2092.916667,47.500000,1433.666667,7.250000,4.250000,4186.042254,2.006600e+07,3.415153,10.927378,8.719206,102.744034,58.000000,240.000000,53.000000,339.333333,55.500000,728.333333,62.500000,2291.666667
2413scgn,49.111111,2889.611111,42.666667,2055.250000,,,,,,,,,56.111111,2019.611111,46.000000,1434.500000,41.555556,626.277778,76.111111,754.861111
2m9rd2cp,21.611111,2442.839869,20.611111,1042.251634,5.000000,0.923077,1362.348659,5.514733e+06,1.473142,2.902172,3.758416,19.533891,47.833333,987.424242,42.416667,1164.265152,52.214286,941.412088,57.357143,1535.170330
2mvuz23s,53.250000,4272.785714,46.500000,2123.142857,,,,,,,,,66.500000,4606.857143,57.000000,2196.285714,43.750000,398.785714,50.625000,1260.267857
31f1vzlc,71.230769,3417.025641,57.692308,2365.230769,4.500000,3.388889,2242.548387,8.407011e+06,2.261036,6.375214,6.285599,97.460765,78.800000,3261.288889,68.700000,1577.344444,62.363636,855.454545,59.090909,749.090909
3k2emuiz,33.444444,2999.027778,36.222222,1398.694444,,,,,,,,,34.666667,5089.000000,42.888889,1904.861111,38.555556,191.277778,39.555556,1705.527778


In [12]:
# Inspect the per-session post-game DIFI distance means.
# NOTE(review): the saved output below reports 1698.0 for session 0j9gkpml, while
# the df display earlier in the notebook reports 37.8 for the same session and
# column — this output looks stale (it appears to include a 9999 value);
# re-run on a fresh kernel.
difi_linear_postgame_mean

Unnamed: 0_level_0,difi_linear_postgame_mean
session.code,Unnamed: 1_level_1
0j9gkpml,1698.000000
0v7aydut,51.812500
20pmmmna,57.625000
2413scgn,56.111111
2cwrrlzd,
2m9rd2cp,1469.428571
2mvuz23s,66.500000
31f1vzlc,980.636364
3k2emuiz,34.666667
3nmquptg,82.800000


In [16]:
# Spot-check the raw post-game DIFI distance rows for a single session.
# NOTE(review): the saved output below still contains a 9999.0 value even though
# difi_linear_postgame is built with replace(9999, np.nan) — the output looks
# stale; re-run on a fresh kernel.
difi_linear_postgame[difi_linear_postgame['session.code'] == '0j9gkpml']

Unnamed: 0,session.code,participant.code,player.distanceScale
194,0j9gkpml,6jra1tpu,95.0
195,0j9gkpml,uuyan5u6,
196,0j9gkpml,c5vfbsrf,32.0
197,0j9gkpml,qv2gulcr,
198,0j9gkpml,lq78tq63,
199,0j9gkpml,4zzsw03a,125.0
200,0j9gkpml,9gmzm0mh,
201,0j9gkpml,d4pfxh1v,-13.0
202,0j9gkpml,pkaz6pup,-50.0
203,0j9gkpml,idtwo3ms,9999.0


In [56]:
# Attach the per-session K value from the k-file (indexed by its first column)
# to the session summary. Any overlapping column coming from kValues gets the
# 'kFile' suffix.
summary = summary.join(kValues, on=['session_code'], how='left', rsuffix='kFile')

# Treatment table: one row per session with its Anagrams flag, N and K.
# .copy() makes T independent of `summary`, so the column edits below act on T
# itself rather than on a slice (avoids SettingWithCopyWarning / lost writes).
T = summary[['session_code', 'experiment_type', 'n_part_requested', 'K']].copy()
T.columns = ['session_code', 'Anagrams', 'N', 'K']
T['N'] = T['N'].astype(int)
# Flag is 1 when experiment_type does NOT end with 'Anagrams', 0 when it does.
# NOTE(review): presumably the no-anagrams condition is the one whose label ends
# in 'Anagrams' (e.g. "No Anagrams") — confirm against the summary export.
T['Anagrams'] = T['Anagrams'].apply(lambda kind: int(not kind.endswith('Anagrams')))
T

Unnamed: 0,session_code,Anagrams,N,K
0,wiii46il,1,5,2
1,vhmb74qv,1,5,2
2,5ydhsfg6,1,10,2
3,gbwspag3,1,10,2
4,89ykytnu,1,10,2
5,ef0hzbwg,1,10,2
6,3k2emuiz,0,10,2
7,20pmmmna,0,10,2
8,0j9gkpml,0,10,2
9,2mvuz23s,0,10,2


In [57]:
# Columns of T used as grouping factors when building the treatments mapping.
treatment_labels = ['Anagrams','N','K']

In [64]:
# Map every treatment level, keyed as '<label>=<value>', to the list of unique
# session codes that received it.
treatments = {}
for label in treatment_labels:
    sessions_by_level = T.groupby(label)['session_code'].unique()

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for level, session_codes in sessions_by_level.items():
        # list(...) turns the array of codes into a plain, JSON-serialisable list.
        treatments['{}={}'.format(label, level)] = list(session_codes)

In [65]:
# Display the treatment-level -> session-codes mapping.
treatments

{'Anagrams=0': ['3k2emuiz',
  '20pmmmna',
  '0j9gkpml',
  '2mvuz23s',
  'n4k2cx8b',
  'hi2c3026',
  '7pigyn3f',
  '4fs4fgfk',
  '5g3has3t',
  '2413scgn',
  'xwseefyz',
  'q93srm0v',
  '3nmquptg',
  '8zxyni8a',
  'h99ooh68',
  '5dbazi94',
  'gqdb8o26',
  'b2n1ddig',
  'xlc6rf1m',
  '8rqy84ck',
  'wqrygjn5',
  'hlzq44hl',
  'fpnd0fca',
  'xemkcr4x',
  'btkq36sn',
  'kdrbhzfy',
  'i5mx6e03',
  '0v7aydut',
  'bdjjsy69',
  'w6pz34i8',
  'o0ktodt9'],
 'Anagrams=1': ['wiii46il',
  'vhmb74qv',
  '5ydhsfg6',
  'gbwspag3',
  '89ykytnu',
  'ef0hzbwg',
  'jrseoprn',
  '7tbi6mwd',
  'pzv22f48',
  'frec209s',
  'wzq3kfp9',
  'aab8cga3',
  'kfn4hfj7',
  'r7gyn3n4',
  'vjdla1q9',
  'ehcc5wgj',
  'dhsdipge',
  'ub8pim6i',
  '942arxyx',
  'emwpven2',
  'oi2esved',
  '2m9rd2cp',
  '31f1vzlc',
  'ya61mn6p',
  'bquung7a',
  'ty1ttrub',
  'i0wkxelq',
  'eu7tcyrg',
  'krab6ohp',
  's14j5qtb',
  'wc7b7s80',
  'ltg4dv3d',
  'oflc49un',
  '9582pncb',
  'h504sxxm',
  'enxjuor7',
  'odl6fbvj',
  '9pl9m1y7',
  'wh

In [66]:
# Persist the treatment-level -> session-codes mapping as JSON.
# (`json` is imported with the other imports at the top of the notebook.)
with open('../data/output/treatments.json','w') as wFile:
    json.dump(treatments,wFile)