# Load libraries and global functions

In [1]:
# analysis
import collections
import random
import math
import pandas as pd
import numpy as np

In [2]:
# data loading
from copy import deepcopy
from pymongo import MongoClient

In [3]:
# code performance
import time
import multiprocessing as mp
from tqdm import tqdm_notebook as tqdm

# Fetch Pairing Data from [pairresearch.io](http://pairresearch.io/)

In [4]:
uri = 'mongodb://delta:delta@ds011419.mlab.com:11419/pair-research'
dbName = 'pair-research'
client = MongoClient(uri)
db = client[dbName]
db.list_collection_names()

['affinities',
 'meteor_accounts_loginServiceConfiguration',
 'tasks_history',
 'groups',
 'users',
 'objectlabs-system.admin.collections',
 'pairs_history',
 'tasks',
 'system.indexes',
 'pairings',
 'objectlabs-system',
 'affinities_history']

In [5]:
groups = pd.DataFrame(list(db.groups.find({})))

# remove testing groups
group_creator_ignore_list = ['Demo Admin', 'ykykykykykykykykykyk', 'Stella', 'Kevin Northwestern',
                             'Kevin Chen', 'Leesha', 'Jennie']
group_ignore_ids = groups[groups['creatorName'].isin(group_creator_ignore_list)]['_id'].unique()

# subset groups by id
groups_orig_size = len(groups)
groups_new_size = 0

groups = groups[~groups['_id'].isin(group_ignore_ids)]
groups.reset_index(drop=True, inplace=True)

# get valid group ids from remaining groups
valid_group_ids = groups['_id'].unique()

# print change in size
groups_new_size = len(groups)
print('Number of Groups \nOriginal size: {} --> New size: {}'.format(groups_orig_size, groups_new_size))

# display task history
groups.head()

Number of Groups 
Original size: 711 --> New size: 84


Unnamed: 0,_id,groupName,description,creatorId,creatorName,roles,creationDate,members,active,activePairing
0,uPLDbfFqqdHEEkgCT,Beatles,Rock and Roll Band,goGr47HDwtfphJ5xK,Julian Vicens,"[{'title': 'Guitar', '_id': 'oB3qMqXdTJNqR6vbZ...",2016-08-10 18:55:16.164,"[{'fullName': 'Julian Vicens', 'userId': 'goGr...",True,
1,Et46F6odTBmiFiDSZ,Knight Lab Testing,Knight Lab taking Pair Research for a spin,NtZ9hv3g6eLAwN2nY,Joe Germuska,"[{'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'...",2016-07-18 21:21:54.117,"[{'fullName': 'Joe Germuska', 'userId': 'NtZ9h...",True,nnN46Abcc78AAtqKf
2,kY7xHo6c5m5tCiQMH,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': 'q3PJXDZpMMhcZB...",2016-09-28 19:17:10.709,"[{'fullName': 'Emily Withrow', 'userId': 'u2GA...",False,
3,KEo62WdN5WSkHa9Hh,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': '6L6YwxgDwpqgoY...",2016-09-29 15:15:15.184,"[{'fullName': 'Emily Withrow', 'userId': 'u2GA...",False,
4,qPnf2DHHihugATnxD,Segal Design Cluster,an intellectual community for design faculty a...,PavTL8zD9664wvtfB,Haoqi Zhang,"[{'title': 'Professor', '_id': 'sSNgzD6So2kz95...",2016-11-10 18:38:04.379,"[{'fullName': 'Haoqi Zhang', 'userId': 'PavTL8...",True,52meFWjxGNoAqTJxx


In [6]:
users = pd.DataFrame(list(db.users.find({})))

# users must be in at least one valid group
valid_group_ids_set = set(valid_group_ids)
users['valid_user'] = users['groups'].apply(lambda x: bool(valid_group_ids_set & set([y['groupId'] for y in x])))

# remove invalid users
users_orig_size = len(users)
users_new_size = 0

users = users[users['valid_user']]

# print change in size
users_new_size = len(users)
print('Number of Users \nOriginal size: {} --> New size: {}'.format(users_orig_size, users_new_size))

# display users
users.head()

Number of Users 
Original size: 1143 --> New size: 1055


Unnamed: 0,_id,createdAt,services,emails,profile,groups,valid_user
1,BPQ7hyoHgghctHPqq,2016-08-29 18:24:50.295,{'password': {'bcrypt': '$2a$10$1.nd.WyfVggPpg...,"[{'address': 'egerber@northwestern.edu', 'veri...",{'fullName': 'Liz Gerber'},"[{'groupId': '9mdkMmj4pY8Q2TwqF', 'role': {'_i...",True
2,bZEjadPH7KrjM9PfD,2016-11-10 19:19:34.147,{'password': {}},"[{'address': 'ampiper@northwestern.edu', 'veri...",{'fullName': 'ampiper@northwestern.edu'},"[{'groupId': 'qPnf2DHHihugATnxD', 'role': {'_i...",True
5,4nAboBfRx5RMJg68G,2017-03-27 14:33:17.771,{'password': {}},"[{'address': 'g-danko@northwestern.edu', 'veri...",{'fullName': 'g-danko@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'ti...",True
6,9iEAAD9Y54n4hMy3D,2017-03-27 14:39:27.572,{'password': {}},"[{'address': 'a-prachand@northwestern.edu', 'v...",{'fullName': 'a-prachand@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'ti...",True
8,Byki6KMawAsYnmr8x,2017-06-01 21:25:16.500,{'password': {}},"[{'address': 'bjoern@eecs.berkeley.edu', 'veri...",{'fullName': 'bjoern@eecs.berkeley.edu'},"[{'groupId': 'je9bo2hHLbYwWNtRd', 'role': {'_i...",True


In [7]:
tasks_history = pd.DataFrame(list(db.tasks_history.find({})))

# remove bad groups
tasks_history_orig_size = len(tasks_history)
tasks_history_new_size = 0

tasks_history = tasks_history[tasks_history['groupId'].isin(valid_group_ids)]
tasks_history.reset_index(drop=True, inplace=True)

# add group_pairing_id
tasks_history['group_pairing_id'] = tasks_history['groupId'] + '-' + tasks_history['pairingId']

# print change in size
tasks_history_new_size = len(tasks_history)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_history_orig_size, tasks_history_new_size))

# display task history
tasks_history.head()

Number of Tasks
Original size: 4434 --> New size: 4422


Unnamed: 0,_id,name,userId,groupId,task,pairingId,group_pairing_id
0,k4ewZSgDHsvDFkXpX,Yongsung Kim,EDEFWcagLwCfXP5Jg,9mdkMmj4pY8Q2TwqF,i need to send out a short-survey to interviewees,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
1,RZZWR8pABaJBKYNFu,Julian Vicens,goGr47HDwtfphJ5xK,9mdkMmj4pY8Q2TwqF,I would like to talk about different ways to m...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
2,Xr3dvNreiwzq9ixrQ,Spencer Carlson,vbsF64nAgoitwrNeB,9mdkMmj4pY8Q2TwqF,Make educated guesses about the quality of my ...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
3,dFpfXT8szHkp2pYgG,Leesha,aNdSTecskgeAm2St5,9mdkMmj4pY8Q2TwqF,I need help planning a latency handling featur...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
4,zEMk9HQo9azvKzDye,Eureka Foong,JaEySKdKKg7LAF3Yg,9mdkMmj4pY8Q2TwqF,Installing a program using Terminal (I'm bad a...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL


In [8]:
pairings = pd.DataFrame(list(db.pairings.find({})))

# remove bad groups
pairings_orig_size = len(pairings)
pairings_new_size = 0

pairings = pairings[pairings['groupId'].isin(valid_group_ids)]

# add group_pair id
pairings['group_pairing_id'] = pairings['groupId'] + '-' + pairings['_id']
pairings.reset_index(drop=True, inplace=True)

# print change in size
pairings_new_size = len(pairings)
print('Number of Pairing Sessions\nOriginal size: {} --> New size: {}'.format(pairings_orig_size, pairings_new_size))

# display current pairings
print('Pairing count: {}, Unique group count: {}'.format(len(pairings), len(pairings.groupId.unique())))
pairings.sort_values('timestamp', ascending=False, inplace=True)
pairings.head()

Number of Pairing Sessions
Original size: 691 --> New size: 604
Pairing count: 604, Unique group count: 55


Unnamed: 0,_id,groupId,pairings,timestamp,group_pairing_id
603,wyZa88XjC559Epspt,5QXWCwAFBrdbLYGar,"[{'firstUserId': 'MyhNrCv6sYAdnEvBF', 'firstUs...",2019-11-27 18:36:06.111,5QXWCwAFBrdbLYGar-wyZa88XjC559Epspt
602,QEvCZCAQeE4WJuhAE,5QXWCwAFBrdbLYGar,"[{'firstUserId': 'uuHeMxx9twNKXtiZR', 'firstUs...",2019-11-27 18:32:34.399,5QXWCwAFBrdbLYGar-QEvCZCAQeE4WJuhAE
601,Gp44ikq2z6mYF26jB,5QXWCwAFBrdbLYGar,"[{'firstUserId': 'en4PobG8cG73svWCQ', 'firstUs...",2019-11-27 18:30:42.901,5QXWCwAFBrdbLYGar-Gp44ikq2z6mYF26jB
600,qMqYnQhxEb6wuCRpw,5QXWCwAFBrdbLYGar,"[{'firstUserId': 'BiWY7NwRNLGYgQG79', 'firstUs...",2019-11-22 19:25:24.590,5QXWCwAFBrdbLYGar-qMqYnQhxEb6wuCRpw
599,5ikQM3qH9p7RfP3vB,5QXWCwAFBrdbLYGar,"[{'firstUserId': 'en4PobG8cG73svWCQ', 'firstUs...",2019-11-22 19:19:54.694,5QXWCwAFBrdbLYGar-5ikQM3qH9p7RfP3vB


In [9]:
pairs_history = pd.DataFrame(list(db.pairs_history.find({})))

# remove bad groups
pairs_history_orig_size = len(pairs_history)
pairs_history_new_size = 0

pairs_history = pairs_history[pairs_history['groupId'].isin(valid_group_ids)]

# add group_pairing_id column
pairs_history['group_pairing_id'] = pairs_history['groupId'] + '-' + pairs_history['pairingId']
pairs_history.reset_index(drop=True, inplace=True)

# print change in size
pairs_history_new_size = len(pairs_history)
print('Number of Pairs\nOriginal size: {} --> New size: {}'.format(pairs_history_orig_size, pairs_history_new_size))

# display current pairs_history
print('Unique group count: {}, Unique pairing count: {}'.format(len(pairs_history.groupId.unique()), 
                                                                len(pairs_history.group_pairing_id.unique())))
pairs_history.sort_values('timestamp', ascending=False, inplace=True)
pairs_history.head(10)

Number of Pairs
Original size: 3122 --> New size: 3114
Unique group count: 55, Unique pairing count: 604


Unnamed: 0,_id,groupId,pairingId,firstUserId,firstUserName,firstUserRole,secondUserId,secondUserName,secondUserRole,timestamp,group_pairing_id
3113,antBG9RczKdzcMDb9,5QXWCwAFBrdbLYGar,wyZa88XjC559Epspt,BiWY7NwRNLGYgQG79,Daniel Zhu,Furnace FPS Helper,6vpLhKvhfxn9vKP2f,Yongsung Kim,DTR Student,2019-11-27 18:36:06.111,5QXWCwAFBrdbLYGar-wyZa88XjC559Epspt
3112,k53mnCRoJZkpuufSJ,5QXWCwAFBrdbLYGar,wyZa88XjC559Epspt,uuHeMxx9twNKXtiZR,Mary Truong,Furnace FPS Helper,mdhFQ6PNiAhfP7ce2,Kapil Garg,DTR Student,2019-11-27 18:36:06.111,5QXWCwAFBrdbLYGar-wyZa88XjC559Epspt
3111,vc3cDTDWKTYFztxsX,5QXWCwAFBrdbLYGar,wyZa88XjC559Epspt,MyhNrCv6sYAdnEvBF,Josh Klein,Furnace FPS Helper,en4PobG8cG73svWCQ,Caryl Henry,Furnace FPS Helper,2019-11-27 18:36:06.111,5QXWCwAFBrdbLYGar-wyZa88XjC559Epspt
3110,ofLyAq6Td5pkMtruv,5QXWCwAFBrdbLYGar,QEvCZCAQeE4WJuhAE,6vpLhKvhfxn9vKP2f,Yongsung Kim,DTR Student,en4PobG8cG73svWCQ,Caryl Henry,Furnace FPS Helper,2019-11-27 18:32:34.399,5QXWCwAFBrdbLYGar-QEvCZCAQeE4WJuhAE
3109,b4Woisr4LydM5iwAB,5QXWCwAFBrdbLYGar,QEvCZCAQeE4WJuhAE,BiWY7NwRNLGYgQG79,Daniel Zhu,Furnace FPS Helper,MyhNrCv6sYAdnEvBF,Josh Klein,Furnace FPS Helper,2019-11-27 18:32:34.399,5QXWCwAFBrdbLYGar-QEvCZCAQeE4WJuhAE
3108,HS96aCWQLME9B8Cvi,5QXWCwAFBrdbLYGar,QEvCZCAQeE4WJuhAE,uuHeMxx9twNKXtiZR,Mary Truong,Furnace FPS Helper,mdhFQ6PNiAhfP7ce2,Kapil Garg,DTR Student,2019-11-27 18:32:34.399,5QXWCwAFBrdbLYGar-QEvCZCAQeE4WJuhAE
3107,xpRNoKYoTRyAkxfgq,5QXWCwAFBrdbLYGar,Gp44ikq2z6mYF26jB,6vpLhKvhfxn9vKP2f,Yongsung Kim,DTR Student,BiWY7NwRNLGYgQG79,Daniel Zhu,Furnace FPS Helper,2019-11-27 18:30:42.901,5QXWCwAFBrdbLYGar-Gp44ikq2z6mYF26jB
3106,xdGrw765cvcAcvvet,5QXWCwAFBrdbLYGar,Gp44ikq2z6mYF26jB,MyhNrCv6sYAdnEvBF,Josh Klein,Furnace FPS Helper,uuHeMxx9twNKXtiZR,Mary Truong,Furnace FPS Helper,2019-11-27 18:30:42.901,5QXWCwAFBrdbLYGar-Gp44ikq2z6mYF26jB
3105,gypeWe7urA2DRLts4,5QXWCwAFBrdbLYGar,Gp44ikq2z6mYF26jB,en4PobG8cG73svWCQ,Caryl Henry,Furnace FPS Helper,mdhFQ6PNiAhfP7ce2,Kapil Garg,DTR Student,2019-11-27 18:30:42.901,5QXWCwAFBrdbLYGar-Gp44ikq2z6mYF26jB
3103,wAdmjQhmCnbDKCzty,5QXWCwAFBrdbLYGar,qMqYnQhxEb6wuCRpw,BiWY7NwRNLGYgQG79,Sarah Helper,Furnace FPS Helper,en4PobG8cG73svWCQ,Sarah Hanson,Furnace FPS Helper,2019-11-22 19:25:24.590,5QXWCwAFBrdbLYGar-qMqYnQhxEb6wuCRpw


In [10]:
tasks = pd.DataFrame(list(db.tasks.find({})))

# remove bad groups
tasks_orig_size = len(tasks)
tasks_new_size = 0

tasks = tasks[tasks['groupId'].isin(valid_group_ids)]
tasks.reset_index(drop=True, inplace=True)

# print change in size
tasks_new_size = len(tasks)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_orig_size, tasks_new_size))

# display current tasks
tasks.head()

Number of Tasks
Original size: 1268 --> New size: 826


Unnamed: 0,_id,name,userId,groupId,task
0,qSPQiuE42yMiZJYrM,Joe Germuska,NtZ9hv3g6eLAwN2nY,Et46F6odTBmiFiDSZ,
1,9ZtF3iuf2Gs273Nq6,wise@northwestern.edu,c2bWRsNjfijQtq6pN,Et46F6odTBmiFiDSZ,
2,36BHem3sZ7vPesS9v,e-withrow@northwestern.edu,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,
3,fYA2q2QAaahrvym9N,Julian Vicens,goGr47HDwtfphJ5xK,uPLDbfFqqdHEEkgCT,
4,pDv2qxmc3Qtgi5msk,rebecca.poulson@northwestern.edu,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,


In [11]:
affinities = pd.DataFrame(list(db.affinities.find({})))

# remove bad groups
affinities_orig_size = len(affinities)
affinities_new_size = 0

affinities = affinities[affinities['groupId'].isin(valid_group_ids)]
affinities.reset_index(drop=True, inplace=True)

# print change in size
affinities_new_size = len(affinities)
print('Number of Current Affinities\nOriginal size: {} --> New size: {}'.format(affinities_orig_size, affinities_new_size))

# display current affinities
affinities.head()

Number of Current Affinities
Original size: 3598 --> New size: 2190


Unnamed: 0,_id,helperId,helpeeId,groupId,value
0,CBAFDuJRt4PCqMFbi,u2GAvznbx7Jbf97Hk,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,1.0
1,sn3M9GLYLwxrdNuLf,dKco6yw8vaxbGpdrr,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,-1.0
2,QTWuMLM39mmfKyqqk,WTKxXpLuJAnDfgvFH,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,0.33
3,cQAeZBQdFyagMjJbJ,dKco6yw8vaxbGpdrr,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,1.0
4,zTpAK9XCN7p2Ea6Pg,u2GAvznbx7Jbf97Hk,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,1.0


In [12]:
affinities_history = pd.DataFrame(list(db.affinities_history.find({})))

# remove bad groups
affinities_history_orig_size = len(affinities_history)
affinities_history_new_size = 0

affinities_history = affinities_history[affinities_history['groupId'].isin(valid_group_ids)]

# add group_pairing_id column
affinities_history['group_pairing_id'] = affinities_history['groupId'] + '-' + affinities_history['pairingId']

# remove duplicate ratings
affinities_history.sort_values(['group_pairing_id', 'helpeeId', 'helperId'], inplace=True)
affinities_history.drop_duplicates(subset=['group_pairing_id', 'helpeeId', 'helperId'], keep='last', inplace=True)
affinities_history.reset_index(drop=True, inplace=True)

# print change in size
affinities_history_new_size = len(affinities_history)
print('Number of Past Affinities\nOriginal size: {} --> New size: {}'.format(affinities_history_orig_size, affinities_history_new_size))

# display affinity data
print('Unique Group Pairings: {}'.format(len(affinities_history.group_pairing_id.unique())))
affinities_history.head()

Number of Past Affinities
Original size: 52879 --> New size: 52609
Unique Group Pairings: 585


Unnamed: 0,_id,helperId,helpeeId,groupId,value,pairingId,group_pairing_id
0,v3nKkg77Jouf6BZ8G,GLTz7m8y7RqZCYzxx,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.33,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
1,D2kBQDRftmygv5f4L,PWufwHDsbRaw4se4X,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
2,R588B5nqLhmLbC4iW,f8wwqTXaifkxxoAc2,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
3,poiynLy2tnCMNzdGf,iyRaCwz7QzxPRSi5t,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
4,KmiSFQicDRa263Nfc,kEZXdjhfohiGxJWdu,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,-1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD


# Cleaning Data
Make sure all pairing sessions are valid. A valid pairing session from the `pairings` table for a group-pairing ID will have $n$ members in the 'pairings' column. 

It must also have:
1. $\frac{n}{2}$ pairs in the 'pairings' column of the `pairings` table for that group-pairing ID entry.
2. $\frac{n}{2}$ rows in the `pairs_history` table for rows with the same group-pairing ID entry.
3. $n$ tasks in the `tasks_history` table for rows with the same group-pairing ID entry.

Ideally, data should have the following, but these are not guaranteed since users may not report affinities for all users in the current pool.
4. $n*(n - 1)$ total affinities in the `affinities_history` table for rows with the same group-pairing ID entry.
5. $n - 1$ affinities per person in the `affinities_history` table for rows with the same group-pairing ID entry.

## TODO
- Unchecked edge case: when pair research doesn't pair everyone given an even number of users
- Plot when different kinds of conditions are breaking as a bar plot (aggregate), and a timeseries plot (trend) to show if phenomena is ongoing

## Determine invalid group-pairing sessions

In [13]:
def count_members_in_pairing(pairing):
    """
    Counts number of members in a pairing.
    
    Input:
        pairing (list of dicts): pairings between members as lists of dicts.
    
    Output:
        (int): number of people in the pairing.
    """
    count = 0
    for pair in pairing:
        if 'firstUserId' in pair and pair['firstUserId'] is not None:
            count += 1
        if 'secondUserId' in pair and pair['secondUserId'] is not None:
            count += 1
            
    return count

In [14]:
def validate_pairing(group_pairing_id, debug=False):
    """
    Validates that all data for a pairing is good.
    
    A valid pairing with n particpants for a given group_pairing_id meets the following conditions:
    1. n/2 pairs in the 'pairings' column of the `pairings` table for that group_pairing_id entry.
    2. n/2 rows in the `pairs_history` table for rows with the same group_pairing_id entry.
    3. n tasks in the `tasks_history` table for rows with the same group_pairing_id entry.
    
    Input:
        group_pairing_id (string): pairing session for group to validate.
        debug (bool): optional parameter to print whenever invalid session is detected.
        
    Output:
        (bool): whether pairing is valid
        (list of string): conditions failed in check
    """
    # get the pairing and number of users
    curr_pairing = pairings.query("group_pairing_id == @group_pairing_id").iloc[0]['pairings']
    n = count_members_in_pairing(curr_pairing)
    pairs_count = math.ceil(n / 2)
    
    # store each condition check
    condition_checks = [False for x in range(3)]
    
    # check condition 1
    condition_checks[0] = len(curr_pairing) == pairs_count
    
    # check condition 2
    condition_checks[1] = len(pairs_history.query("group_pairing_id == @group_pairing_id")) == pairs_count
    
    # check condition 3
    temp_tasks_history = tasks_history.query("group_pairing_id == @group_pairing_id").copy(deep=True)
    if len(temp_tasks_history) > 0:
        temp_tasks_history['group_pairing_user_id'] = temp_tasks_history.apply(lambda x: '{}-{}-{}'.format(x['groupId'], x['pairingId'], x['userId']), axis=1)
        temp_tasks_history.drop_duplicates(['group_pairing_user_id'], keep=False, inplace=True)
        condition_checks[2] = len(temp_tasks_history) == n
    
    # return checks
    all_conds_valid = all(condition_checks)
    failed_conds = None
    
    if not all_conds_valid:
        failed_conds = [str(index + 1) for index, condition in enumerate(condition_checks) if not condition]
        
        if debug:
            print('Invalid Group-Pairing Session: {} | Check Conditions Failed: {}'.format(group_pairing_id, ', '.join(failed_conds)))

    return all_conds_valid, failed_conds

In [15]:
# get group
group_pairing_ids = pairings['group_pairing_id'].unique()
group_pairing_ids.sort()

# collect all invalid group-pairing sessions
invalid_group_pairings = pd.DataFrame({
    'group_pairing_id': [],
    'group_id': [],
    'pairing_id': [],
    'conditions_failed': [],
    'user_count': [],
    'task_count': [],
    'expected_task_count': [],
    'pairing_count': [],
    'expected_pairing_count': [],
    'pairs_hist_count': [],
    'expected_pairs_hist_count': [],
    'affinity_count': [],
    'expected_affinity_count': []
})

for curr_id in tqdm(group_pairing_ids):
    curr_group_id, curr_pairing_id = curr_id.split('-')
    is_valid_pairing, conditions_failed = validate_pairing(curr_id)
    
    if not is_valid_pairing:
        n = count_members_in_pairing(pairings.query("group_pairing_id == @curr_id").iloc[0]['pairings'])
        
        invalid_group_pairings = invalid_group_pairings.append({
            'group_pairing_id': curr_id,
            'group_id': curr_group_id,
            'pairing_id': curr_pairing_id,
            'conditions_failed': conditions_failed,
            'user_count': n,
            'task_count': len(tasks_history.query("group_pairing_id == @curr_id")),
            'expected_task_count': n,
            'pairing_count': len(pairings.query("group_pairing_id == @curr_id").iloc[0]['pairings']),
            'expected_pairing_count': math.ceil(n / 2),
            'pairs_hist_count': len(pairs_history.query("group_pairing_id == @curr_id")),
            'expected_pairs_hist_count': math.ceil(n / 2),
            'affinity_count': len(affinities_history.query("group_pairing_id == @curr_id")),
            'expected_affinity_count': n * (n - 1)
        }, ignore_index=True)
        
invalid_group_pairings.sort_values('conditions_failed', inplace=True)
invalid_group_pairings.reset_index(drop=True, inplace=True)
invalid_group_pairings.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=604), HTML(value='')))




Unnamed: 0,group_pairing_id,group_id,pairing_id,conditions_failed,user_count,task_count,expected_task_count,pairing_count,expected_pairing_count,pairs_hist_count,expected_pairs_hist_count,affinity_count,expected_affinity_count
0,9mdkMmj4pY8Q2TwqF-P8SGjH7mLk9FxCf45,9mdkMmj4pY8Q2TwqF,P8SGjH7mLk9FxCf45,"[1, 2]",12.0,12.0,12.0,7.0,6.0,7.0,6.0,132.0,132.0
1,BibLRuKtNNv7QEDqb-YuHJWnrgg2prjPjRp,BibLRuKtNNv7QEDqb,YuHJWnrgg2prjPjRp,"[1, 2]",4.0,4.0,4.0,3.0,2.0,3.0,2.0,8.0,12.0
2,BibLRuKtNNv7QEDqb-evGbShCDMHxvCBABe,BibLRuKtNNv7QEDqb,evGbShCDMHxvCBABe,"[1, 2]",6.0,6.0,6.0,4.0,3.0,4.0,3.0,29.0,30.0
3,cToFEbgXcFbrKsSrj-o33KWpqZFwSNt3tDe,cToFEbgXcFbrKsSrj,o33KWpqZFwSNt3tDe,"[1, 2]",10.0,10.0,10.0,6.0,5.0,6.0,5.0,69.0,90.0
4,BibLRuKtNNv7QEDqb-jdfB7xy4iuaG66yiZ,BibLRuKtNNv7QEDqb,jdfB7xy4iuaG66yiZ,"[1, 2]",8.0,8.0,8.0,5.0,4.0,5.0,4.0,49.0,56.0


In [16]:
invalid_group_pairings[invalid_group_pairings.conditions_failed.apply(lambda x: '3' in x)].head()

Unnamed: 0,group_pairing_id,group_id,pairing_id,conditions_failed,user_count,task_count,expected_task_count,pairing_count,expected_pairing_count,pairs_hist_count,expected_pairs_hist_count,affinity_count,expected_affinity_count
19,cToFEbgXcFbrKsSrj-kTtxafCNsKoqsnhCt,cToFEbgXcFbrKsSrj,kTtxafCNsKoqsnhCt,"[1, 2, 3]",14.0,7.0,14.0,8.0,7.0,8.0,7.0,131.0,182.0
20,fduEdDA8nk5ybcYze-GjXnW5cj2SE8rzNCJ,fduEdDA8nk5ybcYze,GjXnW5cj2SE8rzNCJ,"[1, 2, 3]",3.0,0.0,3.0,3.0,2.0,3.0,2.0,6.0,6.0
21,fduEdDA8nk5ybcYze-vyrrtwEpzzRqCACxi,fduEdDA8nk5ybcYze,vyrrtwEpzzRqCACxi,"[1, 2, 3]",3.0,0.0,3.0,3.0,2.0,3.0,2.0,6.0,6.0
22,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF,nRAQpsPhsQs4zRvTL,"[1, 2, 3]",10.0,24.0,10.0,6.0,5.0,24.0,5.0,30.0,90.0
23,tAC6QTAqiEgFRpHMj-7JhveY48LSxc8s87j,tAC6QTAqiEgFRpHMj,7JhveY48LSxc8s87j,"[1, 2, 3]",2.0,0.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0


## Filter out invalid pairing sessions

In [17]:
def remove_invalid_sessions(df, df_name, exclusion_list):
    """
    Removes invalid group-pairing session ids from dataframe.
    
    Input:
        df (pandas dataframe): dataframe to remove group-pairing sessions from.
        df_name (string): name of data frame.
        exclusion_list (list of strings): group_pairing_ids to remove from df.
        
    Output:
        (pandas dataframe): cleaned dataframe
    """
    # track size for printing
    orig_size = len(df)
    new_size = 0
    
    # clean data
    df = df[~df['group_pairing_id'].isin(exclusion_list)]
    df.reset_index(drop=True, inplace=True)
    new_size = len(df)
    
    # print change in size
    print('{} Cleaning \nOrig Size: {} ==> New Size: {}'.format(df_name, orig_size, new_size), end='\n\n')
    
    # return cleaned df
    return df

In [18]:
invalid_group_pairing_ids_list = list(invalid_group_pairings['group_pairing_id'].unique())

In [19]:
tasks_history = remove_invalid_sessions(tasks_history, 'tasks_history', invalid_group_pairing_ids_list)
pairings = remove_invalid_sessions(pairings, 'pairings', invalid_group_pairing_ids_list)
pairs_history = remove_invalid_sessions(pairs_history, 'pairs_history', invalid_group_pairing_ids_list)
affinities_history = remove_invalid_sessions(affinities_history, 'affinities_history', invalid_group_pairing_ids_list)

tasks_history Cleaning 
Orig Size: 4422 ==> New Size: 3953

pairings Cleaning 
Orig Size: 604 ==> New Size: 395

pairs_history Cleaning 
Orig Size: 3114 ==> New Size: 2085

affinities_history Cleaning 
Orig Size: 52609 ==> New Size: 37353



## Add unique sequence number to each pairing session for each group
For example, first session for DTR is 1, second is 2, etc.

In [20]:
# extract sorted pairing ids
pairings_id_df = pairs_history[['groupId', 'pairingId', 'timestamp']].drop_duplicates(subset=['groupId', 'pairingId'], keep='first').sort_values(['groupId', 'timestamp']).reset_index(drop=True)
pairings_id_df['group_pairing_id'] = pairings_id_df['groupId'] + '-' + pairings_id_df['pairingId']
pairings_id_df['pairing_session_index'] = 0

# add index label for each
for groupId in pairings_id_df['groupId'].unique():
    n_row = len(pairings_id_df[pairings_id_df['groupId'] == groupId])
    pairings_id_df.loc[pairings_id_df.groupId == groupId, 'pairing_session_index'] = range(1, n_row + 1)

# show top 5 frows
pairings_id_df.head()

Unnamed: 0,groupId,pairingId,timestamp,group_pairing_id,pairing_session_index
0,2rFoGTfRa9LFdpQNA,A6d3rQwrRZHEz4qHu,2017-08-22 17:19:36.847,2rFoGTfRa9LFdpQNA-A6d3rQwrRZHEz4qHu,1
1,2rFoGTfRa9LFdpQNA,pNFhXhotBwQ6Z79Md,2017-08-22 21:54:05.882,2rFoGTfRa9LFdpQNA-pNFhXhotBwQ6Z79Md,2
2,2rFoGTfRa9LFdpQNA,eSXY7BuRX3ZhXX627,2017-10-24 20:56:43.001,2rFoGTfRa9LFdpQNA-eSXY7BuRX3ZhXX627,3
3,2rFoGTfRa9LFdpQNA,SwhcfsdjNCZcyzx3t,2017-11-28 21:48:06.568,2rFoGTfRa9LFdpQNA-SwhcfsdjNCZcyzx3t,4
4,2rFoGTfRa9LFdpQNA,SpiKfuqCoEZRLfDNK,2018-01-16 21:42:19.584,2rFoGTfRa9LFdpQNA-SpiKfuqCoEZRLfDNK,5


In [21]:
# add pairing index to earlier data frames
tasks_history = tasks_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
pairings = pairings.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
pairs_history = pairs_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
affinities_history = affinities_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])

# Isolate DTR Pairing Data

In [22]:
dtr_group_id = 'sM3z5FkZfsABqcj3g'

In [23]:
# create initial filters on the data for DTR only
dtr_pairs_history = pairs_history.query("groupId == @dtr_group_id").reset_index(drop=True)
dtr_pairs_history = dtr_pairs_history[['group_pairing_id',
                                       'firstUserId', 'firstUserName', 'secondUserId', 'secondUserName', 'pairing_session_index']]

dtr_task_history = tasks_history.query("groupId == @dtr_group_id").drop_duplicates(['task', 'userId', 'group_pairing_id']).reset_index(drop=True)

dtr_affinities_history = affinities_history.query("groupId == @dtr_group_id").reset_index(drop=True)
dtr_affinities_history = dtr_affinities_history[['group_pairing_id', 'helpeeId', 'helperId', 'value', 'pairing_session_index']]

In [24]:
# duplicate pairs data for non-nan entries
dtr_pairs_history_dup = dtr_pairs_history[~dtr_pairs_history['secondUserId'].isnull()]
dtr_pairs_history_dup = dtr_pairs_history_dup[['group_pairing_id', 'secondUserId', 'secondUserName', 'firstUserId', 'firstUserName', 'pairing_session_index']]
dtr_pairs_history_dup.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']

# combine back with original dtr_pairs_history df
dtr_pairs_history.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']
dtr_pairs_history = dtr_pairs_history.append(dtr_pairs_history_dup, ignore_index=True)

In [25]:
dtr_pairs_history = dtr_pairs_history.sort_values(['pairing_session_index', 'group_pairing_id', 'helperId']).reset_index(drop=True)
dtr_pairs_history.head(10)

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index
0,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,6rZbYn3cbQ9KNLRM5,Meg Grasse,aNdSTecskgeAm2St5,Leesha,1
1,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,EDEFWcagLwCfXP5Jg,Yongsung Kim,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,1
2,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,N3LsK5NJeKm8PkBx8,Allison Sun,mdhFQ6PNiAhfP7ce2,Kapil Garg,1
3,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,EDEFWcagLwCfXP5Jg,Yongsung Kim,1
4,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,TFtNL3sYbSSGykQJE,Ryan Madden,aupdNzYu8WmNEi4e5,Alex Kaldjian,1
5,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,XjH8WJxEXFgTwdi3o,Sameer Srivastava,nDHZGzczDWyqvyFhp,Sarah Lim,1
6,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,aNdSTecskgeAm2St5,Leesha,6rZbYn3cbQ9KNLRM5,Meg Grasse,1
7,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,aupdNzYu8WmNEi4e5,Alex Kaldjian,TFtNL3sYbSSGykQJE,Ryan Madden,1
8,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,cupTmXWNEZ2N7vxPk,Katie George,iEHKgJBH7hNSroEjw,Greg Kim,1
9,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,iEHKgJBH7hNSroEjw,Greg Kim,cupTmXWNEZ2N7vxPk,Katie George,1


In [26]:
# add task requests made by the helpee
dtr_pairs_tasks = dtr_pairs_history.merge(dtr_task_history[['group_pairing_id', 'userId', 'task', 'pairing_session_index']],
                                          left_on=['group_pairing_id', 'helpeeId', 'pairing_session_index'],
                                          right_on=['group_pairing_id', 'userId', 'pairing_session_index'],
                                          how='left')
dtr_pairs_tasks['paired'] = True
del dtr_pairs_tasks['userId']
dtr_pairs_tasks.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,task,paired
0,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,6rZbYn3cbQ9KNLRM5,Meg Grasse,aNdSTecskgeAm2St5,Leesha,1,I need to think through & implement test cases...,True
1,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,EDEFWcagLwCfXP5Jg,Yongsung Kim,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,1,Scaffolding feedback,True
2,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,N3LsK5NJeKm8PkBx8,Allison Sun,mdhFQ6PNiAhfP7ce2,Kapil Garg,1,Finish up scenarios and study design,True
3,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,EDEFWcagLwCfXP5Jg,Yongsung Kim,1,i need help with my fellowship research statement,True
4,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,TFtNL3sYbSSGykQJE,Ryan Madden,aupdNzYu8WmNEi4e5,Alex Kaldjian,1,coming up with prototyping examples for explor...,True


In [27]:
# add helper's ability to help. keep all affinities even if people weren't actually paired (how=right; for only paired people, use how='left')
dtr_pairs_tasks_affinities = dtr_pairs_tasks.merge(dtr_affinities_history,
                                                   left_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   right_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   how='right')

# fill with 0 affinity if person has no partner
dtr_pairs_tasks_affinities.loc[(~dtr_pairs_tasks_affinities['helpeeId'].isnull()) & (dtr_pairs_tasks_affinities['value'].isnull()), 'value'] = 0

# set paired to false for any unparied people (i.e., all rated affinties for people who weren't paired)
dtr_pairs_tasks_affinities.loc[dtr_pairs_tasks_affinities.paired.isnull(), 'paired'] = False

# fill blank names
dtr_name_dict = {row['userId']: row['name'] for index, row in dtr_task_history[['userId',  'name']].iterrows()}
dtr_pairs_tasks_affinities['helperName'] = dtr_pairs_tasks_affinities['helperName'].fillna(dtr_pairs_tasks_affinities['helperId']).replace(dtr_name_dict)
dtr_pairs_tasks_affinities['helpeeName'] = dtr_pairs_tasks_affinities['helpeeName'].fillna(dtr_pairs_tasks_affinities['helpeeId']).replace(dtr_name_dict)

# fill in blank tasks
dtr_pairs_tasks_affinities = dtr_pairs_tasks_affinities.merge(dtr_task_history[['group_pairing_id', 'userId', 'task']],
                                                              left_on=['group_pairing_id', 'helpeeId'],
                                                              right_on=['group_pairing_id', 'userId'])
del dtr_pairs_tasks_affinities['task_x']
dtr_pairs_tasks_affinities.rename(columns={'task_y': 'task'}, inplace=True)

# sort table and print
dtr_pairs_tasks_affinities.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,paired,value,userId,task
0,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,6rZbYn3cbQ9KNLRM5,Meg Grasse,aNdSTecskgeAm2St5,Leesha,1,True,0.66,aNdSTecskgeAm2St5,I need to think through & implement test cases...
1,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,N3LsK5NJeKm8PkBx8,Allison Sun,aNdSTecskgeAm2St5,Leesha,1,False,0.33,aNdSTecskgeAm2St5,I need to think through & implement test cases...
2,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,TFtNL3sYbSSGykQJE,Ryan Madden,aNdSTecskgeAm2St5,Leesha,1,False,0.0,aNdSTecskgeAm2St5,I need to think through & implement test cases...
3,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,XjH8WJxEXFgTwdi3o,Sameer Srivastava,aNdSTecskgeAm2St5,Leesha,1,False,0.33,aNdSTecskgeAm2St5,I need to think through & implement test cases...
4,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,aupdNzYu8WmNEi4e5,Alex Kaldjian,aNdSTecskgeAm2St5,Leesha,1,False,0.66,aNdSTecskgeAm2St5,I need to think through & implement test cases...


In [28]:
# replace values with 1 - 5
value_mappings = {
    '-1.0': 1,
    '0.0':  2,
    '0.33': 3,
    '0.66': 4,
    '1.0':  5
}

dtr_pairs_tasks_affinities['value'] = dtr_pairs_tasks_affinities['value'].astype(str)
dtr_pairs_tasks_affinities.replace({'value': value_mappings}, inplace=True)

# replace names
name_mappings = {
    'richardhuang2019@u.northwestern.edu': 'Richard Huang',
    'AlainaKafkes2017@u.northwestern.edu': 'Alaina Kafkes',
    'judylee2021@u.northwestern.edu': 'Judy Lee',
    'Leesha': 'Leesha Maliakal',
    'andrew': 'Andrew Finke',
    'Garrett': 'Garrett Hedman'
}
dtr_pairs_tasks_affinities.replace({'helperName': name_mappings, 'helpeeName': name_mappings}, inplace=True)

# more intutive column names
dtr_pairs_tasks_affinities.rename(columns={'task': 'helpeeRequest', 'value': 'helperAbilityToHelp'}, inplace=True)

# reorder columns
dtr_pairs_tasks_affinities = dtr_pairs_tasks_affinities[['group_pairing_id', 'pairing_session_index', 'helperId', 'helperName',
                                                         'helpeeId', 'helpeeName', 'helpeeRequest', 'helperAbilityToHelp', 'paired']]

In [29]:
# remove nans for initial analysis, and save out csv
dtr_pairs_tasks_affinities.dropna(how='any',axis=0).to_csv('./documents/dtr-pair-research-pairings.csv',
                                                           index=False)

## Summary Stats from DTR

In [30]:
# number of pairing sessions
print('Number of Pairing Sessions: {}'.format(len(dtr_pairs_history['group_pairing_id'].unique())))

Number of Pairing Sessions: 70


In [31]:
# number of unique users
print('Number of Unique Users: {}'.format(len(set(dtr_pairs_history['helperId'].unique()).union(set(dtr_pairs_history['helpeeId'].unique())))))

Number of Unique Users: 63


In [32]:
# total number of tasks
print('Total number of tasks: {}'.format(len(dtr_task_history)))

Total number of tasks: 1028


In [33]:
# number of users/tasks per session
dtr_task_history.groupby('group_pairing_id')['userId'].count().describe()

count    70.000000
mean     14.685714
std       4.023636
min       5.000000
25%      12.000000
50%      14.000000
75%      17.000000
max      24.000000
Name: userId, dtype: float64

In [34]:
# total number of specified affinities
print('Total number of affinities: {}'.format(len(dtr_affinities_history)))

Total number of affinities: 13172


In [35]:
# number of affinities per session
dtr_affinities_history.groupby(['group_pairing_id'])['value'].count().describe()

count     70.000000
mean     188.171429
std      103.680525
min       14.000000
25%      126.000000
50%      155.000000
75%      237.750000
max      535.000000
Name: value, dtype: float64

In [36]:
# number of affinities per user per session
dtr_affinities_history.groupby(['group_pairing_id', 'helperId'])['value'].count().describe()

count    992.000000
mean      13.278226
std        4.564507
min        1.000000
25%       11.000000
50%       13.000000
75%       16.000000
max       23.000000
Name: value, dtype: float64

# Isolate Delta Lab Data

In [37]:
delta_group_id = '9mdkMmj4pY8Q2TwqF'

In [38]:
# create initial filters on the data for Delta only
delta_pairs_history = pairs_history.query("groupId == @delta_group_id").reset_index(drop=True)
delta_pairs_history = delta_pairs_history[['group_pairing_id',
                                       'firstUserId', 'firstUserName', 'secondUserId', 'secondUserName', 'pairing_session_index']]

delta_task_history = tasks_history.query("groupId == @delta_group_id").drop_duplicates(['task', 'userId', 'group_pairing_id']).reset_index(drop=True)

delta_affinities_history = affinities_history.query("groupId == @delta_group_id").reset_index(drop=True)
delta_affinities_history = delta_affinities_history[['group_pairing_id', 'helpeeId', 'helperId', 'value', 'pairing_session_index']]

In [39]:
# duplicate pairs data for non-nan entries
delta_pairs_history_dup = delta_pairs_history[~delta_pairs_history['secondUserId'].isnull()]
delta_pairs_history_dup = delta_pairs_history_dup[['group_pairing_id', 'secondUserId', 'secondUserName', 'firstUserId', 'firstUserName', 'pairing_session_index']]
delta_pairs_history_dup.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']

# combine back with original delta_pairs_history df
delta_pairs_history.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']
delta_pairs_history = delta_pairs_history.append(delta_pairs_history_dup, ignore_index=True)

In [40]:
delta_pairs_history = delta_pairs_history.sort_values(['pairing_session_index', 'group_pairing_id', 'helperId']).reset_index(drop=True)
delta_pairs_history.head(10)

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index
0,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,57MnWENtTDkXRYhcL,Gulu,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1
1,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,6iR9Z64HEJDcD8qbu,Matt Easterday,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,1
2,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,EDEFWcagLwCfXP5Jg,Yongsung Kim,,,1
3,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,JaEySKdKKg7LAF3Yg,Eureka Foong,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,1
4,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,KYnkykoMwd9fbBbWB,Julie Hui,aNdSTecskgeAm2St5,Leesha,1
5,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,6iR9Z64HEJDcD8qbu,Matt Easterday,1
6,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,aNdSTecskgeAm2St5,Leesha,KYnkykoMwd9fbBbWB,Julie Hui,1
7,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,JaEySKdKKg7LAF3Yg,Eureka Foong,1
8,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,zBZSGgrZFfW5KH5vj,Natalia Smirnov,57MnWENtTDkXRYhcL,Gulu,1
9,9mdkMmj4pY8Q2TwqF-soiecrpv6CRPTqmkd,PavTL8zD9664wvtfB,Haoqi Zhang,,,2


In [41]:
# add task requests made by the helpee
delta_pairs_tasks = delta_pairs_history.merge(delta_task_history[['group_pairing_id', 'userId', 'task', 'pairing_session_index']],
                                          left_on=['group_pairing_id', 'helpeeId', 'pairing_session_index'],
                                          right_on=['group_pairing_id', 'userId', 'pairing_session_index'],
                                          how='left')
delta_pairs_tasks['paired'] = True
del delta_pairs_tasks['userId']
delta_pairs_tasks.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,task,paired
0,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,57MnWENtTDkXRYhcL,Gulu,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,Sit next to me while I fix revisions on my pap...,True
1,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,6iR9Z64HEJDcD8qbu,Matt Easterday,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,1,Read CSCW R&R and edit it -- also find citatio...,True
2,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,EDEFWcagLwCfXP5Jg,Yongsung Kim,,,1,,True
3,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,JaEySKdKKg7LAF3Yg,Eureka Foong,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,1,Help me revise my lit for CheerOn,True
4,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,KYnkykoMwd9fbBbWB,Julie Hui,aNdSTecskgeAm2St5,Leesha,1,I need help testing CrowdCheer outside! You ju...,True


In [42]:
# add helper's ability to help. keep all affinities even if people weren't actually paired (how=right; for only paired people, use how='left')
delta_pairs_tasks_affinities = delta_pairs_tasks.merge(delta_affinities_history,
                                                   left_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   right_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   how='right')

# fill with 0 affinity if person has no partner
delta_pairs_tasks_affinities.loc[(~delta_pairs_tasks_affinities['helpeeId'].isnull()) & (delta_pairs_tasks_affinities['value'].isnull()), 'value'] = 0

# set paired to false for any unparied people (i.e., all rated affinties for people who weren't paired)
delta_pairs_tasks_affinities.loc[delta_pairs_tasks_affinities.paired.isnull(), 'paired'] = False

# fill blank names
delta_name_dict = {row['userId']: row['name'] for index, row in delta_task_history[['userId',  'name']].iterrows()}
delta_pairs_tasks_affinities['helperName'] = delta_pairs_tasks_affinities['helperName'].fillna(delta_pairs_tasks_affinities['helperId']).replace(delta_name_dict)
delta_pairs_tasks_affinities['helpeeName'] = delta_pairs_tasks_affinities['helpeeName'].fillna(delta_pairs_tasks_affinities['helpeeId']).replace(delta_name_dict)

# fill in blank tasks
delta_pairs_tasks_affinities = delta_pairs_tasks_affinities.merge(delta_task_history[['group_pairing_id', 'userId', 'task']],
                                                              left_on=['group_pairing_id', 'helpeeId'],
                                                              right_on=['group_pairing_id', 'userId'])
del delta_pairs_tasks_affinities['task_x']
delta_pairs_tasks_affinities.rename(columns={'task_y': 'task'}, inplace=True)

# sort table and print
delta_pairs_tasks_affinities.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,paired,value,userId,task
0,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,57MnWENtTDkXRYhcL,Gulu,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,True,1.0,zBZSGgrZFfW5KH5vj,Sit next to me while I fix revisions on my pap...
1,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,6iR9Z64HEJDcD8qbu,Matt Easterday,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,False,-1.0,zBZSGgrZFfW5KH5vj,Sit next to me while I fix revisions on my pap...
2,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,EDEFWcagLwCfXP5Jg,Yongsung Kim,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,False,0.33,zBZSGgrZFfW5KH5vj,Sit next to me while I fix revisions on my pap...
3,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,JaEySKdKKg7LAF3Yg,Eureka Foong,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,False,0.66,zBZSGgrZFfW5KH5vj,Sit next to me while I fix revisions on my pap...
4,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,KYnkykoMwd9fbBbWB,Julie Hui,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,False,-1.0,zBZSGgrZFfW5KH5vj,Sit next to me while I fix revisions on my pap...


In [43]:
# replace values with 1 - 5
value_mappings = {
    '-1.0': 1,
    '0.0':  2,
    '0.33': 3,
    '0.66': 4,
    '1.0':  5
}

delta_pairs_tasks_affinities['value'] = delta_pairs_tasks_affinities['value'].astype(str)
delta_pairs_tasks_affinities.replace({'value': value_mappings}, inplace=True)

# replace names
name_mappings = {
    'Leesha': 'Leesha Maliakal',
    'Garrett': 'Garrett Hedman',
    'eharburg@gmail.com': 'Emily Harburg'
}
delta_pairs_tasks_affinities.replace({'helperName': name_mappings, 'helpeeName': name_mappings}, inplace=True)

# more intutive column names
delta_pairs_tasks_affinities.rename(columns={'task': 'helpeeRequest', 'value': 'helperAbilityToHelp'}, inplace=True)

# reorder columns
delta_pairs_tasks_affinities = delta_pairs_tasks_affinities[['group_pairing_id', 'pairing_session_index', 'helperId', 'helperName',
                                                         'helpeeId', 'helpeeName', 'helpeeRequest', 'helperAbilityToHelp', 'paired']]

In [44]:
# remove nans for initial analysis, and save out csv
delta_pairs_tasks_affinities.dropna(how='any',axis=0).to_csv('./documents/delta-pair-research-pairings.csv',
                                                           index=False)

## Summary Stats from Delta

In [45]:
# number of pairing sessions
print('Number of Pairing Sessions: {}'.format(len(delta_pairs_history['group_pairing_id'].unique())))

Number of Pairing Sessions: 131


In [46]:
# number of unique users
print('Number of Unique Users: {}'.format(len(set(delta_pairs_history['helperId'].unique()).union(set(delta_pairs_history['helpeeId'].unique())))))

Number of Unique Users: 38


In [47]:
# total number of tasks
print('Total number of tasks: {}'.format(len(delta_task_history)))

Total number of tasks: 1196


In [48]:
# number of users/tasks per session
delta_task_history.groupby('group_pairing_id')['userId'].count().describe()

count    131.000000
mean       9.129771
std        2.995887
min        1.000000
25%        6.000000
50%        9.000000
75%       11.000000
max       16.000000
Name: userId, dtype: float64

In [49]:
# total number of specified affinities
print('Total number of affinities: {}'.format(len(delta_affinities_history)))

Total number of affinities: 10391


In [50]:
# number of affinities per session
delta_affinities_history.groupby(['group_pairing_id'])['value'].count().describe()

count    129.000000
mean      80.550388
std       49.330829
min        1.000000
25%       30.000000
50%       81.000000
75%      108.000000
max      234.000000
Name: value, dtype: float64

In [51]:
# number of affinities per user per session
delta_affinities_history.groupby(['group_pairing_id', 'helperId'])['value'].count().describe()

count    1182.000000
mean        8.791032
std         2.790662
min         1.000000
25%         7.000000
50%         9.000000
75%        10.000000
max        15.000000
Name: value, dtype: float64