# Load libraries and global functions

In [1]:
# analysis
import collections
import random
import math
import pandas as pd
import numpy as np

In [2]:
# data loading
from copy import deepcopy
from pymongo import MongoClient

In [3]:
# code performance
import time
import multiprocessing as mp
from tqdm import tqdm_notebook as tqdm

In [4]:
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


# Fetch Pairing Data from [pairresearch.io](http://pairresearch.io/)

In [5]:
uri = 'mongodb://delta:delta@ds011419.mlab.com:11419/pair-research'
dbName = 'pair-research'
client = MongoClient(uri)
db = client[dbName]
db.list_collection_names()

['affinities',
 'meteor_accounts_loginServiceConfiguration',
 'tasks_history',
 'groups',
 'users',
 'objectlabs-system.admin.collections',
 'pairs_history',
 'tasks',
 'system.indexes',
 'pairings',
 'objectlabs-system',
 'affinities_history']

In [6]:
groups = pd.DataFrame(list(db.groups.find({})))

# remove testing groups
group_creator_ignore_list = ['Demo Admin', 'ykykykykykykykykykyk', 'Stella', 'Kevin Northwestern',
                             'Kevin Chen', 'Leesha', 'Jennie']
group_id_ignore_list = ['xwDA4HBxXudxF9Swp', '5QXWCwAFBrdbLYGar']

group_ignore_ids = list(groups[groups['creatorName'].isin(group_creator_ignore_list)]['_id'].unique()) + group_id_ignore_list

# subset groups by id
groups_orig_size = len(groups)
groups_new_size = 0

groups = groups[~groups['_id'].isin(group_ignore_ids)]
groups.reset_index(drop=True, inplace=True)

# get valid group ids from remaining groups
valid_group_ids = groups['_id'].unique()

# print change in size
groups_new_size = len(groups)
print('Number of Groups \nOriginal size: {} --> New size: {}'.format(groups_orig_size, groups_new_size))

# display task history
groups.head()

Number of Groups 
Original size: 750 --> New size: 103


Unnamed: 0,_id,groupName,description,creatorId,creatorName,roles,creationDate,members,active,activePairing
0,uPLDbfFqqdHEEkgCT,Beatles,Rock and Roll Band,goGr47HDwtfphJ5xK,Julian Vicens,"[{'title': 'Guitar', '_id': 'oB3qMqXdTJNqR6vbZ'}, {'title': 'Drums', '_id': 'B5cMzpGKBfQu3roS4'}, {'_id': 'SWwLr9Qa9m6NPo7ob', 'title': 'Bass Guitar'}, {'_id': '9mPrHM5yTWyycFmQA', 'title': 'Vocals'}]",2016-08-10 18:55:16.164,"[{'fullName': 'Julian Vicens', 'userId': 'goGr47HDwtfphJ5xK', 'role': {'title': 'Guitar', '_id': 'oB3qMqXdTJNqR6vbZ'}, 'isAdmin': True, 'isPending': False}]",True,
1,Et46F6odTBmiFiDSZ,Knight Lab Testing,Knight Lab taking Pair Research for a spin,NtZ9hv3g6eLAwN2nY,Joe Germuska,"[{'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, {'title': 'Member', '_id': 'DP5toGb7JSku49GRL'}]",2016-07-18 21:21:54.117,"[{'fullName': 'Joe Germuska', 'userId': 'NtZ9hv3g6eLAwN2nY', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Zach Wise', 'userId': 'c2bWRsNjfijQtq6pN', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Emily Withrow', 'userId': 'u2GAvznbx7Jbf97Hk', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Rebecca Poulson', 'userId': 'WTKxXpLuJAnDfgvFH', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Aditi Hemant Bhandari', 'userId': 'dKco6yw8vaxbGpdrr', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Matthew Zhang', 'userId': 'wZLsLedqXaSNQ2qrB', 'role': {'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'}, 'isAdmin': False, 'isPending': False}]",True,nnN46Abcc78AAtqKf
2,kY7xHo6c5m5tCiQMH,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': 'q3PJXDZpMMhcZBRzM'}, {'title': 'Graduate Student', '_id': 'NTFZYPLDZJvF5ZBbm'}, {'title': 'Undergraduate Student', '_id': 'TMkdfs2crEuMJnaMN'}]",2016-09-28 19:17:10.709,"[{'fullName': 'Emily Withrow', 'userId': 'u2GAvznbx7Jbf97Hk', 'role': {'title': 'Professor', '_id': 'q3PJXDZpMMhcZBRzM'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Joe Germuska', 'userId': 'NtZ9hv3g6eLAwN2nY', 'role': {'title': 'Professor', '_id': 'q3PJXDZpMMhcZBRzM'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Rebecca Poulson', 'userId': 'WTKxXpLuJAnDfgvFH', 'role': {'title': 'Professor', '_id': 'q3PJXDZpMMhcZBRzM'}, 'isAdmin': False, 'isPending': False}]",False,
3,KEo62WdN5WSkHa9Hh,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': '6L6YwxgDwpqgoYfQb'}, {'title': 'Graduate Student', '_id': 'Re9vHdfpX5xFivRt7'}, {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}]",2016-09-29 15:15:15.184,"[{'fullName': 'Emily Withrow', 'userId': 'u2GAvznbx7Jbf97Hk', 'role': {'title': 'Professor', '_id': '6L6YwxgDwpqgoYfQb'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Zach Wise', 'userId': 'c2bWRsNjfijQtq6pN', 'role': {'title': 'Professor', '_id': '6L6YwxgDwpqgoYfQb'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'SierraBoone2017@u.northwestern.edu', 'userId': 'Rpj3KmgFLbf5PNi9Y', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Missy Chen', 'userId': 'eTprphmMsyM2BhS8f', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Holly He', 'userId': 'MXmgyeXydQF6YjuxH', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'David Latimore II', 'userId': 'Bq5TSjSfFJxPFwQga', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Gregory Leung', 'userId': '4ReSPWMiunKcaF2Yt', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Diane Liu', 'userId': 'CZZEJEKvrPtLh7waq', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Michael Martinez', 'userId': 'bQyYuyeYAjdDuaBjE', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Josh Rosenblat', 'userId': 'WCSkPAoew76XfYLBv', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Rudy DeBerry', 'userId': 'GcRKXYyjt2B4cSxPo', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Lauren Dolowich', 'userId': 'NAsf3uAhoHdpGHMry', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Jordan Friedman', 'userId': 'HLMuPL5sKzy4nhWMy', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Nicholas Garbaty', 'userId': 'Qg8ggCZbxeBnpMpJc', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Raven Haynes', 'userId': 'FbFJjzX7nnJxkp8TP', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Nihal Kolur', 'userId': 'Jf73GjaTuxpLwDmFr', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Faith', 'userId': 'Ci4LfmCqgkSRA7eoX', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'GauriRangrass2017@u.northwestern.edu', 'userId': 'gqY8spKoFX8NDWF74', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Caroline Vakil', 'userId': 'FfiAnnLSBRzmFvr3E', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Virginia Van Keuren', 'userId': 'Lvepd4k7vnygobvbv', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Katie George', 'userId': 'cupTmXWNEZ2N7vxPk', 'role': {'title': 'Undergraduate Student', '_id': 'jSABmEMJaLwomCa7A'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Saurabh Rane', 'userId': 'NLHP6aMuxdkpCmPM7', 'role': {'title': 'Graduate Student', '_id': 'Re9vHdfpX5xFivRt7'}, 'isAdmin': False, 'isPending': False}]",False,
4,qPnf2DHHihugATnxD,Segal Design Cluster,an intellectual community for design faculty and doctoral students at Northwestern,PavTL8zD9664wvtfB,Haoqi Zhang,"[{'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, {'title': 'Post Doc', '_id': 'JEjFZZpJo5ENYzQoT'}, {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, {'title': 'Undergraduate Student', '_id': 'Yd9LAqncRQ2qyFTLX'}]",2016-11-10 18:38:04.379,"[{'fullName': 'Haoqi Zhang', 'userId': 'PavTL8zD9664wvtfB', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Josh Hibschman', 'userId': 'ZgLq4QzHZQTBsRBKY', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'scottallencambo@gmail.com', 'userId': 'EtSy6MCnBbcPdMx56', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Sugat Dabholkar', 'userId': '8zpoRYY8KdamXqSrD', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Eureka Foong', 'userId': 'JaEySKdKKg7LAF3Yg', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Daniel George Rees Lewis', 'userId': 'MJkj24zXWKhnZQCc3', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Ethan Manilow', 'userId': 'M8fjWKXyTgwMQ62C6', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Mmachi God'sglory Obiorah', 'userId': 'SfDsj2yphHxWeDBq7', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Matt Easterday', 'userId': '6iR9Z64HEJDcD8qbu', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Michael Horn', 'userId': 'BFKtDmafxsTk8Gtoy', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'dgergle@northwestern.edu', 'userId': 'asfBWfZ63pGgDfDGH', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'ampiper@northwestern.edu', 'userId': 'bZEjadPH7KrjM9PfD', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'pardo@northwestern.edu', 'userId': 'o9A5d76CWFvkhZp3C', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'maciver@northwestern.edu', 'userId': '4QHj9x3W27wdiSyfw', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'franconeri@northwestern.edu', 'userId': 'kuKx5jMkbq3kGjwnz', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'mbeeman@northwestern.edu', 'userId': 'Am3Azc5x5ZJDTt293', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'JosephMullenbach2015@u.northwestern.edu', 'userId': 'MLeZ2XexieTBmzTzW', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'juliehui@u.northwestern.edu', 'userId': 'dErXBarBLWbpcD3yh', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'MingxianWang2016@u.northwestern.edu', 'userId': '2dpuLijTDqhcfceQw', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'beheshti@u.northwestern.edu', 'userId': '6WjWbMZdh2B7tkxQK', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Natalia Smirnov', 'userId': 'zBZSGgrZFfW5KH5vj', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'EmmanouilTzorakoleftherakis2012@u.northwestern.edu', 'userId': 'JaveZHZAGNwEdgdrY', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'CraigShultz2012@u.northwestern.edu', 'userId': 'uQDBJBX5XMigKd2rc', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'KrystalVillanosa2017@u.northwestern.edu', 'userId': '793f3E7iho7B7hpWF', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'ArindamPaul2012@u.northwestern.edu', 'userId': '2QvkN9qbCPDjRXJ43', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Noah', 'userId': 'u7qCxBFWLFNivgpye', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Elizabeth Hunter', 'userId': 'w8r9sgA6AmX7GshaW', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'gabby.anton@u.northwestern.edu', 'userId': 'u6opsppzb7EGS3Zcy', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'rnbrewer@u.northwestern.edu', 'userId': 'F37PqF29MjeHTvrNq', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'bongjun@u.northwestern.edu', 'userId': 'oZfJLdpgZdd7GZGBE', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': True}, {'fullName': 'Yongsung Kim', 'userId': 'EDEFWcagLwCfXP5Jg', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'schueller@northwestern.edu', 'userId': 'AAxPH8ySwJnEGuvqT', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Scott Allen Cambo', 'userId': '3zasasYkausaRMBqL', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Uri Wilensky', 'userId': 'nPtLyTbzn4NNNmYFC', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Julie Hui', 'userId': 'KYnkykoMwd9fbBbWB', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Liz Gerber', 'userId': 'BPQ7hyoHgghctHPqq', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': True, 'isPending': False}, {'fullName': 'Ada Ng', 'userId': 'XLhRAjhGXSYEvC6Q3', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Wei Chen', 'userId': 'NEdS2iCrcCqqoQ7hk', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Spencer Carlson', 'userId': 'vbsF64nAgoitwrNeB', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Leesha', 'userId': 'aNdSTecskgeAm2St5', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Jamie Gorson', 'userId': 'jkwZFtFa69FijNWyY', 'role': {'title': 'Graduate Student', '_id': '83nMtbrqc3o6C4AT4'}, 'isAdmin': False, 'isPending': False}, {'fullName': 'Anne Marie Piper', 'userId': 'y5D7YfEuTMh8WaFzx', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'isAdmin': False, 'isPending': False}]",True,52meFWjxGNoAqTJxx


In [7]:
users = pd.DataFrame(list(db.users.find({})))

# users must be in at least one valid group
valid_group_ids_set = set(valid_group_ids)
users['valid_user'] = users['groups'].apply(lambda x: bool(valid_group_ids_set & set([y['groupId'] for y in x])))

# remove invalid users
users_orig_size = len(users)
users_new_size = 0

users = users[users['valid_user']]

# print change in size
users_new_size = len(users)
print('Number of Users \nOriginal size: {} --> New size: {}'.format(users_orig_size, users_new_size))

# display users
users.head()

Number of Users 
Original size: 1298 --> New size: 1180


Unnamed: 0,_id,createdAt,services,emails,profile,groups,valid_user
1,BPQ7hyoHgghctHPqq,2016-08-29 18:24:50.295,"{'password': {'bcrypt': '$2a$10$1.nd.WyfVggPpgSO/2GFLezANdA/eLbRmLjcOHb3vU1cVGpYGuQbO'}, 'resume': {'loginTokens': [{'when': datetime.datetime(2020, 7, 7, 20, 4, 31, 313000), 'hashedToken': 'oeIZlYBEwvxUweWtUGqKnqmOXC2NwNpkrTQhOoXC3BU='}]}}","[{'address': 'egerber@northwestern.edu', 'verified': True}]",{'fullName': 'Liz Gerber'},"[{'groupId': '9mdkMmj4pY8Q2TwqF', 'role': {'_id': 'mb6YtK69EM6TrYddm', 'title': 'Faculty'}, 'groupName': 'Delta Lab', 'isAdmin': True, 'isPending': False}, {'groupId': 'qPnf2DHHihugATnxD', 'role': {'title': 'Professor', '_id': 'sSNgzD6So2kz95vjt'}, 'groupName': 'Segal Design Cluster', 'isAdmin': True, 'isPending': False}]",True
2,bZEjadPH7KrjM9PfD,2016-11-10 19:19:34.147,{'password': {}},"[{'address': 'ampiper@northwestern.edu', 'verified': False}]",{'fullName': 'ampiper@northwestern.edu'},"[{'groupId': 'qPnf2DHHihugATnxD', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'groupName': 'Segal Design Cluster', 'isAdmin': False, 'isPending': True}]",True
5,4nAboBfRx5RMJg68G,2017-03-27 14:33:17.771,{'password': {}},"[{'address': 'g-danko@northwestern.edu', 'verified': False}]",{'fullName': 'g-danko@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'title': 'Student', '_id': '9KYNRehWRuGhECA7s'}, 'groupName': 'MSHE Research', 'isAdmin': False, 'isPending': True}]",True
6,9iEAAD9Y54n4hMy3D,2017-03-27 14:39:27.572,{'password': {}},"[{'address': 'a-prachand@northwestern.edu', 'verified': False}]",{'fullName': 'a-prachand@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'title': 'Professor', '_id': '9rrbXcQixxhtiKMpM'}, 'groupName': 'MSHE Research', 'isAdmin': False, 'isPending': True}]",True
8,Byki6KMawAsYnmr8x,2017-06-01 21:25:16.500,{'password': {}},"[{'address': 'bjoern@eecs.berkeley.edu', 'verified': False}]",{'fullName': 'bjoern@eecs.berkeley.edu'},"[{'groupId': 'je9bo2hHLbYwWNtRd', 'role': {'_id': '55555555555555555', 'title': 'Pending'}, 'groupName': 'BiD Lab', 'isAdmin': False, 'isPending': True}]",True


In [8]:
tasks_history = pd.DataFrame(list(db.tasks_history.find({})))

# remove bad groups
tasks_history_orig_size = len(tasks_history)
tasks_history_new_size = 0

tasks_history = tasks_history[tasks_history['groupId'].isin(valid_group_ids)]
tasks_history.reset_index(drop=True, inplace=True)

# add group_pairing_id
tasks_history['group_pairing_id'] = tasks_history['groupId'] + '-' + tasks_history['pairingId']

# print change in size
tasks_history_new_size = len(tasks_history)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_history_orig_size, tasks_history_new_size))

# display task history
tasks_history.head()

Number of Tasks
Original size: 5239 --> New size: 5208


Unnamed: 0,_id,name,userId,groupId,task,pairingId,group_pairing_id
0,k4ewZSgDHsvDFkXpX,Yongsung Kim,EDEFWcagLwCfXP5Jg,9mdkMmj4pY8Q2TwqF,i need to send out a short-survey to interviewees,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
1,RZZWR8pABaJBKYNFu,Julian Vicens,goGr47HDwtfphJ5xK,9mdkMmj4pY8Q2TwqF,I would like to talk about different ways to manage images in iOS (objective-c / swift),nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
2,Xr3dvNreiwzq9ixrQ,Spencer Carlson,vbsF64nAgoitwrNeB,9mdkMmj4pY8Q2TwqF,Make educated guesses about the quality of my half-completed lit summary work with me,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
3,dFpfXT8szHkp2pYgG,Leesha,aNdSTecskgeAm2St5,9mdkMmj4pY8Q2TwqF,I need help planning a latency handling feature in CrowdCheer (iOS Development),nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
4,zEMk9HQo9azvKzDye,Eureka Foong,JaEySKdKKg7LAF3Yg,9mdkMmj4pY8Q2TwqF,Installing a program using Terminal (I'm bad at command line) OR watch me write revisions,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL


In [9]:
pairings = pd.DataFrame(list(db.pairings.find({})))

# remove bad groups
pairings_orig_size = len(pairings)
pairings_new_size = 0

pairings = pairings[pairings['groupId'].isin(valid_group_ids)]

# add group_pair id
pairings['group_pairing_id'] = pairings['groupId'] + '-' + pairings['_id']
pairings.reset_index(drop=True, inplace=True)

# print change in size
pairings_new_size = len(pairings)
print('Number of Pairing Sessions\nOriginal size: {} --> New size: {}'.format(pairings_orig_size, pairings_new_size))

# display current pairings
print('Pairing count: {}, Unique group count: {}'.format(len(pairings), len(pairings.groupId.unique())))
pairings.sort_values('timestamp', ascending=False, inplace=True)
pairings.head()

Number of Pairing Sessions
Original size: 779 --> New size: 686
Pairing count: 686, Unique group count: 64


Unnamed: 0,_id,groupId,pairings,timestamp,group_pairing_id
685,G2r7FQo9Ty3R9pgp5,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': 'CQhpnQnrAQMZhD9tM', 'firstUserName': 'Nell O'Rourke', 'secondUserId': 'uXSSDxWNzRfJDZYYk', 'secondUserName': 'Harrison Kwik'}, {'firstUserId': 'zmwK4tJHtwLw8pLRC', 'firstUserName': 'Garrett', 'secondUserId': 'qfNF3GukcgyG2dspM', 'secondUserName': 'Evey Huang'}, {'firstUserId': 'EDEFWcagLwCfXP5Jg', 'firstUserName': 'Yongsung Kim', 'secondUserId': 'd9T3ZrcKMRvi6kqih', 'secondUserName': 'Gus Umbelino'}, {'firstUserId': 'vbsF64nAgoitwrNeB', 'firstUserName': 'Spencer Carlson', 'secondUserId': 'MJkj24zXWKhnZQCc3', 'secondUserName': 'Daniel George Rees Lewis'}, {'firstUserId': 'Z68eenXDt84czPkmN', 'firstUserName': 'Ryan Louie', 'secondUserId': 'xQ4mPiD4TX9MJqiqj', 'secondUserName': 'Kristine Lu'}, {'firstUserId': 'aNdSTecskgeAm2St5', 'firstUserName': 'Leesha'}]",2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
684,eCPczjuzgb44uuWHu,BibLRuKtNNv7QEDqb,"[{'firstUserId': 'MNcYyiwih6uaTPaiP', 'firstUserName': 'Anchit Tandon', 'secondUserId': '5jKdmspschYs227qC', 'secondUserName': 'Yoonseo Choi'}, {'firstUserId': 'LXqJcjKTfeXrAiBpt', 'firstUserName': 'S.-C. LEE', 'secondUserId': '6PpK8CNityuYgnwvd', 'secondUserName': 'Seungsu Kim'}, {'firstUserId': 'brekJ742JMS5LAsdq', 'firstUserName': 'seongha eom', 'secondUserId': 'wxM77Wbd7EMaqNQQz', 'secondUserName': 'HyeonJeong Ha'}, {'firstUserId': 'zo7NaLA4jXCYnNKNk', 'firstUserName': 'Hyunwoo Kim', 'secondUserId': 'Ff9ZxbN4fdgn4TWML', 'secondUserName': 'Hyungyu Shin'}, {'firstUserId': 'CxchMhgi7NkyBoRsE', 'firstUserName': 'Kabdo Choi', 'secondUserId': 'peYZPKXhRABGFzBu7', 'secondUserName': 'Yoonjoo Lee'}, {'firstUserId': 'oJ6xdYpzb5MDcLG6R', 'firstUserName': 'biasindi'}]",2020-08-07 04:56:53.769,BibLRuKtNNv7QEDqb-eCPczjuzgb44uuWHu
683,A4KtME3CJaN9EBFjF,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': 'zmwK4tJHtwLw8pLRC', 'firstUserName': 'Garrett', 'secondUserId': 'vbsF64nAgoitwrNeB', 'secondUserName': 'Spencer Carlson'}, {'firstUserId': 'qfNF3GukcgyG2dspM', 'firstUserName': 'Evey Huang', 'secondUserId': 'MJkj24zXWKhnZQCc3', 'secondUserName': 'Daniel George Rees Lewis'}, {'firstUserId': 'jkwZFtFa69FijNWyY', 'firstUserName': 'Jamie Gorson', 'secondUserId': 'd9T3ZrcKMRvi6kqih', 'secondUserName': 'Gus Umbelino'}, {'firstUserId': 'aNdSTecskgeAm2St5', 'firstUserName': 'Leesha', 'secondUserId': 'mxDCeFFW75vWtXtG6', 'secondUserName': 'Cindy Hu'}, {'firstUserId': 'Z68eenXDt84czPkmN', 'firstUserName': 'Ryan Louie'}]",2020-08-04 20:07:26.074,9mdkMmj4pY8Q2TwqF-A4KtME3CJaN9EBFjF
682,DAB5tEYFpXiCc5YJa,BibLRuKtNNv7QEDqb,"[{'firstUserId': 'EsLptEE8YdLCiGwBS', 'firstUserName': 'Jihyeong Hong', 'secondUserId': 'oJ6xdYpzb5MDcLG6R', 'secondUserName': 'biasindi'}, {'firstUserId': 'tLnn2FuTSxRQNbuDN', 'firstUserName': 'Umar Taufiqulhakim', 'secondUserId': 'SzHjeio3wb34jp965', 'secondUserName': 'Seoyoung Kim'}, {'firstUserId': 'Ff9ZxbN4fdgn4TWML', 'firstUserName': 'Hyungyu Shin', 'secondUserId': 'CxchMhgi7NkyBoRsE', 'secondUserName': 'Kabdo Choi'}, {'firstUserId': 'wo7XfZAD3SDrh468T', 'firstUserName': 'Hyerim Lee'}]",2020-08-04 04:12:26.986,BibLRuKtNNv7QEDqb-DAB5tEYFpXiCc5YJa
681,2Y8pYna3QpYnGzkMa,Ru8hz9iyiTnSFgxZw,"[{'firstUserId': 'Rng6FMcyfsd2uuaed', 'firstUserName': 'Sebastian', 'secondUserId': 'RWLiitWdaa997HKiS', 'secondUserName': 'Sandamini Ranwalage'}, {'firstUserId': '8Qweub9KPBH9jYvBv', 'firstUserName': 'Jane Barnette', 'secondUserId': 'rzQJnXitJuBvkEmb8', 'secondUserName': 'Elizabeth Hunter'}, {'firstUserId': '7nFHFiSLS3Pxe63fp', 'firstUserName': 'Jeanmarie Higgins', 'secondUserId': '83eR3WEQr7Xo56xZw', 'secondUserName': 'Yasmin Mikhaiel'}, {'firstUserId': 'jiFNEZuAaiL6GEW3r', 'firstUserName': 'Daniel Ciba', 'secondUserId': '6PKWJYm2gkKZfMwZW', 'secondUserName': 'nicole tabor'}]",2020-08-01 17:45:07.866,Ru8hz9iyiTnSFgxZw-2Y8pYna3QpYnGzkMa


In [10]:
pairs_history = pd.DataFrame(list(db.pairs_history.find({})))

# remove bad groups
pairs_history_orig_size = len(pairs_history)
pairs_history_new_size = 0

pairs_history = pairs_history[pairs_history['groupId'].isin(valid_group_ids)]

# add group_pairing_id column
pairs_history['group_pairing_id'] = pairs_history['groupId'] + '-' + pairs_history['pairingId']
pairs_history.reset_index(drop=True, inplace=True)

# print change in size
pairs_history_new_size = len(pairs_history)
print('Number of Pairs\nOriginal size: {} --> New size: {}'.format(pairs_history_orig_size, pairs_history_new_size))

# display current pairs_history
print('Unique group count: {}, Unique pairing count: {}'.format(len(pairs_history.groupId.unique()), 
                                                                len(pairs_history.group_pairing_id.unique())))
pairs_history.sort_values('timestamp', ascending=False, inplace=True)
pairs_history.head(10)

Number of Pairs
Original size: 3612 --> New size: 3592
Unique group count: 64, Unique pairing count: 686


Unnamed: 0,_id,groupId,pairingId,firstUserId,firstUserName,firstUserRole,secondUserId,secondUserName,secondUserRole,timestamp,group_pairing_id
3591,y8pXiQ8RLK3gacwHD,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,aNdSTecskgeAm2St5,Leesha,PhD Student,,,,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3590,rKYcwPrkE2dsCewK5,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,Z68eenXDt84czPkmN,Ryan Louie,PhD Student,xQ4mPiD4TX9MJqiqj,Kristine Lu,PhD Student,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3589,YY8jjomo4MswYx2FC,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,vbsF64nAgoitwrNeB,Spencer Carlson,PhD Student,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,Postdoc,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3588,97kJJpH8gtTwvB2ey,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,EDEFWcagLwCfXP5Jg,Yongsung Kim,PhD Student,d9T3ZrcKMRvi6kqih,Gus Umbelino,PhD Student,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3587,bRwKyy3Cru5E6sisM,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,zmwK4tJHtwLw8pLRC,Garrett,PhD Student,qfNF3GukcgyG2dspM,Evey Huang,PhD Student,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3586,scTosMJtHXEZBnPLw,9mdkMmj4pY8Q2TwqF,G2r7FQo9Ty3R9pgp5,CQhpnQnrAQMZhD9tM,Nell O'Rourke,Faculty,uXSSDxWNzRfJDZYYk,Harrison Kwik,PhD Student,2020-08-11 20:07:57.274,9mdkMmj4pY8Q2TwqF-G2r7FQo9Ty3R9pgp5
3585,nJzLRcxKEuovEiYan,BibLRuKtNNv7QEDqb,eCPczjuzgb44uuWHu,oJ6xdYpzb5MDcLG6R,biasindi,Undergraduate Student,,,,2020-08-07 04:56:53.769,BibLRuKtNNv7QEDqb-eCPczjuzgb44uuWHu
3584,cg9mw5HoaakfoZ3Nx,BibLRuKtNNv7QEDqb,eCPczjuzgb44uuWHu,CxchMhgi7NkyBoRsE,Kabdo Choi,Graduate Student,peYZPKXhRABGFzBu7,Yoonjoo Lee,Graduate Student,2020-08-07 04:56:53.769,BibLRuKtNNv7QEDqb-eCPczjuzgb44uuWHu
3583,3xLuSeo93qAMBP8pc,BibLRuKtNNv7QEDqb,eCPczjuzgb44uuWHu,zo7NaLA4jXCYnNKNk,Hyunwoo Kim,Graduate Student,Ff9ZxbN4fdgn4TWML,Hyungyu Shin,Graduate Student,2020-08-07 04:56:53.769,BibLRuKtNNv7QEDqb-eCPczjuzgb44uuWHu
3582,ot5XrxCB5zHC22pst,BibLRuKtNNv7QEDqb,eCPczjuzgb44uuWHu,brekJ742JMS5LAsdq,seongha eom,Undergraduate Student,wxM77Wbd7EMaqNQQz,HyeonJeong Ha,Undergraduate Student,2020-08-07 04:56:53.769,BibLRuKtNNv7QEDqb-eCPczjuzgb44uuWHu


In [11]:
tasks = pd.DataFrame(list(db.tasks.find({})))

# remove bad groups
tasks_orig_size = len(tasks)
tasks_new_size = 0

tasks = tasks[tasks['groupId'].isin(valid_group_ids)]
tasks.reset_index(drop=True, inplace=True)

# print change in size
tasks_new_size = len(tasks)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_orig_size, tasks_new_size))

# display current tasks
tasks.head()

Number of Tasks
Original size: 1347 --> New size: 898


Unnamed: 0,_id,name,userId,groupId,task
0,qSPQiuE42yMiZJYrM,Joe Germuska,NtZ9hv3g6eLAwN2nY,Et46F6odTBmiFiDSZ,
1,9ZtF3iuf2Gs273Nq6,wise@northwestern.edu,c2bWRsNjfijQtq6pN,Et46F6odTBmiFiDSZ,
2,36BHem3sZ7vPesS9v,e-withrow@northwestern.edu,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,
3,fYA2q2QAaahrvym9N,Julian Vicens,goGr47HDwtfphJ5xK,uPLDbfFqqdHEEkgCT,
4,pDv2qxmc3Qtgi5msk,rebecca.poulson@northwestern.edu,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,


In [12]:
affinities = pd.DataFrame(list(db.affinities.find({})))

# remove bad groups
affinities_orig_size = len(affinities)
affinities_new_size = 0

affinities = affinities[affinities['groupId'].isin(valid_group_ids)]
affinities.reset_index(drop=True, inplace=True)

# print change in size
affinities_new_size = len(affinities)
print('Number of Current Affinities\nOriginal size: {} --> New size: {}'.format(affinities_orig_size, affinities_new_size))

# display current affinities
affinities.head()

Number of Current Affinities
Original size: 4181 --> New size: 2757


Unnamed: 0,_id,helperId,helpeeId,groupId,value
0,CBAFDuJRt4PCqMFbi,u2GAvznbx7Jbf97Hk,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,1.0
1,sn3M9GLYLwxrdNuLf,dKco6yw8vaxbGpdrr,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,-1.0
2,QTWuMLM39mmfKyqqk,WTKxXpLuJAnDfgvFH,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,0.33
3,cQAeZBQdFyagMjJbJ,dKco6yw8vaxbGpdrr,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,1.0
4,zTpAK9XCN7p2Ea6Pg,u2GAvznbx7Jbf97Hk,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,1.0


In [13]:
affinities_history = pd.DataFrame(list(db.affinities_history.find({})))

# remove bad groups
affinities_history_orig_size = len(affinities_history)
affinities_history_new_size = 0

affinities_history = affinities_history[affinities_history['groupId'].isin(valid_group_ids)]

# add group_pairing_id column
affinities_history['group_pairing_id'] = affinities_history['groupId'] + '-' + affinities_history['pairingId']

# remove duplicate ratings
affinities_history.sort_values(['group_pairing_id', 'helpeeId', 'helperId'], inplace=True)
affinities_history.drop_duplicates(subset=['group_pairing_id', 'helpeeId', 'helperId'], keep='last', inplace=True)
affinities_history.reset_index(drop=True, inplace=True)

# print change in size
affinities_history_new_size = len(affinities_history)
print('Number of Past Affinities\nOriginal size: {} --> New size: {}'.format(affinities_history_orig_size, affinities_history_new_size))

# display affinity data
print('Unique Group Pairings: {}'.format(len(affinities_history.group_pairing_id.unique())))
affinities_history.head()

Number of Past Affinities
Original size: 63258 --> New size: 62914
Unique Group Pairings: 663


Unnamed: 0,_id,helperId,helpeeId,groupId,value,pairingId,group_pairing_id
0,v3nKkg77Jouf6BZ8G,GLTz7m8y7RqZCYzxx,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.33,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
1,D2kBQDRftmygv5f4L,PWufwHDsbRaw4se4X,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
2,R588B5nqLhmLbC4iW,f8wwqTXaifkxxoAc2,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
3,poiynLy2tnCMNzdGf,iyRaCwz7QzxPRSi5t,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
4,KmiSFQicDRa263Nfc,kEZXdjhfohiGxJWdu,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,-1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD


# Cleaning Data
Make sure all pairing sessions are valid. A valid pairing session from the `pairings` table for a group-pairing ID will have $n$ members in the 'pairings' column. 

It must also have:
1. $\frac{n}{2}$ pairs in the 'pairings' column of the `pairings` table for that group-pairing ID entry.
2. $\frac{n}{2}$ rows in the `pairs_history` table for rows with the same group-pairing ID entry.
3. $n$ tasks in the `tasks_history` table for rows with the same group-pairing ID entry.

Ideally, data should have the following, but these are not guaranteed since users may not report affinities for all users in the current pool.
4. $n*(n - 1)$ total affinities in the `affinities_history` table for rows with the same group-pairing ID entry.
5. $n - 1$ affinities per person in the `affinities_history` table for rows with the same group-pairing ID entry.

## TODO
- Unchecked edge case: when pair research doesn't pair everyone given an even number of users
- Plot when different kinds of conditions are breaking as a bar plot (aggregate), and a timeseries plot (trend) to show if phenomena is ongoing

## Determine invalid group-pairing sessions

In [14]:
def count_members_in_pairing(pairing):
    """
    Counts number of members in a pairing.
    
    Input:
        pairing (list of dicts): pairings between members as lists of dicts.
    
    Output:
        (int): number of people in the pairing.
    """
    count = 0
    for pair in pairing:
        if 'firstUserId' in pair and pair['firstUserId'] is not None:
            count += 1
        if 'secondUserId' in pair and pair['secondUserId'] is not None:
            count += 1
            
    return count

In [15]:
def validate_pairing(group_pairing_id, debug=False):
    """
    Validates that all data for a pairing is good.
    
    A valid pairing with n particpants for a given group_pairing_id meets the following conditions:
    1. n/2 pairs in the 'pairings' column of the `pairings` table for that group_pairing_id entry.
    2. n/2 rows in the `pairs_history` table for rows with the same group_pairing_id entry.
    3. n tasks in the `tasks_history` table for rows with the same group_pairing_id entry.
    
    Input:
        group_pairing_id (string): pairing session for group to validate.
        debug (bool): optional parameter to print whenever invalid session is detected.
        
    Output:
        (bool): whether pairing is valid
        (list of string): conditions failed in check
    """
    # get the pairing and number of users
    curr_pairing = pairings.query("group_pairing_id == @group_pairing_id").iloc[0]['pairings']
    n = count_members_in_pairing(curr_pairing)
    pairs_count = math.ceil(n / 2)
    
    # store each condition check
    condition_checks = [False for x in range(3)]
    
    # check condition 1
    condition_checks[0] = len(curr_pairing) == pairs_count
    
    # check condition 2
    condition_checks[1] = len(pairs_history.query("group_pairing_id == @group_pairing_id")) == pairs_count
    
    # check condition 3
    temp_tasks_history = tasks_history.query("group_pairing_id == @group_pairing_id").copy(deep=True)
    if len(temp_tasks_history) > 0:
        temp_tasks_history['group_pairing_user_id'] = temp_tasks_history.apply(lambda x: '{}-{}-{}'.format(x['groupId'], x['pairingId'], x['userId']), axis=1)
        temp_tasks_history.drop_duplicates(['group_pairing_user_id'], keep=False, inplace=True)
        condition_checks[2] = len(temp_tasks_history) == n
    
    # return checks
    all_conds_valid = all(condition_checks)
    failed_conds = None
    
    if not all_conds_valid:
        failed_conds = [str(index + 1) for index, condition in enumerate(condition_checks) if not condition]
        
        if debug:
            print('Invalid Group-Pairing Session: {} | Check Conditions Failed: {}'.format(group_pairing_id, ', '.join(failed_conds)))

    return all_conds_valid, failed_conds

In [16]:
# get group
group_pairing_ids = pairings['group_pairing_id'].unique()
group_pairing_ids.sort()

# collect all invalid group-pairing sessions
invalid_group_pairings = pd.DataFrame({
    'group_pairing_id': [],
    'group_id': [],
    'pairing_id': [],
    'conditions_failed': [],
    'user_count': [],
    'task_count': [],
    'expected_task_count': [],
    'pairing_count': [],
    'expected_pairing_count': [],
    'pairs_hist_count': [],
    'expected_pairs_hist_count': [],
    'affinity_count': [],
    'expected_affinity_count': []
})

for curr_id in tqdm(group_pairing_ids):
    curr_group_id, curr_pairing_id = curr_id.split('-')
    is_valid_pairing, conditions_failed = validate_pairing(curr_id)
    
    if not is_valid_pairing:
        n = count_members_in_pairing(pairings.query("group_pairing_id == @curr_id").iloc[0]['pairings'])
        
        invalid_group_pairings = invalid_group_pairings.append({
            'group_pairing_id': curr_id,
            'group_id': curr_group_id,
            'pairing_id': curr_pairing_id,
            'conditions_failed': conditions_failed,
            'user_count': n,
            'task_count': len(tasks_history.query("group_pairing_id == @curr_id")),
            'expected_task_count': n,
            'pairing_count': len(pairings.query("group_pairing_id == @curr_id").iloc[0]['pairings']),
            'expected_pairing_count': math.ceil(n / 2),
            'pairs_hist_count': len(pairs_history.query("group_pairing_id == @curr_id")),
            'expected_pairs_hist_count': math.ceil(n / 2),
            'affinity_count': len(affinities_history.query("group_pairing_id == @curr_id")),
            'expected_affinity_count': n * (n - 1)
        }, ignore_index=True)
        
invalid_group_pairings.sort_values('conditions_failed', inplace=True)
invalid_group_pairings.reset_index(drop=True, inplace=True)
invalid_group_pairings.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for curr_id in tqdm(group_pairing_ids):


HBox(children=(FloatProgress(value=0.0, max=686.0), HTML(value='')))




Unnamed: 0,group_pairing_id,group_id,pairing_id,conditions_failed,user_count,task_count,expected_task_count,pairing_count,expected_pairing_count,pairs_hist_count,expected_pairs_hist_count,affinity_count,expected_affinity_count
0,PeACz8rabRn9BJRJc-3ksC39hwSs3ZzG9B8,PeACz8rabRn9BJRJc,3ksC39hwSs3ZzG9B8,"[1, 2]",14.0,14.0,14.0,8.0,7.0,8.0,7.0,109.0,182.0
1,Pj9mwxXYPKxjPaxST-mgoxZfxyary28EXFn,Pj9mwxXYPKxjPaxST,mgoxZfxyary28EXFn,"[1, 2]",8.0,8.0,8.0,5.0,4.0,5.0,4.0,48.0,56.0
2,9mdkMmj4pY8Q2TwqF-P8SGjH7mLk9FxCf45,9mdkMmj4pY8Q2TwqF,P8SGjH7mLk9FxCf45,"[1, 2]",12.0,12.0,12.0,7.0,6.0,7.0,6.0,132.0,132.0
3,9mdkMmj4pY8Q2TwqF-WNJjNxKHc5ubX2jwc,9mdkMmj4pY8Q2TwqF,WNJjNxKHc5ubX2jwc,"[1, 2]",16.0,16.0,16.0,9.0,8.0,9.0,8.0,234.0,240.0
4,BibLRuKtNNv7QEDqb-evGbShCDMHxvCBABe,BibLRuKtNNv7QEDqb,evGbShCDMHxvCBABe,"[1, 2]",6.0,6.0,6.0,4.0,3.0,4.0,3.0,29.0,30.0


In [17]:
invalid_group_pairings[invalid_group_pairings.conditions_failed.apply(lambda x: '3' in x)].head()

Unnamed: 0,group_pairing_id,group_id,pairing_id,conditions_failed,user_count,task_count,expected_task_count,pairing_count,expected_pairing_count,pairs_hist_count,expected_pairs_hist_count,affinity_count,expected_affinity_count
21,fduEdDA8nk5ybcYze-9deZN5msXPWbRbaty,fduEdDA8nk5ybcYze,9deZN5msXPWbRbaty,"[1, 2, 3]",3.0,0.0,3.0,3.0,2.0,3.0,2.0,6.0,6.0
22,cToFEbgXcFbrKsSrj-kTtxafCNsKoqsnhCt,cToFEbgXcFbrKsSrj,kTtxafCNsKoqsnhCt,"[1, 2, 3]",14.0,7.0,14.0,8.0,7.0,8.0,7.0,131.0,182.0
23,fduEdDA8nk5ybcYze-vyrrtwEpzzRqCACxi,fduEdDA8nk5ybcYze,vyrrtwEpzzRqCACxi,"[1, 2, 3]",3.0,0.0,3.0,3.0,2.0,3.0,2.0,6.0,6.0
24,FmHT4pnqrPXzCLCmE-EbjzaxZ6wADuYQKqq,FmHT4pnqrPXzCLCmE,EbjzaxZ6wADuYQKqq,"[1, 2, 3]",2.0,0.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0
25,9mdkMmj4pY8Q2TwqF-Q2Qffh22oZwWrRCWt,9mdkMmj4pY8Q2TwqF,Q2Qffh22oZwWrRCWt,"[1, 2, 3]",8.0,32.0,8.0,5.0,4.0,20.0,4.0,56.0,56.0


## Filter out invalid pairing sessions

In [18]:
def remove_invalid_sessions(df, df_name, exclusion_list):
    """
    Removes invalid group-pairing session ids from dataframe.
    
    Input:
        df (pandas dataframe): dataframe to remove group-pairing sessions from.
        df_name (string): name of data frame.
        exclusion_list (list of strings): group_pairing_ids to remove from df.
        
    Output:
        (pandas dataframe): cleaned dataframe
    """
    # track size for printing
    orig_size = len(df)
    new_size = 0
    
    # clean data
    df = df[~df['group_pairing_id'].isin(exclusion_list)]
    df.reset_index(drop=True, inplace=True)
    new_size = len(df)
    
    # print change in size
    print('{} Cleaning \nOrig Size: {} ==> New Size: {}'.format(df_name, orig_size, new_size), end='\n\n')
    
    # return cleaned df
    return df

In [19]:
invalid_group_pairing_ids_list = list(invalid_group_pairings['group_pairing_id'].unique())

In [20]:
tasks_history = remove_invalid_sessions(tasks_history, 'tasks_history', invalid_group_pairing_ids_list)
pairings = remove_invalid_sessions(pairings, 'pairings', invalid_group_pairing_ids_list)
pairs_history = remove_invalid_sessions(pairs_history, 'pairs_history', invalid_group_pairing_ids_list)
affinities_history = remove_invalid_sessions(affinities_history, 'affinities_history', invalid_group_pairing_ids_list)

tasks_history Cleaning 
Orig Size: 5208 ==> New Size: 4686

pairings Cleaning 
Orig Size: 686 ==> New Size: 458

pairs_history Cleaning 
Orig Size: 3592 ==> New Size: 2470

affinities_history Cleaning 
Orig Size: 62914 ==> New Size: 45732



## Add unique sequence number to each pairing session for each group
For example, first session for DTR is 1, second is 2, etc.

In [21]:
# extract sorted pairing ids
pairings_id_df = pairs_history[['groupId', 'pairingId', 'timestamp']].drop_duplicates(subset=['groupId', 'pairingId'], keep='first').sort_values(['groupId', 'timestamp']).reset_index(drop=True)
pairings_id_df['group_pairing_id'] = pairings_id_df['groupId'] + '-' + pairings_id_df['pairingId']
pairings_id_df['pairing_session_index'] = 0

# add index label for each
for groupId in pairings_id_df['groupId'].unique():
    n_row = len(pairings_id_df[pairings_id_df['groupId'] == groupId])
    pairings_id_df.loc[pairings_id_df.groupId == groupId, 'pairing_session_index'] = range(1, n_row + 1)

# show top 5 frows
pairings_id_df.head()

Unnamed: 0,groupId,pairingId,timestamp,group_pairing_id,pairing_session_index
0,2rFoGTfRa9LFdpQNA,A6d3rQwrRZHEz4qHu,2017-08-22 17:19:36.847,2rFoGTfRa9LFdpQNA-A6d3rQwrRZHEz4qHu,1
1,2rFoGTfRa9LFdpQNA,pNFhXhotBwQ6Z79Md,2017-08-22 21:54:05.882,2rFoGTfRa9LFdpQNA-pNFhXhotBwQ6Z79Md,2
2,2rFoGTfRa9LFdpQNA,eSXY7BuRX3ZhXX627,2017-10-24 20:56:43.001,2rFoGTfRa9LFdpQNA-eSXY7BuRX3ZhXX627,3
3,2rFoGTfRa9LFdpQNA,SwhcfsdjNCZcyzx3t,2017-11-28 21:48:06.568,2rFoGTfRa9LFdpQNA-SwhcfsdjNCZcyzx3t,4
4,2rFoGTfRa9LFdpQNA,SpiKfuqCoEZRLfDNK,2018-01-16 21:42:19.584,2rFoGTfRa9LFdpQNA-SpiKfuqCoEZRLfDNK,5


In [22]:
# add pairing index to earlier data frames
tasks_history = tasks_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
pairings = pairings.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
pairs_history = pairs_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])
affinities_history = affinities_history.merge(pairings_id_df[['group_pairing_id', 'pairing_session_index']], on=['group_pairing_id'])

# Summary Stats

## Groups

In [23]:
# number of unique groups
print('Number of unique groups: {}'.format(len(groups)))

Number of unique groups: 103


In [24]:
# total number of pairing sessions
print('Number of pairing sessions: {}'.format(len(pairs_history['group_pairing_id'].unique())))

Number of pairing sessions: 458


In [25]:
# groups that have participated in a pairing session
print('Number of unique groups who have participated in at least one session: {}'.format(len(pairs_history['groupId'].unique())))

Number of unique groups who have participated in at least one session: 36


In [26]:
# groups by number of pairing sessions
pairing_count_bygroup = pairs_history.drop_duplicates('group_pairing_id').groupby('groupId')['groupId'].size().to_frame(name = 'pairingCount').reset_index()
pairing_count_bygroup_named = pairing_count_bygroup.merge(groups, left_on='groupId', right_on='_id')[['groupId', 'groupName', 'description', 'creatorName', 'pairingCount']]
pairing_count_bygroup_named.sort_values('pairingCount', ascending=False, inplace=True)

print('Number of pairing sessions by group')
pairing_count_bygroup_named.head()

Number of pairing sessions by group


Unnamed: 0,groupId,groupName,description,creatorName,pairingCount
6,9mdkMmj4pY8Q2TwqF,Delta Lab,an interdisciplinary research group and design studio at Northwestern University,Haoqi Zhang,163
31,sM3z5FkZfsABqcj3g,"Design, Technology, and Research",DTR,Haoqi Zhang,92
8,BibLRuKtNNv7QEDqb,KIXLAB,The KAIST Interaction Lab,Juho Kim,84
23,cToFEbgXcFbrKsSrj,IDEAL,Wei Chen's research lab,Wei Chen,25
16,PeACz8rabRn9BJRJc,AY17 MSC 538-0,Graduate Class: Workplace Learning & Communities of Practice 2017,Amy Hauenstein,15


In [27]:
# distribution of pairings per group
pairing_count_bygroup_named['pairingCount'].describe()

count    36.000000 
mean     12.722222 
std      32.735835 
min      1.000000  
25%      1.000000  
50%      2.000000  
75%      5.000000  
max      163.000000
Name: pairingCount, dtype: float64

In [28]:
# separate out groups and group creators for labeling
temp_groups = groups[['_id', 'groupName', 'description', 'creatorName']].rename(columns={'_id': 'groupId'})                      
all_group_counts = pairing_count_bygroup_named.merge(temp_groups,
                                                     how='outer',
                                                     on=['groupId', 'groupName', 'description', 'creatorName'])
all_group_counts.fillna(0, inplace=True)
all_group_counts.rename(columns={
    'groupId': 'Unique Identifier',
    'groupName': 'Group Name',
    'description': 'Group Description',
    'creatorName': 'Group Creator',
    'pairingCount': 'Number of Pairing Sessions Held'
}, inplace=True)


# show groups and write to csv
all_group_counts
all_group_counts.to_csv('./documents/groups_pairing-counts.csv', index=False)

## Individuals

In [29]:
# number of unique accounts
print('Number of Unique users: {}'.format(len(users)))

Number of Unique users: 1180


In [30]:
# number of unique users who have paricipated in at least one session
print('Number of Unique Users who have participated in pairings: {}'.format(len(set(pairs_history['firstUserId'].unique()).union(set(pairs_history['secondUserId'].unique())))))

Number of Unique Users who have participated in pairings: 399


In [31]:
# get individuals users and their pairing sessions
pairs_firstusers = pairs_history[['firstUserId', 'group_pairing_id']].rename(columns={'firstUserId': 'userId'})
pairs_secondusers = pairs_history[['secondUserId', 'group_pairing_id']].rename(columns={'secondUserId': 'userId'})

# aggregate
pairs_usergroups = pd.concat([pairs_firstusers, pairs_secondusers]).dropna().drop_duplicates()
user_pairing_count = pairs_usergroups.groupby('userId')['group_pairing_id'].size().to_frame(name = 'pairingCount').reset_index()

# add in people's names
user_pairing_count_named = user_pairing_count.merge(users, left_on='userId', right_on='_id')[['userId', 'profile', 'pairingCount']]
user_pairing_count_named.sort_values('pairingCount', ascending=False, inplace=True)

print('Number of pairing sessions by user')
user_pairing_count_named.head()

Number of pairing sessions by user


Unnamed: 0,userId,profile,pairingCount
213,aNdSTecskgeAm2St5,"{'fullName': 'Leesha', 'avatar': 'https://imgbin.com/png/wKHccVQ4/pac-man-cherry-post-it-note-t-shirt-sticker-png'}",152
378,zmwK4tJHtwLw8pLRC,"{'fullName': 'Garrett', 'avatar': 'https://cdn2.scratch.mit.edu/get_image/gallery/2090239_170x100.png'}",142
200,Z68eenXDt84czPkmN,"{'fullName': 'Ryan Louie', 'avatar': 'https://farm9.staticflickr.com/8281/7630259612_1e199cd81a.jpg'}",140
292,mdhFQ6PNiAhfP7ce2,{'fullName': 'Kapil Garg'},131
83,EDEFWcagLwCfXP5Jg,"{'fullName': 'Yongsung Kim', 'avatar': ''}",130


In [32]:
# distribution of pairings per user
user_pairing_count_named['pairingCount'].describe()

count    384.000000
mean     11.851562 
std      22.518257 
min      1.000000  
25%      1.000000  
50%      3.000000  
75%      12.000000 
max      152.000000
Name: pairingCount, dtype: float64

# Isolate DTR Pairing Data

In [33]:
dtr_group_id = 'sM3z5FkZfsABqcj3g'

In [34]:
# create initial filters on the data for DTR only
dtr_pairs_history = pairs_history.query("groupId == @dtr_group_id").reset_index(drop=True)
dtr_pairs_history = dtr_pairs_history[['group_pairing_id',
                                       'firstUserId', 'firstUserName', 'secondUserId', 'secondUserName', 'pairing_session_index']]

dtr_task_history = tasks_history.query("groupId == @dtr_group_id").drop_duplicates(['task', 'userId', 'group_pairing_id']).reset_index(drop=True)

dtr_affinities_history = affinities_history.query("groupId == @dtr_group_id").reset_index(drop=True)
dtr_affinities_history = dtr_affinities_history[['group_pairing_id', 'helpeeId', 'helperId', 'value', 'pairing_session_index']]

In [35]:
# duplicate pairs data for non-nan entries
dtr_pairs_history_dup = dtr_pairs_history[~dtr_pairs_history['secondUserId'].isnull()]
dtr_pairs_history_dup = dtr_pairs_history_dup[['group_pairing_id', 'secondUserId', 'secondUserName', 'firstUserId', 'firstUserName', 'pairing_session_index']]
dtr_pairs_history_dup.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']

# combine back with original dtr_pairs_history df
dtr_pairs_history.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']
dtr_pairs_history = dtr_pairs_history.append(dtr_pairs_history_dup, ignore_index=True)

In [36]:
dtr_pairs_history = dtr_pairs_history.sort_values(['pairing_session_index', 'group_pairing_id', 'helperId']).reset_index(drop=True)
dtr_pairs_history.head(10)

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index
0,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,6rZbYn3cbQ9KNLRM5,Meg Grasse,aNdSTecskgeAm2St5,Leesha,1
1,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,EDEFWcagLwCfXP5Jg,Yongsung Kim,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,1
2,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,N3LsK5NJeKm8PkBx8,Allison Sun,mdhFQ6PNiAhfP7ce2,Kapil Garg,1
3,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,EDEFWcagLwCfXP5Jg,Yongsung Kim,1
4,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,TFtNL3sYbSSGykQJE,Ryan Madden,aupdNzYu8WmNEi4e5,Alex Kaldjian,1
5,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,XjH8WJxEXFgTwdi3o,Sameer Srivastava,nDHZGzczDWyqvyFhp,Sarah Lim,1
6,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,aNdSTecskgeAm2St5,Leesha,6rZbYn3cbQ9KNLRM5,Meg Grasse,1
7,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,aupdNzYu8WmNEi4e5,Alex Kaldjian,TFtNL3sYbSSGykQJE,Ryan Madden,1
8,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,cupTmXWNEZ2N7vxPk,Katie George,iEHKgJBH7hNSroEjw,Greg Kim,1
9,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,iEHKgJBH7hNSroEjw,Greg Kim,cupTmXWNEZ2N7vxPk,Katie George,1


In [37]:
# add task requests made by the helpee
dtr_pairs_tasks = dtr_pairs_history.merge(dtr_task_history[['group_pairing_id', 'userId', 'task', 'pairing_session_index']],
                                          left_on=['group_pairing_id', 'helpeeId', 'pairing_session_index'],
                                          right_on=['group_pairing_id', 'userId', 'pairing_session_index'],
                                          how='left')
dtr_pairs_tasks['paired'] = True
del dtr_pairs_tasks['userId']
dtr_pairs_tasks.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,task,paired
0,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,6rZbYn3cbQ9KNLRM5,Meg Grasse,aNdSTecskgeAm2St5,Leesha,1,I need to think through & implement test cases for automated runner tracking.,True
1,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,EDEFWcagLwCfXP5Jg,Yongsung Kim,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,1,Scaffolding feedback,True
2,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,N3LsK5NJeKm8PkBx8,Allison Sun,mdhFQ6PNiAhfP7ce2,Kapil Garg,1,Finish up scenarios and study design,True
3,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,Rse39xrxtP6xRHsbK,AlainaKafkes2017@u.northwestern.edu,EDEFWcagLwCfXP5Jg,Yongsung Kim,1,i need help with my fellowship research statement,True
4,sM3z5FkZfsABqcj3g-QrbK5SDrfk75E67qj,TFtNL3sYbSSGykQJE,Ryan Madden,aupdNzYu8WmNEi4e5,Alex Kaldjian,1,coming up with prototyping examples for exploratory study,True


In [38]:
# add helper's ability to help. keep all affinities even if people weren't actually paired (how=right; for only paired people, use how='left')
dtr_pairs_tasks_affinities = dtr_pairs_tasks.merge(dtr_affinities_history,
                                                   left_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   right_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   how='right')

# fill with 0 affinity if person has no partner
dtr_pairs_tasks_affinities.loc[(~dtr_pairs_tasks_affinities['helpeeId'].isnull()) & (dtr_pairs_tasks_affinities['value'].isnull()), 'value'] = 0

# set paired to false for any unparied people (i.e., all rated affinties for people who weren't paired)
dtr_pairs_tasks_affinities.loc[dtr_pairs_tasks_affinities.paired.isnull(), 'paired'] = False

# fill blank names
dtr_name_dict = {row['userId']: row['name'] for index, row in dtr_task_history[['userId',  'name']].iterrows()}
dtr_pairs_tasks_affinities['helperName'] = dtr_pairs_tasks_affinities['helperName'].fillna(dtr_pairs_tasks_affinities['helperId']).replace(dtr_name_dict)
dtr_pairs_tasks_affinities['helpeeName'] = dtr_pairs_tasks_affinities['helpeeName'].fillna(dtr_pairs_tasks_affinities['helpeeId']).replace(dtr_name_dict)

# fill in blank tasks
dtr_pairs_tasks_affinities = dtr_pairs_tasks_affinities.merge(dtr_task_history[['group_pairing_id', 'userId', 'task']],
                                                              left_on=['group_pairing_id', 'helpeeId'],
                                                              right_on=['group_pairing_id', 'userId'])
del dtr_pairs_tasks_affinities['task_x']
dtr_pairs_tasks_affinities.rename(columns={'task_y': 'task'}, inplace=True)

# sort table and print
dtr_pairs_tasks_affinities.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,paired,value,userId,task
0,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,5EJncSsf5AStw6maE,Nneoma Oradiegwu,3cb7eEv3gY3xiBCk7,Daniel Zhu,24,False,0.0,3cb7eEv3gY3xiBCk7,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?
1,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,9Wcpa2nSnEX8k7kMN,Maggie Lou,3cb7eEv3gY3xiBCk7,Daniel Zhu,24,False,0.66,3cb7eEv3gY3xiBCk7,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?
2,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,9gcjpT4daYZmZDi2F,Armaan Shah,3cb7eEv3gY3xiBCk7,Daniel Zhu,24,False,0.33,3cb7eEv3gY3xiBCk7,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?
3,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,HnvDuWqAnAqHbXxLT,Sanfeng Wang,3cb7eEv3gY3xiBCk7,Daniel Zhu,24,True,0.66,3cb7eEv3gY3xiBCk7,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?
4,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,LG96v2nkrkJhWstC4,Olivia Barnett,3cb7eEv3gY3xiBCk7,Daniel Zhu,24,False,0.0,3cb7eEv3gY3xiBCk7,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?


In [39]:
# replace values with 1 - 5
value_mappings = {
    '-1.0': 1,
    '0.0':  2,
    '0.33': 3,
    '0.66': 4,
    '1.0':  5
}

dtr_pairs_tasks_affinities['value'] = dtr_pairs_tasks_affinities['value'].astype(str)
dtr_pairs_tasks_affinities.replace({'value': value_mappings}, inplace=True)

# replace names
name_mappings = {
    'richardhuang2019@u.northwestern.edu': 'Richard Huang',
    'AlainaKafkes2017@u.northwestern.edu': 'Alaina Kafkes',
    'judylee2021@u.northwestern.edu': 'Judy Lee',
    'Leesha': 'Leesha Maliakal',
    'andrew': 'Andrew Finke',
    'Garrett': 'Garrett Hedman'
}
dtr_pairs_tasks_affinities.replace({'helperName': name_mappings, 'helpeeName': name_mappings}, inplace=True)

# more intutive column names
dtr_pairs_tasks_affinities.rename(columns={'task': 'helpeeRequest', 'value': 'helperAbilityToHelp'}, inplace=True)

# reorder columns
dtr_pairs_tasks_affinities = dtr_pairs_tasks_affinities[['group_pairing_id', 'pairing_session_index', 'helperId', 'helperName',
                                                         'helpeeId', 'helpeeName', 'helpeeRequest', 'helperAbilityToHelp', 'paired']]

In [40]:
# remove nans for initial analysis, and save out csv
dtr_pairs_tasks_affinities.dropna(how='any',axis=0).to_csv('./documents/dtr-pair-research-pairings.csv',
                                                           index=False)

In [41]:
dtr_pairs_tasks_affinities

Unnamed: 0,group_pairing_id,pairing_session_index,helperId,helperName,helpeeId,helpeeName,helpeeRequest,helperAbilityToHelp,paired
0,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,24,5EJncSsf5AStw6maE,Nneoma Oradiegwu,3cb7eEv3gY3xiBCk7,Daniel Zhu,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?,2,False
1,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,24,9Wcpa2nSnEX8k7kMN,Maggie Lou,3cb7eEv3gY3xiBCk7,Daniel Zhu,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?,4,False
2,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,24,9gcjpT4daYZmZDi2F,Armaan Shah,3cb7eEv3gY3xiBCk7,Daniel Zhu,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?,3,False
3,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,24,HnvDuWqAnAqHbXxLT,Sanfeng Wang,3cb7eEv3gY3xiBCk7,Daniel Zhu,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?,4,True
4,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,24,LG96v2nkrkJhWstC4,Olivia Barnett,3cb7eEv3gY3xiBCk7,Daniel Zhu,Could someone go through my research canvas with me and give me a critique of my design arguments + gaps I should fill in?,2,False
...,...,...,...,...,...,...,...,...,...
17614,sM3z5FkZfsABqcj3g-zxRFfvAsSTD9MQGWa,91,fSsiAiF3qdkhdgYqk,Kieran Bondy,zmwK4tJHtwLw8pLRC,Garrett Hedman,Ask me to explain my research to you (so I can figure out how to best structure an explanation of my research) :D,1,False
17615,sM3z5FkZfsABqcj3g-zxRFfvAsSTD9MQGWa,91,ia6Mh9bm7Wg3K4pkc,Mary Truong,zmwK4tJHtwLw8pLRC,Garrett Hedman,Ask me to explain my research to you (so I can figure out how to best structure an explanation of my research) :D,5,False
17616,sM3z5FkZfsABqcj3g-zxRFfvAsSTD9MQGWa,91,jxctk3DqhWorsPMXS,Natalie Ghidali,zmwK4tJHtwLw8pLRC,Garrett Hedman,Ask me to explain my research to you (so I can figure out how to best structure an explanation of my research) :D,5,False
17617,sM3z5FkZfsABqcj3g-zxRFfvAsSTD9MQGWa,91,mdhFQ6PNiAhfP7ce2,Kapil Garg,zmwK4tJHtwLw8pLRC,Garrett Hedman,Ask me to explain my research to you (so I can figure out how to best structure an explanation of my research) :D,4,False


## Summary Stats from DTR

In [42]:
# number of pairing sessions
print('Number of Pairing Sessions: {}'.format(len(dtr_pairs_history['group_pairing_id'].unique())))

Number of Pairing Sessions: 92


In [43]:
# number of unique users
print('Number of Unique Users: {}'.format(len(set(dtr_pairs_history['helperId'].unique()).union(set(dtr_pairs_history['helpeeId'].unique())))))

Number of Unique Users: 74


In [44]:
# total number of tasks
print('Total number of tasks: {}'.format(len(dtr_task_history)))

Total number of tasks: 1356


In [45]:
# number of users/tasks per session
dtr_task_history.groupby('group_pairing_id')['userId'].count().describe()

count    92.000000
mean     14.739130
std      4.113418 
min      1.000000 
25%      12.000000
50%      14.000000
75%      17.000000
max      24.000000
Name: userId, dtype: float64

In [46]:
# total number of specified affinities
print('Total number of affinities: {}'.format(len(dtr_affinities_history)))

Total number of affinities: 17619


In [47]:
# number of affinities per session
dtr_affinities_history.groupby(['group_pairing_id'])['value'].count().describe()

count    91.000000 
mean     193.615385
std      101.461516
min      14.000000 
25%      118.500000
50%      170.000000
75%      254.500000
max      535.000000
Name: value, dtype: float64

In [48]:
# number of affinities per user per session
dtr_affinities_history.groupby(['group_pairing_id', 'helperId'])['value'].count().describe()

count    1314.000000
mean     13.408676  
std      4.416574   
min      1.000000   
25%      11.000000  
50%      13.000000  
75%      16.000000  
max      23.000000  
Name: value, dtype: float64

# Isolate Delta Lab Data

In [49]:
delta_group_id = '9mdkMmj4pY8Q2TwqF'

In [50]:
# create initial filters on the data for Delta only
delta_pairs_history = pairs_history.query("groupId == @delta_group_id").reset_index(drop=True)
delta_pairs_history = delta_pairs_history[['group_pairing_id',
                                       'firstUserId', 'firstUserName', 'secondUserId', 'secondUserName', 'pairing_session_index']]

delta_task_history = tasks_history.query("groupId == @delta_group_id").drop_duplicates(['task', 'userId', 'group_pairing_id']).reset_index(drop=True)

delta_affinities_history = affinities_history.query("groupId == @delta_group_id").reset_index(drop=True)
delta_affinities_history = delta_affinities_history[['group_pairing_id', 'helpeeId', 'helperId', 'value', 'pairing_session_index']]

In [51]:
# duplicate pairs data for non-nan entries
delta_pairs_history_dup = delta_pairs_history[~delta_pairs_history['secondUserId'].isnull()]
delta_pairs_history_dup = delta_pairs_history_dup[['group_pairing_id', 'secondUserId', 'secondUserName', 'firstUserId', 'firstUserName', 'pairing_session_index']]
delta_pairs_history_dup.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']

# combine back with original delta_pairs_history df
delta_pairs_history.columns = ['group_pairing_id', 'helperId', 'helperName', 'helpeeId', 'helpeeName', 'pairing_session_index']
delta_pairs_history = delta_pairs_history.append(delta_pairs_history_dup, ignore_index=True)

In [52]:
delta_pairs_history = delta_pairs_history.sort_values(['pairing_session_index', 'group_pairing_id', 'helperId']).reset_index(drop=True)
delta_pairs_history.head(10)

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index
0,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,57MnWENtTDkXRYhcL,Gulu,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1
1,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,6iR9Z64HEJDcD8qbu,Matt Easterday,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,1
2,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,EDEFWcagLwCfXP5Jg,Yongsung Kim,,,1
3,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,JaEySKdKKg7LAF3Yg,Eureka Foong,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,1
4,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,KYnkykoMwd9fbBbWB,Julie Hui,aNdSTecskgeAm2St5,Leesha,1
5,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,6iR9Z64HEJDcD8qbu,Matt Easterday,1
6,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,aNdSTecskgeAm2St5,Leesha,KYnkykoMwd9fbBbWB,Julie Hui,1
7,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,JaEySKdKKg7LAF3Yg,Eureka Foong,1
8,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,zBZSGgrZFfW5KH5vj,Natalia Smirnov,57MnWENtTDkXRYhcL,Gulu,1
9,9mdkMmj4pY8Q2TwqF-soiecrpv6CRPTqmkd,PavTL8zD9664wvtfB,Haoqi Zhang,,,2


In [53]:
# add task requests made by the helpee
delta_pairs_tasks = delta_pairs_history.merge(delta_task_history[['group_pairing_id', 'userId', 'task', 'pairing_session_index']],
                                          left_on=['group_pairing_id', 'helpeeId', 'pairing_session_index'],
                                          right_on=['group_pairing_id', 'userId', 'pairing_session_index'],
                                          how='left')
delta_pairs_tasks['paired'] = True
del delta_pairs_tasks['userId']
delta_pairs_tasks.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,task,paired
0,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,57MnWENtTDkXRYhcL,Gulu,zBZSGgrZFfW5KH5vj,Natalia Smirnov,1,Sit next to me while I fix revisions on my paper to resubmit to a journal :],True
1,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,6iR9Z64HEJDcD8qbu,Matt Easterday,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,1,Read CSCW R&R and edit it -- also find citations that back up our proxy baseline for the study,True
2,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,EDEFWcagLwCfXP5Jg,Yongsung Kim,,,1,,True
3,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,JaEySKdKKg7LAF3Yg,Eureka Foong,gynuaAvfp3gAd4Gyo,eharburg@gmail.com,1,Help me revise my lit for CheerOn,True
4,9mdkMmj4pY8Q2TwqF-N23iLvjp2GWcsHYd5,KYnkykoMwd9fbBbWB,Julie Hui,aNdSTecskgeAm2St5,Leesha,1,"I need help testing CrowdCheer outside! You just have to stand there and pretend to be a spectator, then give me feedback on the interaction.",True


In [54]:
# add helper's ability to help. keep all affinities even if people weren't actually paired (how=right; for only paired people, use how='left')
delta_pairs_tasks_affinities = delta_pairs_tasks.merge(delta_affinities_history,
                                                   left_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   right_on=['group_pairing_id', 'helperId', 'helpeeId', 'pairing_session_index'],
                                                   how='right')

# fill with 0 affinity if person has no partner
delta_pairs_tasks_affinities.loc[(~delta_pairs_tasks_affinities['helpeeId'].isnull()) & (delta_pairs_tasks_affinities['value'].isnull()), 'value'] = 0

# set paired to false for any unparied people (i.e., all rated affinties for people who weren't paired)
delta_pairs_tasks_affinities.loc[delta_pairs_tasks_affinities.paired.isnull(), 'paired'] = False

# fill blank names
delta_name_dict = {row['userId']: row['name'] for index, row in delta_task_history[['userId',  'name']].iterrows()}
delta_pairs_tasks_affinities['helperName'] = delta_pairs_tasks_affinities['helperName'].fillna(delta_pairs_tasks_affinities['helperId']).replace(delta_name_dict)
delta_pairs_tasks_affinities['helpeeName'] = delta_pairs_tasks_affinities['helpeeName'].fillna(delta_pairs_tasks_affinities['helpeeId']).replace(delta_name_dict)

# fill in blank tasks
delta_pairs_tasks_affinities = delta_pairs_tasks_affinities.merge(delta_task_history[['group_pairing_id', 'userId', 'task']],
                                                              left_on=['group_pairing_id', 'helpeeId'],
                                                              right_on=['group_pairing_id', 'userId'])
del delta_pairs_tasks_affinities['task_x']
delta_pairs_tasks_affinities.rename(columns={'task_y': 'task'}, inplace=True)

# sort table and print
delta_pairs_tasks_affinities.head()

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,pairing_session_index,paired,value,userId,task
0,9mdkMmj4pY8Q2TwqF-2Kfo9toi7yxEoB5sk,CQhpnQnrAQMZhD9tM,Nell O'Rourke,6iR9Z64HEJDcD8qbu,Matt Easterday,45,True,1.0,6iR9Z64HEJDcD8qbu,prepare talk slides on network improvement and Loft
1,9mdkMmj4pY8Q2TwqF-2Kfo9toi7yxEoB5sk,EDEFWcagLwCfXP5Jg,Yongsung Kim,6iR9Z64HEJDcD8qbu,Matt Easterday,45,False,-1.0,6iR9Z64HEJDcD8qbu,prepare talk slides on network improvement and Loft
2,9mdkMmj4pY8Q2TwqF-2Kfo9toi7yxEoB5sk,JaEySKdKKg7LAF3Yg,Eureka Foong,6iR9Z64HEJDcD8qbu,Matt Easterday,45,False,0.66,6iR9Z64HEJDcD8qbu,prepare talk slides on network improvement and Loft
3,9mdkMmj4pY8Q2TwqF-2Kfo9toi7yxEoB5sk,KYnkykoMwd9fbBbWB,Julie Hui,6iR9Z64HEJDcD8qbu,Matt Easterday,45,False,1.0,6iR9Z64HEJDcD8qbu,prepare talk slides on network improvement and Loft
4,9mdkMmj4pY8Q2TwqF-2Kfo9toi7yxEoB5sk,MJkj24zXWKhnZQCc3,Daniel George Rees Lewis,6iR9Z64HEJDcD8qbu,Matt Easterday,45,False,1.0,6iR9Z64HEJDcD8qbu,prepare talk slides on network improvement and Loft


In [55]:
# replace values with 1 - 5
value_mappings = {
    '-1.0': 1,
    '0.0':  2,
    '0.33': 3,
    '0.66': 4,
    '1.0':  5
}

delta_pairs_tasks_affinities['value'] = delta_pairs_tasks_affinities['value'].astype(str)
delta_pairs_tasks_affinities.replace({'value': value_mappings}, inplace=True)

# replace names
name_mappings = {
    'Leesha': 'Leesha Maliakal',
    'Garrett': 'Garrett Hedman',
    'eharburg@gmail.com': 'Emily Harburg'
}
delta_pairs_tasks_affinities.replace({'helperName': name_mappings, 'helpeeName': name_mappings}, inplace=True)

# more intutive column names
delta_pairs_tasks_affinities.rename(columns={'task': 'helpeeRequest', 'value': 'helperAbilityToHelp'}, inplace=True)

# reorder columns
delta_pairs_tasks_affinities = delta_pairs_tasks_affinities[['group_pairing_id', 'pairing_session_index', 'helperId', 'helperName',
                                                         'helpeeId', 'helpeeName', 'helpeeRequest', 'helperAbilityToHelp', 'paired']]

In [56]:
# remove nans for initial analysis, and save out csv
delta_pairs_tasks_affinities.dropna(how='any',axis=0).to_csv('./documents/delta-pair-research-pairings.csv',
                                                           index=False)

## Summary Stats from Delta

In [57]:
# number of pairing sessions
print('Number of Pairing Sessions: {}'.format(len(delta_pairs_history['group_pairing_id'].unique())))

Number of Pairing Sessions: 163


In [58]:
# number of unique users
print('Number of Unique Users: {}'.format(len(set(delta_pairs_history['helperId'].unique()).union(set(delta_pairs_history['helpeeId'].unique())))))

Number of Unique Users: 39


In [59]:
# total number of tasks
print('Total number of tasks: {}'.format(len(delta_task_history)))

Total number of tasks: 1498


In [60]:
# number of users/tasks per session
delta_task_history.groupby('group_pairing_id')['userId'].count().describe()

count    163.000000
mean     9.190184  
std      2.932475  
min      1.000000  
25%      6.500000  
50%      10.000000 
75%      11.000000 
max      16.000000 
Name: userId, dtype: float64

In [61]:
# total number of specified affinities
print('Total number of affinities: {}'.format(len(delta_affinities_history)))

Total number of affinities: 13092


In [62]:
# number of affinities per session
delta_affinities_history.groupby(['group_pairing_id'])['value'].count().describe()

count    161.000000
mean     81.316770 
std      48.216494 
min      1.000000  
25%      41.000000 
50%      84.000000 
75%      110.000000
max      234.000000
Name: value, dtype: float64

In [63]:
# number of affinities per user per session
delta_affinities_history.groupby(['group_pairing_id', 'helperId'])['value'].count().describe()

count    1482.000000
mean     8.834008   
std      2.726216   
min      1.000000   
25%      7.000000   
50%      9.000000   
75%      10.000000  
max      15.000000  
Name: value, dtype: float64