## Load libraries and global functions

In [1]:
# analysis
import collections
import random
import math
import pandas as pd
import numpy as np

In [2]:
# data loading
from copy import deepcopy
from pymongo import MongoClient

In [3]:
# code performance
import time
import multiprocessing as mp

## Fetch Pairing Data from [pairresearch.io](http://pairresearch.io/)

In [4]:
# Connect to the hosted pair-research MongoDB instance.
# NOTE(review): the connection string embeds a username and password in the
# notebook source; consider loading it from an environment variable or a
# secrets store before sharing this notebook.
uri = 'mongodb://delta:delta@ds011419.mlab.com:11419/pair-research'
dbName = 'pair-research'
client = MongoClient(uri)
db = client[dbName]

# List the collections available in the database.
# `Database.collection_names()` is deprecated (and removed in PyMongo 4);
# `list_collection_names()` is the supported replacement.
db.list_collection_names()

  """


['affinities',
 'meteor_accounts_loginServiceConfiguration',
 'tasks_history',
 'groups',
 'users',
 'objectlabs-system.admin.collections',
 'pairs_history',
 'tasks',
 'system.indexes',
 'pairings',
 'objectlabs-system',
 'affinities_history']

In [5]:
# Pull every group document into a DataFrame and remember how many there were.
groups = pd.DataFrame(list(db.groups.find({})))
groups_orig_size = len(groups)

# Groups created by these accounts are testing groups and are excluded.
group_creator_ignore_list = [
    'Demo Admin',
    'ykykykykykykykykykyk',
    'Stella',
    'Kevin Northwestern',
    'Kevin Chen',
    'Leesha',
    'Jennie',
]
created_by_ignored = groups['creatorName'].isin(group_creator_ignore_list)
group_ignore_ids = groups.loc[created_by_ignored, '_id'].unique()

# Keep only the groups whose id is not on the ignore list, with a fresh 0..n-1 index.
groups = groups.loc[~groups['_id'].isin(group_ignore_ids)].reset_index(drop=True)

# Ids of the surviving groups; used to filter the other collections.
valid_group_ids = groups['_id'].unique()

# Report how many groups were dropped.
groups_new_size = len(groups)
print('Number of Groups \nOriginal size: {} --> New size: {}'.format(groups_orig_size, groups_new_size))

# Preview the remaining groups.
groups.head()

Number of Groups 
Original size: 614 --> New size: 73


Unnamed: 0,_id,groupName,description,creatorId,creatorName,roles,creationDate,members,active,activePairing
0,uPLDbfFqqdHEEkgCT,Beatles,Rock and Roll Band,goGr47HDwtfphJ5xK,Julian Vicens,"[{'title': 'Guitar', '_id': 'oB3qMqXdTJNqR6vbZ...",2016-08-10 18:55:16.164,"[{'fullName': 'Julian Vicens', 'userId': 'goGr...",True,
1,Et46F6odTBmiFiDSZ,Knight Lab Testing,Knight Lab taking Pair Research for a spin,NtZ9hv3g6eLAwN2nY,Joe Germuska,"[{'title': 'Admin', '_id': 's2JKkhE9XC6GPW5ev'...",2016-07-18 21:21:54.117,"[{'fullName': 'Joe Germuska', 'userId': 'NtZ9h...",True,nnN46Abcc78AAtqKf
2,kY7xHo6c5m5tCiQMH,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': 'q3PJXDZpMMhcZB...",2016-09-28 19:17:10.709,"[{'fullName': 'Emily Withrow', 'userId': 'u2GA...",False,
3,KEo62WdN5WSkHa9Hh,Knight Lab Pair Research,Thursdays at 2:30,u2GAvznbx7Jbf97Hk,Emily Withrow,"[{'title': 'Professor', '_id': '6L6YwxgDwpqgoY...",2016-09-29 15:15:15.184,"[{'fullName': 'Emily Withrow', 'userId': 'u2GA...",False,
4,qPnf2DHHihugATnxD,Segal Design Cluster,an intellectual community for design faculty a...,PavTL8zD9664wvtfB,Haoqi Zhang,"[{'title': 'Professor', '_id': 'sSNgzD6So2kz95...",2016-11-10 18:38:04.379,"[{'fullName': 'Haoqi Zhang', 'userId': 'PavTL8...",True,52meFWjxGNoAqTJxx


In [6]:
# Pull every user document into a DataFrame and remember how many there were.
users = pd.DataFrame(list(db.users.find({})))
users_orig_size = len(users)

# A user is kept only if at least one of their group memberships is a valid group.
valid_group_ids_set = set(valid_group_ids)


def _belongs_to_valid_group(memberships):
    """Return True when any entry's 'groupId' is in `valid_group_ids_set`."""
    member_group_ids = {entry['groupId'] for entry in memberships}
    return not valid_group_ids_set.isdisjoint(member_group_ids)


users['valid_user'] = users['groups'].apply(_belongs_to_valid_group)

# Drop users that belong to no valid group (the original row index is kept).
users = users.loc[users['valid_user']]

# Report how many users were dropped.
users_new_size = len(users)
print('Number of Users \nOriginal size: {} --> New size: {}'.format(users_orig_size, users_new_size))

# Preview the remaining users.
users.head()

Number of Users 
Original size: 1041 --> New size: 966


Unnamed: 0,_id,createdAt,services,emails,profile,groups,valid_user
1,BPQ7hyoHgghctHPqq,2016-08-29 18:24:50.295,{'password': {'bcrypt': '$2a$10$1.nd.WyfVggPpg...,"[{'address': 'egerber@northwestern.edu', 'veri...",{'fullName': 'Liz Gerber'},"[{'groupId': '9mdkMmj4pY8Q2TwqF', 'role': {'_i...",True
2,bZEjadPH7KrjM9PfD,2016-11-10 19:19:34.147,{'password': {}},"[{'address': 'ampiper@northwestern.edu', 'veri...",{'fullName': 'ampiper@northwestern.edu'},"[{'groupId': 'qPnf2DHHihugATnxD', 'role': {'_i...",True
5,4nAboBfRx5RMJg68G,2017-03-27 14:33:17.771,{'password': {}},"[{'address': 'g-danko@northwestern.edu', 'veri...",{'fullName': 'g-danko@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'ti...",True
6,9iEAAD9Y54n4hMy3D,2017-03-27 14:39:27.572,{'password': {}},"[{'address': 'a-prachand@northwestern.edu', 'v...",{'fullName': 'a-prachand@northwestern.edu'},"[{'groupId': 'u4kjJC55DPMLpR8bC', 'role': {'ti...",True
8,Byki6KMawAsYnmr8x,2017-06-01 21:25:16.500,{'password': {}},"[{'address': 'bjoern@eecs.berkeley.edu', 'veri...",{'fullName': 'bjoern@eecs.berkeley.edu'},"[{'groupId': 'je9bo2hHLbYwWNtRd', 'role': {'_i...",True


In [7]:
# Pull the full task history and remember how many rows there were.
tasks_history = pd.DataFrame(list(db.tasks_history.find({})))
tasks_history_orig_size = len(tasks_history)

# Keep rows from valid groups only, then tag each row with a combined
# "<groupId>-<pairingId>" key.
tasks_history = (
    tasks_history
    .loc[tasks_history['groupId'].isin(valid_group_ids)]
    .reset_index(drop=True)
    .assign(group_pairing_id=lambda df: df['groupId'] + '-' + df['pairingId'])
)

# Report how many rows were dropped.
tasks_history_new_size = len(tasks_history)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_history_orig_size, tasks_history_new_size))

# Preview the filtered task history.
tasks_history.head()

Number of Tasks
Original size: 4119 --> New size: 4107


Unnamed: 0,_id,name,userId,groupId,task,pairingId,group_pairing_id
0,k4ewZSgDHsvDFkXpX,Yongsung Kim,EDEFWcagLwCfXP5Jg,9mdkMmj4pY8Q2TwqF,i need to send out a short-survey to interviewees,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
1,RZZWR8pABaJBKYNFu,Julian Vicens,goGr47HDwtfphJ5xK,9mdkMmj4pY8Q2TwqF,I would like to talk about different ways to m...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
2,Xr3dvNreiwzq9ixrQ,Spencer Carlson,vbsF64nAgoitwrNeB,9mdkMmj4pY8Q2TwqF,Make educated guesses about the quality of my ...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
3,dFpfXT8szHkp2pYgG,Leesha,aNdSTecskgeAm2St5,9mdkMmj4pY8Q2TwqF,I need help planning a latency handling featur...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL
4,zEMk9HQo9azvKzDye,Eureka Foong,JaEySKdKKg7LAF3Yg,9mdkMmj4pY8Q2TwqF,Installing a program using Terminal (I'm bad a...,nRAQpsPhsQs4zRvTL,9mdkMmj4pY8Q2TwqF-nRAQpsPhsQs4zRvTL


In [8]:
# Pull every pairing session and remember how many there were.
pairings = pd.DataFrame(list(db.pairings.find({})))
pairings_orig_size = len(pairings)

# Keep sessions from valid groups only and tag each with a combined
# "<groupId>-<_id>" key.
pairings = (
    pairings
    .loc[pairings['groupId'].isin(valid_group_ids)]
    .assign(group_pairing_id=lambda df: df['groupId'] + '-' + df['_id'])
    .reset_index(drop=True)
)

# Report how many sessions were dropped.
pairings_new_size = len(pairings)
print('Number of Pairing Sessions\nOriginal size: {} --> New size: {}'.format(pairings_orig_size, pairings_new_size))

# Summarise the remaining sessions, then show the most recent ones first.
print('Pairing count: {}, Unique group count: {}'.format(len(pairings), len(pairings['groupId'].unique())))
pairings = pairings.sort_values('timestamp', ascending=False)
pairings.head()

Number of Pairing Sessions
Original size: 637 --> New size: 558
Pairing count: 558, Unique group count: 47


Unnamed: 0,_id,groupId,pairings,timestamp,group_pairing_id
557,2oYh3uMnKpkAdepuF,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': 'aNdSTecskgeAm2St5', 'firstUs...",2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF
556,Z5jWjm4txMp9DTt8g,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': 'vbsF64nAgoitwrNeB', 'firstUs...",2019-08-14 20:32:14.247,9mdkMmj4pY8Q2TwqF-Z5jWjm4txMp9DTt8g
555,Sk28TfvSR9sGYfkFS,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': 'zmwK4tJHtwLw8pLRC', 'firstUs...",2019-08-07 20:23:03.568,9mdkMmj4pY8Q2TwqF-Sk28TfvSR9sGYfkFS
554,DCgn5ujPLrR4m4yNJ,7Bp8SQN7rXjbYW5zi,"[{'firstUserId': '97ynrbrFHfJEEbmr9', 'firstUs...",2019-08-05 19:35:36.762,7Bp8SQN7rXjbYW5zi-DCgn5ujPLrR4m4yNJ
553,iFK4pXLbjetbR6zxm,9mdkMmj4pY8Q2TwqF,"[{'firstUserId': '7yemYfg5RHGKux8YH', 'firstUs...",2019-07-31 20:22:58.853,9mdkMmj4pY8Q2TwqF-iFK4pXLbjetbR6zxm


In [9]:
# Pull every historical pair record and remember how many there were.
pairs_history = pd.DataFrame(list(db.pairs_history.find({})))
pairs_history_orig_size = len(pairs_history)

# Keep pairs from valid groups only and tag each with a combined
# "<groupId>-<pairingId>" key.
pairs_history = (
    pairs_history
    .loc[pairs_history['groupId'].isin(valid_group_ids)]
    .assign(group_pairing_id=lambda df: df['groupId'] + '-' + df['pairingId'])
    .reset_index(drop=True)
)

# Report how many pairs were dropped.
pairs_history_new_size = len(pairs_history)
print('Number of Pairs\nOriginal size: {} --> New size: {}'.format(pairs_history_orig_size, pairs_history_new_size))

# Summarise the remaining pairs, then show the most recent ones first.
unique_group_count = len(pairs_history['groupId'].unique())
unique_pairing_count = len(pairs_history['group_pairing_id'].unique())
print('Unique group count: {}, Unique pairing count: {}'.format(unique_group_count, unique_pairing_count))
pairs_history = pairs_history.sort_values('timestamp', ascending=False)
pairs_history.head()

Number of Pairs
Original size: 2911 --> New size: 2903
Unique group count: 47, Unique pairing count: 558


Unnamed: 0,_id,groupId,pairingId,firstUserId,firstUserName,firstUserRole,secondUserId,secondUserName,secondUserRole,timestamp,group_pairing_id
2902,X6BtqY2KAyQRqmP5s,9mdkMmj4pY8Q2TwqF,2oYh3uMnKpkAdepuF,Cu5jmoaSGy3QrsCE5,Patricia Song,Undergraduate Students,,,,2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF
2901,P8pmHAErjPpDh346x,9mdkMmj4pY8Q2TwqF,2oYh3uMnKpkAdepuF,BPQ7hyoHgghctHPqq,Liz Gerber,Faculty,PavTL8zD9664wvtfB,Haoqi Zhang,Faculty,2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF
2900,bLWv76vGLuHof4STL,9mdkMmj4pY8Q2TwqF,2oYh3uMnKpkAdepuF,LXTzT8KpQBpyQdXBD,Gobi Dasu,PhD Student,xQ4mPiD4TX9MJqiqj,Kristine Lu,PhD Student,2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF
2899,LCYzNMFko2Eg3uuJC,9mdkMmj4pY8Q2TwqF,2oYh3uMnKpkAdepuF,EDEFWcagLwCfXP5Jg,Yongsung Kim,PhD Student,6iR9Z64HEJDcD8qbu,Matt Easterday,Faculty,2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF
2898,ZA6AMhgoDRfEjmgsM,9mdkMmj4pY8Q2TwqF,2oYh3uMnKpkAdepuF,vbsF64nAgoitwrNeB,Spencer Carlson,PhD Student,LJHoMDQBnXfXtTDYi,GG Guitart,Undergraduate Students,2019-08-14 20:32:17.057,9mdkMmj4pY8Q2TwqF-2oYh3uMnKpkAdepuF


In [10]:
# Pull the current tasks and remember how many there were.
tasks = pd.DataFrame(list(db.tasks.find({})))
tasks_orig_size = len(tasks)

# Keep tasks from valid groups only, with a fresh 0..n-1 index.
tasks = tasks.loc[tasks['groupId'].isin(valid_group_ids)].reset_index(drop=True)

# Report how many tasks were dropped.
tasks_new_size = len(tasks)
print('Number of Tasks\nOriginal size: {} --> New size: {}'.format(tasks_orig_size, tasks_new_size))

# Preview the remaining tasks.
tasks.head()

Number of Tasks
Original size: 1165 --> New size: 747


Unnamed: 0,_id,name,userId,groupId,task
0,qSPQiuE42yMiZJYrM,Joe Germuska,NtZ9hv3g6eLAwN2nY,Et46F6odTBmiFiDSZ,
1,9ZtF3iuf2Gs273Nq6,wise@northwestern.edu,c2bWRsNjfijQtq6pN,Et46F6odTBmiFiDSZ,
2,36BHem3sZ7vPesS9v,e-withrow@northwestern.edu,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,
3,fYA2q2QAaahrvym9N,Julian Vicens,goGr47HDwtfphJ5xK,uPLDbfFqqdHEEkgCT,
4,pDv2qxmc3Qtgi5msk,rebecca.poulson@northwestern.edu,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,


In [11]:
# Pull the current affinity ratings and remember how many there were.
affinities = pd.DataFrame(list(db.affinities.find({})))
affinities_orig_size = len(affinities)

# Keep ratings from valid groups only, with a fresh 0..n-1 index.
affinities = affinities.loc[affinities['groupId'].isin(valid_group_ids)].reset_index(drop=True)

# Report how many ratings were dropped.
affinities_new_size = len(affinities)
print('Number of Current Affinities\nOriginal size: {} --> New size: {}'.format(affinities_orig_size, affinities_new_size))

# Preview the remaining ratings.
affinities.head()

Number of Current Affinities
Original size: 3338 --> New size: 1931


Unnamed: 0,_id,helperId,helpeeId,groupId,value
0,CBAFDuJRt4PCqMFbi,u2GAvznbx7Jbf97Hk,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,1.0
1,sn3M9GLYLwxrdNuLf,dKco6yw8vaxbGpdrr,WTKxXpLuJAnDfgvFH,Et46F6odTBmiFiDSZ,-1.0
2,QTWuMLM39mmfKyqqk,WTKxXpLuJAnDfgvFH,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,0.33
3,cQAeZBQdFyagMjJbJ,dKco6yw8vaxbGpdrr,u2GAvznbx7Jbf97Hk,Et46F6odTBmiFiDSZ,1.0
4,zTpAK9XCN7p2Ea6Pg,u2GAvznbx7Jbf97Hk,dKco6yw8vaxbGpdrr,Et46F6odTBmiFiDSZ,1.0


In [12]:
# Pull every historical affinity rating and remember how many there were.
affinities_history = pd.DataFrame(list(db.affinities_history.find({})))
affinities_history_orig_size = len(affinities_history)

# Columns that identify one rating: which session, who asked, who could help.
rating_key = ['group_pairing_id', 'helpeeId', 'helperId']

# Keep ratings from valid groups, tag them with "<groupId>-<pairingId>", and
# collapse repeated ratings for the same key down to the last one after sorting.
affinities_history = (
    affinities_history
    .loc[affinities_history['groupId'].isin(valid_group_ids)]
    .assign(group_pairing_id=lambda df: df['groupId'] + '-' + df['pairingId'])
    .sort_values(rating_key)
    .drop_duplicates(subset=rating_key, keep='last')
    .reset_index(drop=True)
)

# Report how many ratings were dropped.
affinities_history_new_size = len(affinities_history)
print('Number of Past Affinities\nOriginal size: {} --> New size: {}'.format(affinities_history_orig_size, affinities_history_new_size))

# Summarise and preview the remaining ratings.
print('Unique Group Pairings: {}'.format(len(affinities_history['group_pairing_id'].unique())))
affinities_history.head()

Number of Past Affinities
Original size: 49608 --> New size: 49338
Unique Group Pairings: 540


Unnamed: 0,_id,helperId,helpeeId,groupId,value,pairingId,group_pairing_id
0,v3nKkg77Jouf6BZ8G,GLTz7m8y7RqZCYzxx,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.33,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
1,D2kBQDRftmygv5f4L,PWufwHDsbRaw4se4X,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
2,R588B5nqLhmLbC4iW,f8wwqTXaifkxxoAc2,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,0.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
3,poiynLy2tnCMNzdGf,iyRaCwz7QzxPRSi5t,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD
4,KmiSFQicDRa263Nfc,kEZXdjhfohiGxJWdu,3si95Pn6NjXTxCWcT,2rFoGTfRa9LFdpQNA,-1.0,2EPbA6HkydPTdxCWD,2rFoGTfRa9LFdpQNA-2EPbA6HkydPTdxCWD


## Isolate DTR Pairing Data
TODO: there are still a lot of NaNs here. Investigate why that's the case.

In [13]:
# groupId value used to select the DTR group's rows from the history tables.
dtr_group_id = 'sM3z5FkZfsABqcj3g'

In [14]:
# Collapse repeated (task, user, session) rows before narrowing to the DTR group.
deduped_tasks = tasks_history.drop_duplicates(['task', 'userId', 'group_pairing_id'])

# For each history table: keep only DTR rows, keep only the columns needed
# downstream, and renumber the index from 0.
dtr_task_history = (
    deduped_tasks
    .loc[deduped_tasks['groupId'] == dtr_group_id,
         ['group_pairing_id', 'userId', 'task']]
    .reset_index(drop=True)
)

dtr_pairs_history = (
    pairs_history
    .loc[pairs_history['groupId'] == dtr_group_id,
         ['group_pairing_id', 'firstUserId', 'firstUserName', 'secondUserId', 'secondUserName']]
    .reset_index(drop=True)
)

dtr_affinities_history = (
    affinities_history
    .loc[affinities_history['groupId'] == dtr_group_id,
         ['group_pairing_id', 'helpeeId', 'helperId', 'value']]
    .reset_index(drop=True)
)

In [15]:
# Create a single dataframe with one row per DTR pair plus the affinity rating
# recorded for it. The right join keeps every pair from `dtr_pairs_history` and
# attaches the rating where helpee == firstUser and helper == secondUser.
# NOTE(review): only this one direction is matched; a rating given in the
# opposite direction (secondUser as helpee) is not picked up -- confirm that
# this is intended.
dtr_pairs_affinities = dtr_affinities_history.merge(dtr_pairs_history,
                                                    left_on=['group_pairing_id', 'helpeeId', 'helperId'],
                                                    right_on=['group_pairing_id', 'firstUserId', 'secondUserId'],
                                                    how='right')

# Pairs without a matching rating come out of the right join with NaN in the
# left-hand key columns (helpeeId / helperId), because the key columns have
# different names on each side. Recover the ids from the pair columns before
# those are dropped; otherwise these rows lose their user ids and the later
# task lookup by helpeeId cannot match them.
dtr_pairs_affinities['helpeeId'] = dtr_pairs_affinities['helpeeId'].fillna(dtr_pairs_affinities['firstUserId'])
dtr_pairs_affinities['helperId'] = dtr_pairs_affinities['helperId'].fillna(dtr_pairs_affinities['secondUserId'])

# replace values with 1 - 5
value_mappings = {
    '-1.0': 1,
    '0.0':  2,
    '0.33': 3,
    '0.66': 4,
    '1.0':  5
}

# Remember which pairs have no rating: astype(str) would otherwise turn the
# missing value into the literal string 'nan', which isna() no longer detects.
missing_rating = dtr_pairs_affinities['value'].isna()
dtr_pairs_affinities['value'] = dtr_pairs_affinities['value'].astype(str)
dtr_pairs_affinities.replace({'value': value_mappings}, inplace=True)
dtr_pairs_affinities.loc[missing_rating, 'value'] = np.nan

# remove merge cols (their ids now live in helpeeId / helperId)
dtr_pairs_affinities.drop(['firstUserId', 'secondUserId'], axis=1, inplace=True)

In [16]:
# Attach the helpee's task text to each pair row; pairs with no matching
# task row are kept (left join) with a missing task.
dtr_pairs_affinities = pd.merge(
    dtr_pairs_affinities,
    dtr_task_history,
    how='left',
    left_on=['group_pairing_id', 'helpeeId'],
    right_on=['group_pairing_id', 'userId'],
)

In [17]:
# Display names stored as e-mail addresses or short handles are swapped for full names.
name_replacements = {
    'richardhuang2019@u.northwestern.edu': 'Richard Huang',
    'AlainaKafkes2017@u.northwestern.edu': 'Alaina Kafkes',
    'judylee2021@u.northwestern.edu': 'Judy Lee',
    'Leesha': 'Leesha Maliakal',
    'andrew': 'Andrew Finke'
}

# Drop the task-merge key, give the columns helper/helpee-oriented names,
# fix the column order, and apply the name replacements to both name columns.
dtr_pairs_affinities = (
    dtr_pairs_affinities
    .drop(columns=['userId'])
    .rename(columns={
        'firstUserName': 'helpeeName',
        'secondUserName': 'helperName',
        'value': 'affinity',
    })
    .loc[:, ['group_pairing_id',
             'helperId', 'helperName', 'helpeeId', 'helpeeName',
             'task', 'affinity']]
    .replace({'helperName': name_replacements, 'helpeeName': name_replacements})
)

In [18]:
# Preview the first five rows of the assembled DTR table.
dtr_pairs_affinities.head(n=5)

Unnamed: 0,group_pairing_id,helperId,helperName,helpeeId,helpeeName,task,affinity
0,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,Z68eenXDt84czPkmN,Ryan Louie,5EJncSsf5AStw6maE,Nneoma Oradiegwu,Reflect on help seeking behaviors,4
1,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,3cb7eEv3gY3xiBCk7,Daniel Zhu,HnvDuWqAnAqHbXxLT,Sanfeng Wang,I need help with organizing and testing a stud...,4
2,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,N2ZQJJLByPFP2DA2S,Samuel Naser,PqHjaxzy7KijCWRjm,Andrew Finke,Make sure I don't make any spelling / grammar ...,5
3,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,LG96v2nkrkJhWstC4,Olivia Barnett,aNdSTecskgeAm2St5,Leesha Maliakal,help me debug an issue with deploying a node s...,2
4,sM3z5FkZfsABqcj3g-2CgSkRTMERna4KKdX,9Wcpa2nSnEX8k7kMN,Maggie Lou,afyZT6q2HepoCS76m,Caroline Grace Alexander,Develop protocol on how to analyze my videos,3


In [19]:
# Write the DTR pairing table to CSV without the DataFrame index column.
dtr_pairs_affinities.to_csv(
    path_or_buf='./documents/dtr-pair-research-pairings.csv',
    index=False,
)