In [78]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import toolz.curried as tz

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
pd.set_option('precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    row_arg = (0, len(df), nrows) if len(df) > nrows else fixed(0)
    col_arg = ((0, len(df.columns), ncols)
               if len(df.columns) > ncols else fixed(0))
    
    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    with pd.option_context('display.max_rows', rows,
                           'display.max_columns', cols):
        display(df)

# Group Formation

In [88]:
pairings = pd.read_csv('COGS 108 Project Groups - [export] Requested Pairings.csv')
groups = pairings[['pid1', 'pid2', 'pid3']]
groups.index.rename('id', inplace=True)
groups

Unnamed: 0_level_0,pid1,pid2,pid3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A14037068,A15303394,
1,A13929238,A13810245,
2,A15747488,A14495064,A13895637
...,...,...,...
439,A11997642,,
440,A11863609,A15697945,
441,A11707607,,


In [316]:
sections = pairings[['pid1', 'section']]
sections

Unnamed: 0_level_0,pid1,section
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,A14037068,A08
1,A13929238,A01
2,A15747488,A07
...,...,...
439,A11997642,A08
440,A11863609,A02
441,A11707607,A04


In [90]:
display_df(sections['section'].value_counts().sort_index(), 8)

A01    60
A02    62
A03    60
A04    60
A05    62
A06    60
A07    50
A08    28
Name: section, dtype: int64

In [112]:
def make_triplet(seq):
    return frozenset(i for i in seq if isinstance(i, str))

unique_groups = (groups
 .assign(triplet=groups.apply(make_triplet, axis=1))
 .groupby('triplet')
 .first()
 .reset_index()
)
unique_groups

Unnamed: 0,triplet,pid1,pid2,pid3
0,"(A14037068, A15303394)",A14037068,A15303394,
1,(A14392785),A14392785,,
2,(A14400378),A14400378,,
...,...,...,...,...
289,(A15712710),A15712710,,
290,"(A15512992, A13687390)",A15512992,A13687390,
291,(A11707607),A11707607,,


Although every triplet of rows is unique, some PIDs appear multiple times because some students didn't fill out the survey or forgot about a teammate:

In [161]:
from collections import Counter
flatten = lambda it: [item for group in it for item in group]

counts = tz.compose(Counter, flatten)

rows = unique_groups.itertuples(index=False)
duplicates = pd.Series(
    pid for pid, count in counts(rows).items()
    if count > 1
)
duplicates

0     A14808385
1     A13613249
2     A14008664
        ...    
25    A15099386
26    A14901222
27    A15380637
Length: 28, dtype: object

In [145]:
df_interact(duplicates.to_frame())

interactive(children=(IntSlider(value=0, description='row', max=28, step=7), Output()), _dom_classes=('widget-…

(28 rows, 1 columns) total


In [143]:
def get_rows_with_pid(pid):
    return unique_groups.query('pid1 == @pid or pid2 == @pid or pid3 == @pid')

get_rows_with_pid('A14008664')

Unnamed: 0,triplet,pid1,pid2,pid3
20,"(A12781194, A14008664)",A14008664,A12781194,
106,"(A14809086, A12781194, A14008664)",A12781194,A14008664,A14809086


We resolve this by merging all groups with overlapping triplets. Note that this assumes that all students would have requested each other if they filled out the form properly. Since PIDs are relatively hard to acquire without permission, if a group member knows your PID I think it's safe to assume that they talked to you about joining their group.

Students will be unhappy if they expected to be matched with a group but didn't. In in the case of unrequited love, students might just assume their other group members were randomly paired.

In [189]:
import operator
merge_sets = tz.reduce(operator.or_)

def all_unique(counter):
    return max(counter.values()) == 1

triplets = frozenset(unique_groups['triplet'])
while not all_unique(counts(triplets)):
    [(pid, _)] = counts(triplets).most_common(1)
    dups = {t for t in triplets if pid in t}
    merged = merge_sets(dups)
    triplets = triplets - dups | {merged}

print(len(triplets))

273


In [190]:
final_groups = pd.DataFrame({'group': list(triplets)})
final_groups

Unnamed: 0,group
0,"(A14899660, A14577145)"
1,"(A13571998, A13388277, A14035418, A13849771)"
2,(A15127848)
...,...
270,"(A15099386, A15223258, A14901222)"
271,(A14101673)
272,(A13464556)


Did we leave any students out?

In [192]:
sections[~sections['pid1'].isin(set(flatten(final_groups['group'])))]

Unnamed: 0_level_0,pid1,section
id,Unnamed: 1_level_1,Unnamed: 2_level_1


## Merging Groups

Let's merge groups together, keeping students grouped within sections. Each group's section is the most popular section within that group.

In [206]:
final_groups

Unnamed: 0,group
0,"(A14899660, A14577145)"
1,"(A13571998, A13388277, A14035418, A13849771)"
2,(A15127848)
...,...
270,"(A15099386, A15223258, A14901222)"
271,(A14101673)
272,(A13464556)


In [207]:
def most_popular_section(group):
    secs = sections.query('pid1 in @group')['section']
    return secs.value_counts().index[0]

final_groups['section'] = final_groups['group'].apply(most_popular_section)
final_groups

Unnamed: 0,group,section
0,"(A14899660, A14577145)",A02
1,"(A13571998, A13388277, A14035418, A13849771)",A02
2,(A15127848),A05
...,...,...
270,"(A15099386, A15223258, A14901222)",A05
271,(A14101673),A06
272,(A13464556),A07


For each section, we'll merge groups together until all groups are between 4-6 students.

In [304]:
@tz.curry
def between(lower, upper, seq):
    return [i for i in seq if lower <= len(i) <= upper]

too_small = between(0, 3)
smaller_than = between(0)
choose = np.random.choice

def groups_ok(groups):
    return len(too_small(groups)) == 0

def make_groups_for_section(section):
    groups = frozenset(final_groups.query('section == @section')['group'])
    while not groups_ok(groups):
        g1 = choose(too_small(groups))
        g2 = choose(smaller_than(6 - len(g1), groups - {g1}))
        groups = groups - {g1, g2} | {g1 | g2}
    return groups

In [315]:
from tqdm import tqdm_notebook as t
from itertools import chain
def flatmap(f, items):
    return list(chain.from_iterable(map(f, items)))

section_labels = sorted(final_groups['section'].unique())

# How did I know to use 12 as the seed?
# Yup, I tried difference seeds until this cell stopped erroring.
np.random.seed(12)
merged_groups = flatten([make_groups_for_section(s) for s in section_labels])

len(merged_groups)

85

In [320]:
groups = pd.DataFrame([[pid, group_num]
                       for group_num, group in enumerate(merged_groups)
                       for pid in group
                      ], columns=['pid', 'group'])
groups

Unnamed: 0,pid,group
0,A13427893,0
1,A13595990,0
2,A13404308,0
...,...,...
439,A14755453,84
440,A14773804,84
441,A13600871,84


In [321]:
groups.to_csv('project_groups.csv', index=False)
!head project_groups.csv

pid,group
A13427893,0
A13595990,0
A13404308,0
A13543182,0
A14104559,0
A14745974,0
A12961029,1
A15337753,1
A12859924,1


Back to GSheets!