# Introducción
## Objetivo
Este programa se tuiliza para asignar grupos a estudiantes siguiendo las prioridades:
* Todos los estudiantes (o la mayor cantidad posible), deberían ser asignados a un grupo que hayan marcado como deseable
* Los estudiantes que presentan certificado laboral/deportivo recibem prioridad al momento de la asignación grupos

## Resultado
El programa genera como archivo de salida una tabla en la que se indica el grupo asignado a cada estudiante. En una columna adicional se indica si el turno asignado se encuentra entre los elegidos por el estudiante.

## Reglas
### Sobre los grupos
* Existe un numero $n$ de grupos
* Cada grupo $n_i$ tiene un numero determinado de cupos $c_i$
* Los grupos estan agregados en $S$ *supergrupos* que pueden corresponder a turnos (Matutino, Vespertino, Nocturno), salones de clase, centros de estudio, etc. Cada grupo $n_i$ pertenece a un único supergrupo $S_j$

### Sobre los estudiantes
* Existen $m$ estudiantes. Identificados por un número de cedula o pasaporte.
* Cada estudiante debe elegir al menos 1 grupo en al menos 2 supergrupos diferentes (Todos los estudiantes presentan como mínimo 2 opciones de grupo y 2 opciones de supergrupo).
* Los estudiantes pueden presentar certificado laboral o deportivo. En este caso los estudiantes tienen prioridad en la selección de grupos

# Parametros
A continuación se detallan los parametros necesarios para ejecutar el programa. Algunos de estos pueden tener un valor por defecto.

In [60]:
# Parametros del programa
# Parametros de los grupos
archivo_grupos = 'Grupos_Disponibles.txt'
columna_supergrupos = 'Turno'
columna_min_estudiantes = 'Min Estudiantes'
columna_max_estudiantes = 'Max Estudiantes'

# Parametros de los estudiantes
archivo_estudiantes = 'Formulario_Estudiantes.txt'
columnas_supergrupos = 'Grupos mañana___Grupos tarde___Grupos noche'
columnas_certificados = 'certificado'
columnas_identificacion = 'Cedula___pasaporte'

In [2]:
# Modulos
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

In [3]:
grupos = pd.read_csv(archivo_grupos, sep='\t', index_col=0)
grupos.head()

Unnamed: 0_level_0,Turno,Min Estudiantes,Max Estudiantes
ID grupo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Turno 1,1,35
2,Turno 1,1,35
3,Turno 1,1,35
4,Turno 1,1,35
5,Turno 1,1,35


In [69]:
estudiantes = pd.read_csv(archivo_estudiantes, sep='\t')
estudiantes.head()

Unnamed: 0,Marca temporal,Correo,Cedula,pasaporte,Nacionali,NOMBRE completo,APELLIDOS,Fecha nacimiento,depto 2023,localidad 2024,barrio,udelar previa,otros estud,Grupos mañana,Grupos tarde,Grupos noche,TRABAJA,Deporte,certificado
0,26/02/2024 12:48:29,florencialado01@gmail.com,5165495,,Uruguaya,Florencia,Lado Casaglia,03/04/2002,CANELONES,Canelones La Floresta,No vivo en montevideo,NO,NO,,27,,No,NO,
1,26/02/2024 14:04:46,florencialado01@gmail.com,5165495,,Uruguaya,Florencia,Lado Casaglia,03/04/2002,CANELONES,La floresta canelones,No vivo en montevideo,SI,NO,,14,,No,NO,
2,26/02/2024 15:59:44,diegoodera9@gmail.com,44281309,,Uruguaya,Diego Maximiliano,Odera Piñeyro,14/02/1992,MONTEVIDEO,Montevideo,Guaycuru 2884(barrio Reducto),SI,NO,,9,15.0,SI,NO,https://drive.google.com/open?id=1A4_v2vYuuRva...
3,27/02/2024 21:53:05,diegoodera9@gmail.com,44281309,,Uruguayo,Diego Maximiliano,Odera Piñeyro,14/02/1992,MONTEVIDEO,Montevideo,Reducto,SI,NO,,66,15.0,SI,NO,https://drive.google.com/open?id=1VlUhv9N7JFFB...
4,26/02/2024 15:18:35,claumansilla46197@gmail.com,46197390,,Oriental,Claudia Mariana,Mansilla Goicoechea,24/08/1993,MALDONADO,Maldonado,No,SI,NO,1.0,9,15.0,SI,NO,


In [5]:
# parsear argumentos
super_group = columna_supergrupos
max_stds_col = columna_max_estudiantes
min_stds_col = columna_min_estudiantes

super_group_cols = columnas_supergrupos.split('___')
cert_cols = columnas_certificados.split('___')
id_cols = columnas_identificacion.split('___')

In [5]:
def preprocess_groups(file, super_group_col, min_students_col, max_students_col):
    groups = pd.read_csv(file, sep='\t', index_col=0)
    groups['SG'] = grupos[super_group_col].replace({val:idx for idx, val in enumerate(grupos.Turno.unique())})
    
    # ensure min and max student columns have standard names
    groups['min_students'] = groups[min_students_col]
    groups['max_students'] = groups[max_students_col]
preprocess_groups(grupos, columna_supergrupos, columna_min_estudiantes, columna_max_estudiantes)
grupos

  groups['SG'] = grupos[super_group_col].replace({val:idx for idx, val in enumerate(grupos.Turno.unique())})


Unnamed: 0_level_0,Turno,Min Estudiantes,Max Estudiantes,SG,min_students,max_students
ID grupo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Turno 1,1,35,0,1,35
2,Turno 1,1,35,0,1,35
3,Turno 1,1,35,0,1,35
4,Turno 1,1,35,0,1,35
5,Turno 1,1,35,0,1,35
...,...,...,...,...,...,...
54,Turno 3,1,35,2,1,35
69,Turno 3,1,35,2,1,35
70,Turno 3,1,35,2,1,35
71,Turno 3,1,35,2,1,35


In [None]:
# load data
def preprocess_groups(file, sg_col, min_students_col, max_students_col):
    groups = pd.read_csv(file, sep='\t', index_col=0)
    groups['SG'] = grupos[sg_col].replace({val:idx for idx, val in enumerate(grupos.Turno.unique())})
    
    # ensure min and max student columns have standard names
    groups['min_students'] = groups[min_students_col]
    groups['max_students'] = groups[max_students_col]
    return groups

def preprocess_students(file, id_cols):
    students = pd.read_csv(file, sep='\t')
    
    id_cols = id_cols.split('___')
    
    # identify ID col for each students
    collect_ids(students, id_cols)
    
    students['ID'] = ''
    students['Doc_type'] = ''

    for col in id_cols[::-1]:
        col_ids = students.loc[~students[col].isna(), col]
        students.loc[col_ids.index, 'ID'] = col_ids.values
        students.loc[col_ids.index, 'Doc_type'] = col
    # format ids, remove dots and dashes
    students['ID'] = students.ID.apply(lambda x : re.sub('[.]', '', re.sub('-.*', '', x))).apply(lambda x : x[:7])
    return students

# get group demand
def build_demand_table(students, groups, sg_cols):
    uniq_students = students.ID.unique()
    demand_tab = pd.DataFrame(False, index=uniq_students, columns=groups.index)
    sg_demand_tab = pd.DataFrame(0, index=uniq_students, columns=groups.SG.unique())
    
    # mark groups demanded by each student
    for ID, subtab in students.groupby('ID'):
        # merge all group options
        student_opts = ','.join(subtab[sg_cols].fillna('').agg(','.join, axis=1).values)
        # remove weird values, leading commas, empty spaces
        student_opts = re.sub('^,', '', re.sub(',$', '', re.sub(',,+', ',', re.sub(' ','', student_opts))))
        # generate options array
        try:
            student_opts = np.unique(student_opts.split(',')).astype(int)
            # mark student options
            demand_tab.loc[ID, student_opts] = True
        except:
            pass

    # remove weird groups (error when registering option number)
    weird_groups = demand_tab.drop(columns=groups.index)
    demand_tab = demand_tab[groups.index]

    # fill sg demand tab
    for sg, subtab in groups.groupby('SG'):
        sg_demand_tab[sg] = demand_tab[subtab.index].sum(axis=1)
    return demand_tab, sg_demand_tab, weird_groups

# build priority series
def get_certified_students(students, cert_cols):
    uniq_students = students.ID.unique()
    # establish priority students (value 0 so they appear first when sorting by priority)
    certified = pd.Series(False, index=uniq_students)
    certified[students.loc[~students[cert_cols].isna().values, 'ID'].unique()] = True
    return certified

def get_infringing_students(sg_demand, certified, min_groups, min_sg):
    infractions = (sg_demand > min_groups).sum(axis=1) < min_sg
    infractions = infractions & ~certified
    infractions = infractions.loc[infractions].index
    return infractions

def get_no_group_students(demand):
    no_group_students = (demand.sum(axis=1) == 0)
    no_group_students.loc[no_group_students].index
    return no_group_students

def build_priority(students, cert_cols, demand, sg_demand, min_groups=2, min_sg=2):
    certified = get_certified_students(students, cert_cols)
    priority = certified.apply(lambda x : 0 if x else 1)
    
    priority.loc[get_infringing_students(sg_demand, certified, min_groups, min_sg)] = 2
    priority.loc[get_no_group_students(demand)] = 3
    return priority

# pre summary

def double_sorting(demand_tab, groups, priority):
    # calculate max iterations (size of largest group)
    max_iterations = groups.loc[demand_tab.columns].max_students.max()
    # prepare placement table
    placement_tab = pd.DataFrame(False, index=demand_tab.index, columns=demand_tab.columns)

    # count student options and detect certified
    student_data = demand_tab.sum(axis=1).to_frame(name='options')
    student_data['priority'] = priority
    # sort by certification status & options
    student_data.sort_values(['priority', 'options'])

    def sort_groups(dtab):
        # sort demand table by group demand, filter out depleted groups
        group_demand = dtab.sum(axis=0).sort_values()
        group_demand = group_demand[group_demand > 0]
        dtab_out = dtab[group_demand.index].copy()
        return dtab_out
    demand_tab2 = sort_groups(demand_tab)

    # iterate
    for i in range(max_iterations):
        # always start placing by the least required groups
        for grp in demand_tab2.columns:
            # check that group hasn't been depleted in the current iteration
            if demand_tab2[grp].sum() > 0:
                grp_student = demand_tab2.index[np.argmax(demand_tab2[grp])] # select the most prioritary student for the current group (certified & least options)
                # place selected student and remove them from the demand table
                placement_tab.loc[grp_student, grp] = True
                demand_tab2.drop(index=grp_student, inplace=True)
        # recalculate demand, resort groups by least demand, drop not demanded groups
        demand_tab2 = sort_groups(demand_tab2)

    # announce total unplaced students
    total_students = demand_tab.shape[0]
    placed_students = placement_tab[placement_tab.any(axis=1)].index.values
    unplaced_students = demand_tab2
    return placement_tab, unplaced_students

In [91]:
def main(students_file,
         groups_file,
         id_cols,
         sg_students,
         sg_groups,
         min_students_col,
         max_students_col,
         cert_cols,
         min_groups=2,
         min_sg=2):
    # load data
    students = preprocess_students(students_file, id_cols)
    groups = preprocess_groups(groups_file, sg_groups, min_students_col, max_students_col)

    # build demand
    demand, sg_demand, weird_demand = build_demand_table(students, groups, sg_students)

    # build priority
    priority = build_priority(students, cert_cols, demand, sg_demand, min_groups, min_sg)

    # distribute
    placement, unplaced = double_sorting(demand, groups, priority)

In [46]:


def summarize_demand(demand, groups):
    total_demand = demand.sum(axis=0)
    options_tab = pd.DataFrame(index=demand.index, columns=groups.SG.unique())
    for sg, subtab in groups.groupby('SG'):
        options_tab[sg] = demand[subtab.index].sum(axis=1)
    options_tab['Total_options'] = options_tab.sum(axis=1)
    return total_demand, options_tab

def pre_report(demand_tab, groups, certified, total_demand, infractions):
    # report number of groups
    print(f'Grupos: {demand_tab.shape[1]}')
    for sg, subtab in groups.groupby('SG'):
        print(f'\tSupergrupo {sg}: {subtab.shape[0]} grupos')
    print()

    # report number of studens
    print(f'Estudiantes validos: {demand_tab.shape[0]}')
    print(f'Estudiantes certificados: {certified.sum()}')
    print(f'EStudiantes en infraccion: {infractions.sum()}')
demand_tab, weird_groups = build_demand_table(students, grupos, super_group_cols)
certified = get_certified_students(students, cert_cols)
total_demand, options_tab = summarize_demand(demand_tab, grupos)
infractions = get_infringing_students(options_tab, certified, 1, 2)

pre_report(demand_tab, grupos, certified, total_demand, infractions)

Grupos: 72
	Supergrupo 0: 32 grupos
	Supergrupo 1: 24 grupos
	Supergrupo 2: 16 grupos

Estudiantes validos: 1770
Estudiantes certificados: 395
EStudiantes en infraccion: 704


#### Distribution method 3
Assign students without dividing supergroups

In [11]:
# distribution method 3
whole_placement, whole_unplaced = double_sorting(demand_tab, grupos, certified)

Done assigning!
	Assigned 1697 students of 1770


ID grupo,70,69,53,54,52,71,72,35,34,33,...,10,2,8,6,24,38,9,13,14,1
Assigned,18,18,18,19,19,19,20,21,21,21,...,26,26,26,26,35,35,35,35,35,35


	Failed to assign 73 students


ID grupo,9,1,14,24,13,38
55828178,False,False,False,False,False,True
55841013,False,False,False,True,False,True
55905188,False,False,False,False,True,True
55906994,False,False,False,False,True,True
55916981,False,False,False,True,True,False
...,...,...,...,...,...,...
65956486,False,False,False,True,False,True
66239586,False,False,False,False,True,True
66447858,True,False,False,False,False,False
5573221-6,False,True,False,False,False,False


In [49]:
grupos.query('SG == 2').index

Index([15, 16, 17, 18, 33, 34, 35, 36, 51, 52, 53, 54, 69, 70, 71, 72], dtype='int64', name='ID grupo')

In [55]:
whole_placement.sum(axis=0).loc[grupos.query('SG == 2').index].sort_values()

ID grupo
53    18
69    18
70    18
52    19
54    19
71    19
72    20
17    21
33    21
34    21
35    21
16    22
18    23
15    24
51    24
36    25
dtype: int64

#### Acomodación
Seleccionar los grupos llenos, determinar si alguno de los estudiantes asignados puede transferirse a un grupo con espacio disponible

In [12]:
def get_transferrable_students(placement_tab, demand_tab, max_students=35):
    student_counts = placement_tab.sum(axis=0)
    full_groups = student_counts[student_counts == max_students].index.values
    students_in_full = placement_tab[placement_tab[full_groups].any(axis=1)].index.values

    transferrable_students = demand_tab.loc[students_in_full].drop(columns=full_groups)
    transferrable_students = transferrable_students.loc[transferrable_students.any(axis=1), transferrable_students.any(axis=0)]
    return transferrable_students

def arrange_transfer(transferrable, placement_tab, max_students=35):
    avail_places = max_students - placement_tab[transferrable.columns].sum(axis=0)
    avail_places = avail_places[avail_places > 0].sort_values(ascending=False)

    transfer_series = pd.Series(-1, index=transferrable.index)

    # sort transferrable students by options
    transferrable = transferrable.iloc[np.argsort(transferrable.sum(axis=1))].copy()
    for grp, places in avail_places.items():
        group_students = transferrable[grp]
        group_students = group_students.loc[group_students].iloc[:places].index.values
        transfer_series[group_students] = grp
        transferrable.drop(index=group_students, inplace=True)
    transfer_series = transfer_series[transfer_series > -1]
    return transfer_series

def transfer(placement_tab, transfer_series):
    placement_tab2 = placement_tab.copy()
    placement_tab2.loc[transfer_series.index] = False
    for student, grp in transfer_series.items():
        placement_tab2.loc[student, grp] = True
    return placement_tab2

def accomodate(placement_tab, unplaced_students, max_students=35):
    avail_places = max_students - placement_tab[unplaced_students.columns].sum(axis=0)
    avail_places = avail_places[avail_places > 0].sort_values()
    unplaced_students = unplaced_students[avail_places.index]

    # sort unplaced students by number of options
    unplaced_students = unplaced_students.iloc[np.argsort(unplaced_students.sum(axis=1))].copy()
    for grp in unplaced_students.columns:
        grp_students = unplaced_students[grp]
        grp_students = grp_students[grp_students].index.values[avail_places[grp]]

        placement_tab.loc[grp_students, grp] = True
        unplaced_students.drop(index=grp_students, inplace=True)
    return unplaced_students
transferrable = get_transferrable_students(whole_placement, demand_tab)

t_series = arrange_transfer(transferrable, whole_placement)

placement2 = transfer(whole_placement, t_series)

unplaced2 = accomodate(placement2, whole_unplaced)

In [13]:
unplaced2

ID grupo,1,14,24,13,38,9
55828178,False,False,False,False,True,False
66447858,False,False,False,False,False,True
65428736,True,False,False,False,False,False
63915701,True,False,False,False,False,False
59380297,True,False,False,False,False,False
...,...,...,...,...,...,...
56050948,False,True,True,True,True,False
56041212,False,True,True,True,True,False
56032166,False,True,True,True,True,False
57446330,False,True,True,True,True,False
