# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

# Task 1) Load student demographics and preferences

In [2]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [26]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


# Task 2) Generate & Export Student Preferences
> Note: decide how many schools each student should rank and which preference

### Case 1: full random with no weight
### *Case 2: full random, ordered by popularity buckets
### Case 3: popularity_bucket-weighted randomness
### *Case 4: popularity_bucket-weighted randomness, ordered by popularity buckets

In [27]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:200]

# GET NUMBER OF SCHOOLS RANKED PER STUDENTS
full_list = False  # **students rank 12 schools, ELSE students rank schools based on real-world distribution
student_school_counts = {}  # student_id: #schools to choose
max_num_schools = 12  # maximum number of rankings a student can have
if full_list:
    student_school_counts = dict([[sid, max_num_schools] for sid in student_ids])
else:
    student_school_counts = dict([[sid, 0] for sid in student_ids])
    for i, row in tqdm(ranking_df.iterrows()):  # for each ranking, add 1 to student
        assert row["Student_Id"] in student_school_counts, row["Student_Id"]
        student_school_counts[row["Student_Id"]] += 1

# show descriptive stats (sanity check)
pd.DataFrame(student_school_counts.values()).describe()

1707it [00:00, 8808.31it/s]


AssertionError: student_57782

In [5]:
school_ids = list(ranking_df["School"].unique())

# MAKING STUDENT_RANKINGS STRUCTURE
unknown_val = "nan"  # placeholder for no rankings
student_rankings = dict([ [sid, [unknown_val for i in range(max_num_schools)]] for sid in student_ids])  # initializing dict

for sid in tqdm(student_rankings.keys()):
    # CASE 1: RANDOM IMPLEMENTATION
    student_rankings[sid] = np.random.choice(school_ids, size=student_school_counts[sid], replace=False).tolist()

    # TODO IMPLEMENT CASE 2 AND 3 AND 4 ---> CHECK POPULARITY RATE, WHAT PERCENTAGE OF SYNTHETIC STUDENTS HAVE APPLIED
    # ARE THE MOST POPULAR SCHOOLS GETTING THE MOST APPLICATIONS (num applicants) -- sanity check for simulation results
print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

100%|███████████████████████████████████████| 71250/71250 [00:06<00:00, 11336.40it/s]


Example Student Ranking
 ['15K497', '31R028', '05M148', '15K684', '18K637', '03M299', '02M544', '12X446', '09X564', '02M418', '15K429', '18K633']





In [6]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 12.0


In [7]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings.npy", student_rankings)
print("saved at", "/Data/Generated/student_rankings.npy")

saved at /Data/Generated/student_rankings.npy


# Task 3) Generate and Export Additional Student Attributes (for school ranking)
### Step 1: Add Lottery number
### Step 2: Add gpa placeholder
### Step 3: Add high/med/low seat
### Step 4: Add 5-group gpa for screen schools

In [8]:
import uuid
# adding lotterly number
lnums = [uuid.uuid4().hex for i in range(len(student_df))]
student_df["Lottery"] = lnums
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000,e343c3bb4ca4459f89e6502045868679
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000,0ef095774a72470eb51c792fc3e8b43a
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425,09fc1c11eccd4df6975e9fad3536c40d
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818,025b848c53ac4d5e91c7fbdf8c531bd6
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000,3266fb6861614386ae040c2004aaa1de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603,65e71d7e444148258e68b82532f4af8c
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000,27772b1ec98941759ae97d156b8ec4c8
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303,251da548af5e4c76b92e06fde70c93f7
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000,8f2fe548af1641a4b2243181c815bd6b


In [9]:
# adding gpa, seat, and screen attributes

def add_fake_gpa(students):
    """ adds a gpa attribute based on the students ELA and Math scores """
    students["fake_gpa"] = students["Math_score"] + students["ELA_score"]
    max_score = max(students["fake_gpa"])
    students["fake_gpa"] = students["fake_gpa"].apply(lambda score: round(4*score/max_score, 2))  # gpa-normalized
    return students
    
def add_edopt(students):
    """ adds gpa percentile in 3rds for ed opt schools """
    lower = students["fake_gpa"].quantile(1/3)
    upper = students["fake_gpa"].quantile(2/3)
    seated = lambda x: 3 if x >= upper else (2 if x >= lower else 1)
    students["seat"] = students["fake_gpa"].apply(seated)
    return students

def add_screen(students):
    """ adds gpa percentile in 5ths for screen schools """
    lower1 = students["fake_gpa"].quantile(1/5)
    lower2 = students["fake_gpa"].quantile(2/5)
    upper1 = students["fake_gpa"].quantile(3/5)
    upper2 = students["fake_gpa"].quantile(4/5)
    def screened(gpa):
        if gpa >= upper2:
            return 5
        elif gpa >= upper1:
            return 4
        elif gpa >= lower2:
            return 3
        elif gpa >= lower1:
            return 2
        else:
            return 1
        
    students["screen"] = students["fake_gpa"].apply(lambda x: screened(x))
    return students

student_df = add_fake_gpa(student_df)
student_df = add_edopt(student_df)
student_df = add_screen(student_df)
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,...,0,0,0,30Q127,2.000000,3.125000,e343c3bb4ca4459f89e6502045868679,2.29,2,3
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,...,0,0,0,27Q137,2.153846,2.750000,0ef095774a72470eb51c792fc3e8b43a,2.19,2,2
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,...,0,0,0,24Q061,2.846154,1.753425,09fc1c11eccd4df6975e9fad3536c40d,2.05,1,2
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,...,0,0,0,02M114,4.275862,4.181818,025b848c53ac4d5e91c7fbdf8c531bd6,3.77,3,5
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,...,0,0,0,02M312,4.017241,3.125000,3266fb6861614386ae040c2004aaa1de,3.19,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,...,0,0,0,01M450,2.000000,1.972603,65e71d7e444148258e68b82532f4af8c,1.77,1,1
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,...,1,0,0,01M184,3.000000,3.125000,27772b1ec98941759ae97d156b8ec4c8,2.73,2,4
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,...,0,0,0,01M539,2.615385,4.030303,251da548af5e4c76b92e06fde70c93f7,2.96,3,4
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,...,1,0,0,01M188,3.083333,2.000000,8f2fe548af1641a4b2243181c815bd6b,2.27,2,3


In [10]:
# saving school demographics with updated attributes
np.save(cwd+"/Data/Generated/student_demographics.npy", student_df)
print("saved at", "/Data/Generated/student_demographics.npy")

saved at /Data/Generated/student_demographics.npy


## Step 3.5) Add profiles for school rankings

In [11]:
import random

schools_criterion = ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "fake_gpa", "seat", "screen"]

# BASIC SCHOOL PROFILES (intuition)
# Note: these weights are in powers of 10 so they are closer to discrete ordering than pure ratios
district_school = [3, 1, 0, 0, 0, 0, 0, 0]
balanced_school = [3, 1, 2, 2, 1, 0, 0, 0]
stem_school =     [3, 1, 2, 5, 1, 0, 0, 0]  # Note: most schools weight ELA and math equivalently (less realistic)
lottery_school =  [1, 5, 0, 0, 0, 0, 0, 0]
elite_school =    [0, 1, 4, 4, 0, 0, 0, 0]
basic_school_profiles = [district_school, balanced_school, stem_school, lottery_school, elite_school]

def basic_rand_pref():
    """ generate a random school preference profile """
    return random.choice(basic_school_profiles)

In [12]:
"""
Open schools: exlcusively lottery
Ed Opt schools: 1/3rd seat for high/medium/low gpa (known) + lottery for inter-group tiebreaker
Screen schools: 5 group distribution of gpa but highest to lowest (known)
Audition/Assessment: some sort of examination with ranking (school provides a score) + lottery tiebreaker

Note: most have set-asides & zoning regulations
Note: careful of skew (students with high seats disproportionally apply more to more popular schools)
# Open, Ed Opt, Screened, Audtion/Assessment
"""
#  ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "gpa", "seat", "screen"]
open_school = ["Lottery"]  # [0, 1, 0, 0, 0, 0, 0, 0]
edopt_school = ["seat", "Lottery"]  # [0, 1, 0, 0, 0, 0, 2, 0]
screen_school = [0, 0, 0, 0, 0, 0, 0, 1]
# audition_school = [0, 1, 0, 0, 0, 0, 0]
complex_school_profiles = [open_school, edopt_school, screen_school]

def complex_rand_pref():
    """ generate a random school preference profile """
    return random.choice(complex_school_profiles)

def lottery_rand_pref():
    """ generate a random school preference profile """
    return open_school

In [None]:
# TODO: ensure order by discrete --> think SQL 'order_by' (next attribute is only looked at for tie-breakers)
# prioirity students always take presidence
# Note: recall you can affect capacity of school
# TODO: add student preferences proportional to school policy
# TODO: add distance from school's popularity
# TODO: take a better look at student preference generation
# TODO: add school district
# get_district = dict([[sid, random.choice(student_df["Residential_District"].unique())] for sid in school_ids])
# get_district[random.choice(school_ids)]

# Next steps: make one-shot function & seperate python file 

In [13]:
# generating school profiles
school_preferences = dict([[school, lottery_rand_pref()] for school in school_ids])
school_preferences[random.choice(school_ids)]

[0, 1, 0, 0, 0, 0, 0, 0]

## Task 3: Generate School Rankings from profiles
### Subgoal) Create randomized school profiles
#### Case 1: all open school (lottery only)
#### Case 2: add screen and or ed opt policies
#### Case 3: add policy-weighted randomness for variation between schools

In [14]:
student_df.sort_values("Lottery", ascending=True)

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
9783,student_27866,Residential District 29,0,0,1,0,0.633,0,0,0,...,1,0,0,29Q109,4.034483,4.015152,0000480d6df540b783cbd1160d4d3bdb,3.59,3,5
62444,student_27566,Residential District 07,0,1,0,0,0.947,1,0,0,...,0,0,0,07X005,1.986667,3.312500,00008208192341009022df39a788e580,2.36,2,3
13137,student_37337,Residential District 28,0,1,0,0,0.751,1,0,0,...,0,0,0,28Q008,2.076923,3.687500,0000f902caa74eb082824eb97ff2d843,2.57,2,3
13769,student_43096,Residential District 28,0,1,1,0,0.789,0,0,0,...,0,0,0,28Q217,1.906667,3.687500,0001dac1243d4217abccb49ce95e74b8,2.50,2,3
58416,student_41219,Residential District 09,0,1,0,0,0.907,0,1,0,...,0,0,0,09X594,2.000000,1.972603,000208da96154aa8b0c930fa283872c5,1.77,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17234,student_57227,Residential District 27,0,1,1,0,0.612,0,0,0,...,1,0,0,27Q297,2.153846,3.875000,fff7ebc7008349d0a70ff5156add8f75,2.69,2,4
22698,student_70528,Residential District 24,1,0,1,0,0.881,0,1,0,...,0,0,0,24Q061,1.760000,1.890411,fffa476dc15c4e02935d1efa4216a2fe,1.63,1,1
6384,student_66854,Residential District 30,1,1,0,0,0.731,0,0,0,...,1,0,0,30Q230,1.920000,1.972603,fffb92dbece145fbace4aaa9c885cbcb,1.74,1,1
55099,student_42836,Residential District 10,0,1,1,1,0.905,0,1,0,...,0,0,0,10X244,2.923077,2.187500,fffd410f17f1440a833933ff5003ee9a,2.28,2,3


In [18]:
# super basic lottery ranking
def gen_lottery_ranking(df):
    """ slightly cleaner version of:
    lottery_list = student_df.sort_values("Lottery", ascending=True)["Lottery"].tolist()
    """
    student_lottery = df[["Student_Id", "Lottery"]].copy(deep=True).sort_values("Lottery", ascending=True).reset_index(drop=True)
    final_ranking = []
    for i, row in student_lottery.iterrows():  # percentile of current sorting
        final_ranking.append(i/len(student_lottery))
    student_lottery["rank"] = final_ranking
    # note: if done properly this is redundant - use indicies to check
    student_lottery.sort_values("rank", ascending=True, inplace=True)
    return student_lottery

ranked_df = gen_lottery_ranking(student_df)
lottery_list = ranked_df["rank"]
school_rankings = dict([[school, lottery_list] for school in school_ids])
ranked_df

Unnamed: 0,Student_Id,Lottery,rank
0,student_27866,0000480d6df540b783cbd1160d4d3bdb,0.000000
1,student_27566,00008208192341009022df39a788e580,0.000014
2,student_37337,0000f902caa74eb082824eb97ff2d843,0.000028
3,student_43096,0001dac1243d4217abccb49ce95e74b8,0.000042
4,student_41219,000208da96154aa8b0c930fa283872c5,0.000056
...,...,...,...
71245,student_57227,fff7ebc7008349d0a70ff5156add8f75,0.999930
71246,student_70528,fffa476dc15c4e02935d1efa4216a2fe,0.999944
71247,student_66854,fffb92dbece145fbace4aaa9c885cbcb,0.999958
71248,student_42836,fffd410f17f1440a833933ff5003ee9a,0.999972


In [16]:
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings.npy", school_rankings)
print("saved at", "/Data/Generated/school_rankings.npy")
print()
print(len(student_df), "students ranked alongside", len(school_rankings), "schools")

saved at /Data/Generated/school_rankings.npy

71250 students ranked alongside 437 schools


In [None]:
def insert_student(student_data, df):
    """ insert a student into a ranked category """
    # TODO: find a more efficient way of doing this
    df.append(student_data, ignore_index=True)
    return gen_lottery_ranking(df)

In [None]:
#TODO: clean up whats below! use pandas sort_by and a list of attribute names rather than numeric conversions
#      to efficiently deal with rankings

In [22]:
bounds = [0.25, 0.75]  # upper and lower bound for categorical properties like poverty & district (arbitrarily top 25% and bottom 75%)

def load_static_student_rankings(students, verbose=True):
    """ creates a dict of normalized rankings for each trait per school:
     {student_id: [criteria_a_ranking, criteria_b_ranking, ...], ...}
     """
    student_ids = list(students["Student_Id"].unique())
    
    # preload static rankings
    students = students.sample(frac=1).reset_index(drop=True)  # shuffle indicies
    if verbose:
        print("Preloading student rankings...")
    student_lottery = students[["Student_Id", "Lottery"]].copy(deep=True).sort_values("Lottery", ascending=True).reset_index(drop=True)
    student_ela = students[["Student_Id", "ELA_score"]].copy(deep=True).sort_values("ELA_score", ascending=False).reset_index(drop=True)
    student_math = students[["Student_Id", "Math_score"]].copy(deep=True).sort_values("Math_score", ascending=False).reset_index(drop=True)
    
    # merge rankings into a dict for quicker access
    if verbose:
        print("Merging student rankings...")
    student_placements = dict([[sid, [0, 0, 0, 0, 0]] for sid in student_ids])
    for i in tqdm(range(len(students))):  # add normalized ranking to placement
        # Note: district is dynamic and will be done during ranking function below
        student_placements[student_lottery.iloc[i]["Student_Id"]][1] = i/len(students)
        student_placements[student_ela.iloc[i]["Student_Id"]][2] = i/len(students)
        student_placements[student_math.iloc[i]["Student_Id"]][3] = i/len(students)
        if students.iloc[i]["poverty"] == 1:  # categorical
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[0]
        else:
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[1]
    return student_placements

student_places = load_static_student_rankings(student_df)
student_places[random.choice(list(student_df["Student_Id"].unique()))]

Preloading student rankings...
Merging student rankings...


100%|████████████████████████████████████████| 71250/71250 [00:27<00:00, 2559.83it/s]


[0, 0.9957754385964912, 0.1521543859649123, 0.046287719298245614, 0.25]

In [25]:
# Note: higher data-rate needed to complete: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10 

def rank_students(school_ids, school_prefs, students, student_placements, verbose=True):
    """ creates school rankings structure based on phenotypical criteria:
    Criterion A) District
    Criterion B) Lottery (some should be lottery only)
    Criterion C) ELA+Math Score
    Criterion D) Poverty Index (proxy for race/gender)
    Criterion E) ...
    
    school_ids = list of all schools
    school_prefs = {school_id: preference list} (only if they have them)
    students = dataframe of student info (only need dynamic rankings and id)
    student_placements = {student_id: inverse_percentile_per_criterion }
    
    student_avg_rank = {student_id: mean(rankings)}  aka proxy for how requested this student is on avg (used for distribution checking later on)
    
    - derive unique formula for each school to rate preferences
        ex: total_rank = District_rank*10^2, Math_rank*10^1, ELA_rank*10^1, Lottery_rank*10^0
    - TODO: generate buckets for scores based on distribution
    
    Note: priority seats passed to gale-shapley code
    Note: O(school*students) runtime (aka order of 10mil calculations aka slow af)
    """
    
    school_rankings = {}
    student_ids = list(students["Student_Id"].unique())
    student_rank_info = dict([[sid, []] for sid in student_ids])
    
    # TODO: add code to inject 1 student into pre-simulated tables
    
    # create preferences per school
    if verbose:
        print("Loading rankings per school...")
    test_out = 0
    for school in tqdm(school_ids):
        test_out += 1
        if school in school_prefs:  # if preference specified
            if school_prefs[school][0] != 0:  # handle dynamic rankings (district)
                my_dist = lambda dist: bounds[0] if dist == get_district[school] else bounds[1]
                for i, row in students.iterrows():
                    student_placements[row["Student_Id"]][0] = my_dist(row["Residential_District"])
                
            # creating combined ranking
            combined_df = students[["Student_Id"]].copy(deep=True)
            combined = []
            for i, row in combined_df.iterrows():  # for each student
                score = 0
                for j, prio in enumerate(school_prefs[school]):  # for each preference
                    if prio > 0:
                        score += student_placements[row["Student_Id"]][j] * (10**prio)
                if (test_out%100)==2 and verbose and i == 5:  # testing
                    print("Score =", score, "for...")
                    print("Student:", student_placements[row["Student_Id"]])
                    print("School:", school_prefs[school])
                combined.append(score)
            combined_df["Combination_Rank"] = combined
            combined_df = combined_df.sort_values("Combination_Rank", ascending=True).reset_index(drop=True)
            school_rankings[school] = list(combined_df["Student_Id"])
            for i, this_student in enumerate(school_rankings[school]):
                student_rank_info[this_student].append(i+1)
            
        else:  # if no preference specified, randomize
            np.random.shuffle(student_ids)  # randomize id's
            school_rankings[sid] = student_ids

    print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)][:5], "...")
    
    # averaging student rankings
    student_avg_rank = {}
    net_avg = []  # sanity check on averages (should be approx median of population size)
    for sid, ranks in student_rank_info.items():
        this_mean = np.mean(ranks)
        student_avg_rank[sid] = this_mean
        net_avg.append(this_mean)
    print("\nThe Average Mean-Ranking for each student is", np.mean(net_avg), "\ngiven", len(student_ids), "students...")
        
    return school_rankings, student_placements, student_avg_rank

school_rankings, placements, student_avg_rank = rank_students(school_ids, school_preferences, student_df, student_places)

Loading rankings per school...


  0%|                                                | 1/437 [00:03<22:46,  3.13s/it]

Score = 83562.58771929824 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [1, 5, 0, 0, 0]


 23%|██████████▋                                   | 101/437 [09:32<32:58,  5.89s/it]

Score = 75008.35550877193 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [5, 1, 0, 0, 0]


 46%|█████████████████████▏                        | 201/437 [18:38<18:10,  4.62s/it]

Score = 833.4646315789474 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 2, 1]


 69%|███████████████████████████████▋              | 301/437 [39:48<10:52,  4.80s/it]

Score = 48363.43094736842 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 5, 1]


 92%|██████████████████████████████████████████▏   | 401/437 [48:57<03:36,  6.01s/it]

Score = 48363.43094736842 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 5, 1]


100%|██████████████████████████████████████████████| 437/437 [52:27<00:00,  7.20s/it]

Example School Ranking
 ['student_5132', 'student_1984', 'student_68434', 'student_65709', 'student_37654'] ...





In [26]:
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings.npy", school_rankings)
print("saved at", "/Data/Generated/school_rankings.npy")
print()
print(len(student_df), "students ranked alongside", len(school_rankings), "schools")

saved at /Data/school_rankings.npy


## Task 4: Check the distribution
> look at students per school to ensure the algorithm worked properly

In [None]:
# TODO collinearity matrix between ranking and student attributes like test scores/lottery number
# Note: gale-shapley priority => 'set asside'
# TODO compute post-ranking statistics

# INTENDED OUTPUT

### student_rankings = {student_id : [school_ids_ranked]}

### school_rankings = {school ids : [student_ids_ranked]}