# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [1]:
import pandas as pd
import numpy as np
import os
import random

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******
# TODO: make 1shot code simulation(#schools, #students) = resulting selection (Note: seats, student # choices, scores, etc. can be assumed)

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

# Task 1) Load student demographics and preferences

In [2]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [3]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


# Task 2) Generate & Export Student Preferences
> Note: decide how many schools each student should rank and which preference

### Case 1: full random with no weight
### *Case 2: full random, ordered by popularity buckets
### Case 3: popularity_bucket-weighted randomness
### *Case 4: popularity_bucket-weighted randomness, ordered by popularity buckets

In [4]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:200]

# GET NUMBER OF SCHOOLS RANKED PER STUDENTS
full_list = False  # **students rank 12 schools, ELSE students rank schools based on real-world distribution
student_school_counts = {}  # student_id: #schools to choose
max_num_schools = 12  # maximum number of rankings a student can have
if full_list:
    student_school_counts = dict([[sid, max_num_schools] for sid in student_ids])
else:
    student_school_counts = dict([[sid, 0] for sid in student_ids])
    for i, row in tqdm(ranking_df.iterrows()):  # for each ranking, add 1 to student
        assert row["Student_Id"] in student_school_counts, row["Student_Id"]
        student_school_counts[row["Student_Id"]] += 1

# show descriptive stats (sanity check)
pd.DataFrame(student_school_counts.values()).describe()

492368it [00:28, 17157.04it/s]


Unnamed: 0,0
count,71250.0
mean,6.910428
std,2.187714
min,1.0
25%,6.0
50%,7.0
75%,8.0
max,12.0


In [5]:
school_ids = list(ranking_df["School"].unique())

# MAKING STUDENT_RANKINGS STRUCTURE
unknown_val = "nan"  # placeholder for no rankings
student_rankings = dict([ [sid, [unknown_val for i in range(max_num_schools)]] for sid in student_ids])  # initializing dict

for sid in tqdm(student_rankings.keys()):
    # CASE 1: RANDOM IMPLEMENTATION
    student_rankings[sid] = np.random.choice(school_ids, size=student_school_counts[sid], replace=False).tolist()

    # TODO IMPLEMENT CASE 2 AND 3 AND 4 ---> CHECK POPULARITY RATE, WHAT PERCENTAGE OF SYNTHETIC STUDENTS HAVE APPLIED
    # ARE THE MOST POPULAR SCHOOLS GETTING THE MOST APPLICATIONS (num applicants) -- sanity check for simulation results
print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

100%|████████████████████████████████████████| 71250/71250 [00:09<00:00, 7413.05it/s]


Example Student Ranking
 ['17K546']





In [6]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 6.910428070175438


In [7]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage1.npy", student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage1.npy")

saved at /Data/Generated/student_rankings_stage1.npy


## Subtask) Deriving School Popularity via True Applicant Rate

In [8]:
""" Data: https://github.com/KoraSHughes/DataLife/blob/main/BackEnd/Data/school_directory.xlsx
DV-EF = applicants
EG-ER = applicant/seats
MN-MY = total seats

TODO parse csv, used tommaso's structure - derive popularity for stage 2
"""
# placeholder data
school_to_pop = dict([[sid, random.randint(1, 5)] for sid in school_ids])
school_to_pop  # Note: dict is more efficient than dataframe for sorting

{'02M411': 5,
 '02M376': 2,
 '02M316': 2,
 '02M438': 1,
 '01M448': 2,
 '02M298': 4,
 '03M402': 4,
 '01M696': 1,
 '02M412': 2,
 '02M416': 2,
 '02M294': 4,
 '01M292': 3,
 '25Q281': 2,
 '02M543': 5,
 '02M418': 5,
 '02M422': 3,
 '05M362': 2,
 '02M551': 3,
 '02M414': 4,
 '02M439': 2,
 '04M435': 3,
 '02M288': 3,
 '01M539': 2,
 '01M450': 3,
 '03M541': 1,
 '02M605': 1,
 '02M400': 4,
 '02M615': 4,
 '02M600': 2,
 '32K168': 5,
 '03M479': 2,
 '02M407': 4,
 '02M139': 1,
 '02M545': 2,
 '27Q309': 2,
 '02M282': 5,
 '02M308': 3,
 '15K684': 2,
 '02M413': 1,
 '07X625': 2,
 '30Q580': 4,
 '02M580': 3,
 '02M305': 1,
 '24Q560': 4,
 '13K605': 5,
 '02M534': 1,
 '02M135': 5,
 '21K690': 1,
 '06M540': 2,
 '02M489': 5,
 '02M392': 4,
 '01M515': 1,
 '06M346': 5,
 '04M555': 2,
 '27Q260': 2,
 '02M374': 5,
 '02M630': 1,
 '02M546': 5,
 '22K555': 2,
 '04M610': 1,
 '10X439': 1,
 '05M369': 1,
 '24Q299': 3,
 '20K445': 2,
 '02M419': 1,
 '02M399': 1,
 '02M533': 1,
 '02M260': 2,
 '13K439': 3,
 '32K556': 1,
 '13K527': 2,
 '13K3

In [9]:
# save generated popularity
translation = [[sid, pop] for sid, pop in school_to_pop.items()]
np.save(cwd+"/Data/Generated/school_demographics.npy", pd.DataFrame(translation, columns=["Name", "Popularity"]))
print("saved at", "/Data/Generated/school_demographics.npy")

saved at /Data/Generated/school_demographics.npy


In [10]:
# add popularity-ordering to ranking
s2_student_rankings = {}
for sid, ranks in tqdm(student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: tmp_dict[x], reverse=True)  # sort by dict vals
    s2_student_rankings[sid] = new_ranks  # append new rank

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 201241.18it/s]


In [11]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage2.npy", s2_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage2.npy")

saved at /Data/Generated/student_rankings_stage2.npy


In [17]:
# add popularity weighted choice
s3_student_rankings = {}
weighted_schools = []
for school, prio in tqdm(school_to_pop.items()):  # generate school list weighted by popularity
    for i in range(prio):
        weighted_schools.append(school)
        
for sid, ranks in tqdm(student_rankings.items()):  # choose from weighted list
    new_ranks = np.random.choice(weighted_schools, size=student_school_counts[sid], replace=False).tolist()
    s3_student_rankings[sid] = new_ranks

100%|██████████████████████████████████████████| 437/437 [00:00<00:00, 692710.07it/s]
100%|████████████████████████████████████████| 71250/71250 [00:23<00:00, 3058.38it/s]


In [18]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage3.npy", s3_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage3.npy")

saved at /Data/Generated/student_rankings_stage3.npy


In [19]:
# add popularity weighted choice AND popularity-ordering to ranking
s4_student_rankings = {}
for sid, ranks in tqdm(s3_student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: school_to_pop[x], reverse=True)  # sort by dict vals
    s4_student_rankings[sid] = new_ranks  # append new rank

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 227640.32it/s]


In [20]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage4.npy", s4_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage4.npy")

saved at /Data/Generated/student_rankings_stage4.npy


In [23]:
import uuid
# adding lotterly number
lnums = [uuid.uuid4().hex for i in range(len(student_df))]
student_df["Lottery"] = lnums
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000,9e18dbb709934b65b7716e8d163068f6
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000,fb47ffbb329c4a04a0fd93a855737aba
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425,ae6f9342d3034178846d380edc9da9a9
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818,4a4d4d2aa38e4b8f88142398100f1381
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000,f55fc65d3eda4767a0a43e228d14b6d9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603,348516aef3cd4ea3b790ae3622b9c1f7
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000,866035f5f161433c85bfedf4b1ac0ace
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303,8f0304e27f0a421e9969f492da772f9a
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000,143ddaa0e8d248ae8fc138cf19bf5c8d


In [30]:
# adding gpa, seat, and screen attributes

def add_fake_gpa(students):
    """ adds a gpa attribute based on the students ELA and Math scores """
    students["fake_gpa"] = students["Math_score"] + students["ELA_score"]
    max_score = max(students["fake_gpa"])
    students["fake_gpa"] = students["fake_gpa"].apply(lambda score: round(4*score/max_score, 2))  # gpa-normalized
    return students
    
def add_edopt(students):
    """ adds gpa percentile in 3rds for ed opt schools """
    lower = students["fake_gpa"].quantile(1/3)
    upper = students["fake_gpa"].quantile(2/3)
    seated = lambda x: 1 if x >= upper else (2 if x >= lower else 3)
    students["seat"] = students["fake_gpa"].apply(seated)
    return students

def add_screen(students):
    """ adds gpa percentile in 5ths for screen schools """
    lower1 = students["fake_gpa"].quantile(1/5)
    lower2 = students["fake_gpa"].quantile(2/5)
    upper1 = students["fake_gpa"].quantile(3/5)
    upper2 = students["fake_gpa"].quantile(4/5)
    def screened(gpa):
        if gpa >= upper2:
            return 1
        elif gpa >= upper1:
            return 2
        elif gpa >= lower2:
            return 3
        elif gpa >= lower1:
            return 4
        else:
            return 5
        
    students["screen"] = students["fake_gpa"].apply(lambda x: screened(x))
    return students

student_df = add_fake_gpa(student_df)
student_df = add_edopt(student_df)
student_df = add_screen(student_df)
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,...,0,0,0,30Q127,2.000000,3.125000,9e18dbb709934b65b7716e8d163068f6,2.29,2,3
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,...,0,0,0,27Q137,2.153846,2.750000,fb47ffbb329c4a04a0fd93a855737aba,2.19,2,4
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,...,0,0,0,24Q061,2.846154,1.753425,ae6f9342d3034178846d380edc9da9a9,2.05,3,4
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,...,0,0,0,02M114,4.275862,4.181818,4a4d4d2aa38e4b8f88142398100f1381,3.77,1,1
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,...,0,0,0,02M312,4.017241,3.125000,f55fc65d3eda4767a0a43e228d14b6d9,3.19,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,...,0,0,0,01M450,2.000000,1.972603,348516aef3cd4ea3b790ae3622b9c1f7,1.77,3,5
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,...,1,0,0,01M184,3.000000,3.125000,866035f5f161433c85bfedf4b1ac0ace,2.73,2,2
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,...,0,0,0,01M539,2.615385,4.030303,8f0304e27f0a421e9969f492da772f9a,2.96,1,2
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,...,1,0,0,01M188,3.083333,2.000000,143ddaa0e8d248ae8fc138cf19bf5c8d,2.27,2,3


In [31]:
# saving school demographics with updated attributes
np.save(cwd+"/Data/Generated/student_demographics.npy", student_df)
print("saved at", "/Data/Generated/student_demographics.npy")

saved at /Data/Generated/student_demographics.npy


## Step 3.5) Add profiles for school rankings

In [32]:
schools_criterion = ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "fake_gpa", "seat", "screen"]

# BASIC SCHOOL PROFILES (intuition)
# Note: these weights are in powers of 10 so they are closer to discrete ordering than pure ratios
district_school = [3, 1, 0, 0, 0, 0, 0, 0]
balanced_school = [3, 1, 2, 2, 1, 0, 0, 0]
stem_school =     [3, 1, 2, 5, 1, 0, 0, 0]  # Note: most schools weight ELA and math equivalently (less realistic)
lottery_school =  [1, 5, 0, 0, 0, 0, 0, 0]
elite_school =    [0, 1, 4, 4, 0, 0, 0, 0]
basic_school_profiles = [district_school, balanced_school, stem_school, lottery_school, elite_school]

def basic_rand_pref():
    """ generate a random school preference profile """
    return random.choice(basic_school_profiles)

In [35]:
"""
Open schools: exlcusively lottery
Ed Opt schools: 1/3rd seat for high/medium/low gpa (known) + lottery for inter-group tiebreaker
Screen schools: 5 group distribution of gpa but highest to lowest (known) + lottery tie breaker
Audition/Assessment: some sort of examination with ranking (school provides a score) + lottery tiebreaker

Note: most have set-asides & zoning regulations
Note: careful of skew (students with high seats disproportionally apply more to more popular schools)
# Open, Ed Opt, Screened, Audtion/Assessment
"""
#  ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "gpa", "seat", "screen"]
open_school = ["Lottery"]  # [0, 1, 0, 0, 0, 0, 0, 0]
edopt_school = ["seat", "Lottery"]  # [0, 1, 0, 0, 0, 0, 2, 0]
screen_school = ["screen", "Lottery"]  # [0, 0, 0, 0, 0, 0, 0, 1]
# audition_school = [0, 1, 0, 0, 0, 0, 0]
complex_school_profiles = [open_school, edopt_school, screen_school]

student_df.sort_values(edopt_school, ascending=True)

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
41814,student_38213,Residential District 17,0,1,1,0,0.802,0,1,0,...,0,0,0,17K340,3.333333,3.750000,00043016949748b992872c52f9dcff61,3.16,1,1
54110,student_69632,Residential District 10,0,0,1,0,0.926,0,1,0,...,0,0,0,10X080,3.666667,4.060606,0008429e50224d0fbd6d3fd1e50a3dd8,3.45,1,1
62394,student_19288,Residential District 07,0,1,1,0,0.949,0,1,0,...,0,0,0,07X343,3.083333,3.625000,00087c24850f41b3a1c7bb89e669b1cf,2.99,1,2
48134,student_60971,Residential District 13,0,0,0,0,0.391,0,0,0,...,0,0,0,13K492,4.241379,2.437500,000d44b2824f4c75ad1d8de72a018f59,2.98,1,2
50508,student_57490,Residential District 11,0,1,1,0,0.798,1,0,0,...,0,0,0,11X144,3.583333,3.687500,000d750a87954552806b3d48549bb894,3.24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60853,student_43389,Residential District 08,0,1,1,0,0.837,0,1,0,...,0,0,0,08X467,1.813333,1.876712,ffefb10c65964eb9bfb8786b31b76a19,1.65,3,5
50545,student_1499,Residential District 11,1,0,1,0,0.699,1,0,0,...,0,0,0,11X181,1.826667,1.972603,ffefc7f2bff04133b913b7fd3150f7e8,1.70,3,5
49612,student_48762,Residential District 12,1,1,0,0,0.922,0,1,0,...,0,0,0,12X267,1.786667,2.250000,fff2890fba9140ffbe57c313daeb6a49,1.80,3,5
51811,student_54061,Residential District 11,0,0,0,0,0.767,0,0,0,...,1,0,0,11X529,1.560000,2.812500,fff945c531fe43c4b0a6e37bad998239,1.95,3,4


In [55]:
students_open = student_df.sort_values(open_school, ascending=True)["Student_Id"].tolist()
students_edopt = student_df.sort_values(edopt_school, ascending=True)["Student_Id"].tolist()
students_screen = student_df.sort_values(screen_school, ascending=True)["Student_Id"].tolist()
student_ordering_choices = [students_open, students_edopt, students_screen]

def gen_school_choices(ids, choice_weights):
    assert len(choice_weights) == len(student_ordering_choices), "choices should follow schema:"+str(len(student_ordering_choices))
    weighted_choices = ""  # creates a string representing indicies of the choices
    for i, weight in enumerate(choice_weights):
        for j in range(weight):
            weighted_choices += str(i)
    school_choices = {}  # generate student choices
    for sid in ids:
        choice_type = int(weighted_choices[np.random.randint(len(weighted_choices))])
#         this_choice = student_ordering_choices[choice_type]
        school_choices[sid] = choice_type
    return school_choices

In [None]:
# TODO: ensure order by discrete --> think SQL 'order_by' (next attribute is only looked at for tie-breakers)
# prioirity students always take presidence
# Note: recall you can affect capacity of school
# TODO: add student preferences proportional to school policy
# TODO: add distance from school's popularity
# TODO: take a better look at student preference generation
# TODO: add school district
# get_district = dict([[sid, random.choice(student_df["Residential_District"].unique())] for sid in school_ids])
# get_district[random.choice(school_ids)]

# Next steps: make one-shot function & seperate python file 

## Task 3: Generate School Rankings from profiles
### Subgoal) Create randomized school profiles
#### Case 1: all open school (lottery only)
#### Case 2: add screen and or ed opt policies
#### Case 3: add policy-weighted randomness for variation between schools

In [56]:
# generate profile
all_open = gen_school_choices(school_ids, [1,0,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_open.npy", all_open)
print("saved at", "/Data/Generated/school_rankings_open.npy")

saved at /Data/Generated/school_rankings_open.npy


In [57]:
# generate profile
all_edopt = gen_school_choices(school_ids, [0,1,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_edopt.npy", all_edopt)
print("saved at", "/Data/Generated/school_rankings_edopt.npy")

saved at /Data/Generated/school_rankings_edopt.npy


In [58]:
# generate profile
all_screen = gen_school_choices(school_ids, [0,0,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_screen.npy", all_screen)
print("saved at", "/Data/Generated/school_rankings_screen.npy")

saved at /Data/Generated/school_rankings_screen.npy


In [59]:
# generate profile
all_combo = gen_school_choices(school_ids, [1,1,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_combo.npy", all_combo)
print("saved at", "/Data/Generated/school_rankings_combo.npy")

saved at /Data/Generated/school_rankings_combo.npy


In [61]:
print(len(student_df), "students ranked alongside", len(all_combo), "schools")

71250 students ranked alongside 437 schools


In [None]:
def insert_student(student_data, df):
    """ insert a student into a ranked category """
    # TODO: find a more efficient way of doing this
    df.append(student_data, ignore_index=True)
    return gen_lottery_ranking(df)

In [None]:
#TODO: clean up whats below! use pandas sort_by and a list of attribute names rather than numeric conversions
#      to efficiently deal with rankings

In [22]:
bounds = [0.25, 0.75]  # upper and lower bound for categorical properties like poverty & district (arbitrarily top 25% and bottom 75%)

def load_static_student_rankings(students, verbose=True):
    """ creates a dict of normalized rankings for each trait per school:
     {student_id: [criteria_a_ranking, criteria_b_ranking, ...], ...}
     """
    student_ids = list(students["Student_Id"].unique())
    
    # preload static rankings
    students = students.sample(frac=1).reset_index(drop=True)  # shuffle indicies
    if verbose:
        print("Preloading student rankings...")
    student_lottery = students[["Student_Id", "Lottery"]].copy(deep=True).sort_values("Lottery", ascending=True).reset_index(drop=True)
    student_ela = students[["Student_Id", "ELA_score"]].copy(deep=True).sort_values("ELA_score", ascending=False).reset_index(drop=True)
    student_math = students[["Student_Id", "Math_score"]].copy(deep=True).sort_values("Math_score", ascending=False).reset_index(drop=True)
    
    # merge rankings into a dict for quicker access
    if verbose:
        print("Merging student rankings...")
    student_placements = dict([[sid, [0, 0, 0, 0, 0]] for sid in student_ids])
    for i in tqdm(range(len(students))):  # add normalized ranking to placement
        # Note: district is dynamic and will be done during ranking function below
        student_placements[student_lottery.iloc[i]["Student_Id"]][1] = i/len(students)
        student_placements[student_ela.iloc[i]["Student_Id"]][2] = i/len(students)
        student_placements[student_math.iloc[i]["Student_Id"]][3] = i/len(students)
        if students.iloc[i]["poverty"] == 1:  # categorical
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[0]
        else:
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[1]
    return student_placements

student_places = load_static_student_rankings(student_df)
student_places[random.choice(list(student_df["Student_Id"].unique()))]

Preloading student rankings...
Merging student rankings...


100%|████████████████████████████████████████| 71250/71250 [00:27<00:00, 2559.83it/s]


[0, 0.9957754385964912, 0.1521543859649123, 0.046287719298245614, 0.25]

In [25]:
# Note: higher data-rate needed to complete: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10 

def rank_students(school_ids, school_prefs, students, student_placements, verbose=True):
    """ creates school rankings structure based on phenotypical criteria:
    Criterion A) District
    Criterion B) Lottery (some should be lottery only)
    Criterion C) ELA+Math Score
    Criterion D) Poverty Index (proxy for race/gender)
    Criterion E) ...
    
    school_ids = list of all schools
    school_prefs = {school_id: preference list} (only if they have them)
    students = dataframe of student info (only need dynamic rankings and id)
    student_placements = {student_id: inverse_percentile_per_criterion }
    
    student_avg_rank = {student_id: mean(rankings)}  aka proxy for how requested this student is on avg (used for distribution checking later on)
    
    - derive unique formula for each school to rate preferences
        ex: total_rank = District_rank*10^2, Math_rank*10^1, ELA_rank*10^1, Lottery_rank*10^0
    - TODO: generate buckets for scores based on distribution
    
    Note: priority seats passed to gale-shapley code
    Note: O(school*students) runtime (aka order of 10mil calculations aka slow af)
    """
    
    school_rankings = {}
    student_ids = list(students["Student_Id"].unique())
    student_rank_info = dict([[sid, []] for sid in student_ids])
    
    # TODO: add code to inject 1 student into pre-simulated tables
    
    # create preferences per school
    if verbose:
        print("Loading rankings per school...")
    test_out = 0
    for school in tqdm(school_ids):
        test_out += 1
        if school in school_prefs:  # if preference specified
            if school_prefs[school][0] != 0:  # handle dynamic rankings (district)
                my_dist = lambda dist: bounds[0] if dist == get_district[school] else bounds[1]
                for i, row in students.iterrows():
                    student_placements[row["Student_Id"]][0] = my_dist(row["Residential_District"])
                
            # creating combined ranking
            combined_df = students[["Student_Id"]].copy(deep=True)
            combined = []
            for i, row in combined_df.iterrows():  # for each student
                score = 0
                for j, prio in enumerate(school_prefs[school]):  # for each preference
                    if prio > 0:
                        score += student_placements[row["Student_Id"]][j] * (10**prio)
                if (test_out%100)==2 and verbose and i == 5:  # testing
                    print("Score =", score, "for...")
                    print("Student:", student_placements[row["Student_Id"]])
                    print("School:", school_prefs[school])
                combined.append(score)
            combined_df["Combination_Rank"] = combined
            combined_df = combined_df.sort_values("Combination_Rank", ascending=True).reset_index(drop=True)
            school_rankings[school] = list(combined_df["Student_Id"])
            for i, this_student in enumerate(school_rankings[school]):
                student_rank_info[this_student].append(i+1)
            
        else:  # if no preference specified, randomize
            np.random.shuffle(student_ids)  # randomize id's
            school_rankings[sid] = student_ids

    print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)][:5], "...")
    
    # averaging student rankings
    student_avg_rank = {}
    net_avg = []  # sanity check on averages (should be approx median of population size)
    for sid, ranks in student_rank_info.items():
        this_mean = np.mean(ranks)
        student_avg_rank[sid] = this_mean
        net_avg.append(this_mean)
    print("\nThe Average Mean-Ranking for each student is", np.mean(net_avg), "\ngiven", len(student_ids), "students...")
        
    return school_rankings, student_placements, student_avg_rank

school_rankings, placements, student_avg_rank = rank_students(school_ids, school_preferences, student_df, student_places)

Loading rankings per school...


  0%|                                                | 1/437 [00:03<22:46,  3.13s/it]

Score = 83562.58771929824 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [1, 5, 0, 0, 0]


 23%|██████████▋                                   | 101/437 [09:32<32:58,  5.89s/it]

Score = 75008.35550877193 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [5, 1, 0, 0, 0]


 46%|█████████████████████▏                        | 201/437 [18:38<18:10,  4.62s/it]

Score = 833.4646315789474 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 2, 1]


 69%|███████████████████████████████▋              | 301/437 [39:48<10:52,  4.80s/it]

Score = 48363.43094736842 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 5, 1]


 92%|██████████████████████████████████████████▏   | 401/437 [48:57<03:36,  6.01s/it]

Score = 48363.43094736842 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [3, 1, 2, 5, 1]


100%|██████████████████████████████████████████████| 437/437 [52:27<00:00,  7.20s/it]

Example School Ranking
 ['student_5132', 'student_1984', 'student_68434', 'student_65709', 'student_37654'] ...





In [26]:
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings.npy", school_rankings)
print("saved at", "/Data/Generated/school_rankings.npy")
print()
print(len(student_df), "students ranked alongside", len(school_rankings), "schools")

saved at /Data/school_rankings.npy


## Task 4: Check the distribution
> look at students per school to ensure the algorithm worked properly

In [None]:
# TODO collinearity matrix between ranking and student attributes like test scores/lottery number
# Note: gale-shapley priority => 'set asside'
# TODO compute post-ranking statistics

# INTENDED OUTPUT

### student_rankings = {student_id : [school_ids_ranked]}

### school_rankings = {school ids : [student_ids_ranked]}