# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [1]:
import pandas as pd
import numpy as np
import os
import random

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******
# TODO: make 1shot code simulation(#schools, #students) = resulting selection (Note: seats, student # choices, scores, etc. can be assumed)

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

# Task 1) Load student demographics and preferences

In [2]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [3]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


# Task 2) Generate & Export Student Preferences
> Note: decide how many schools each student should rank and which preference

### Case 1: full random with no weight
### *Case 2: full random, ordered by popularity buckets
### Case 3: popularity_bucket-weighted randomness
### *Case 4: popularity_bucket-weighted randomness, ordered by popularity buckets

In [4]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:200]

# GET NUMBER OF SCHOOLS RANKED PER STUDENTS
full_list = True  # **students rank 12 schools, ELSE students rank schools based on real-world distribution
student_school_counts = {}  # student_id: #schools to choose
max_num_schools = 12  # maximum number of rankings a student can have
if full_list:
    student_school_counts = dict([[sid, max_num_schools] for sid in student_ids])
else:
    student_school_counts = dict([[sid, 0] for sid in student_ids])
    for i, row in tqdm(ranking_df.iterrows()):  # for each ranking, add 1 to student
        assert row["Student_Id"] in student_school_counts, row["Student_Id"]
        student_school_counts[row["Student_Id"]] += 1

# show descriptive stats (sanity check)
pd.DataFrame(student_school_counts.values()).describe()

Unnamed: 0,0
count,71250.0
mean,12.0
std,0.0
min,12.0
25%,12.0
50%,12.0
75%,12.0
max,12.0


In [5]:
school_ids = list(ranking_df["School"].unique())

# MAKING STUDENT_RANKINGS STRUCTURE
unknown_val = "nan"  # placeholder for no rankings
student_rankings = dict([ [sid, [unknown_val for i in range(max_num_schools)]] for sid in student_ids])  # initializing dict

for sid in tqdm(student_rankings.keys()):
    # CASE 1: RANDOM IMPLEMENTATION
    student_rankings[sid] = np.random.choice(school_ids, size=student_school_counts[sid], replace=False).tolist()

    # TODO IMPLEMENT CASE 2 AND 3 AND 4 ---> CHECK POPULARITY RATE, WHAT PERCENTAGE OF SYNTHETIC STUDENTS HAVE APPLIED
    # ARE THE MOST POPULAR SCHOOLS GETTING THE MOST APPLICATIONS (num applicants) -- sanity check for simulation results
print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

100%|████████████████████████████████████████| 71250/71250 [00:09<00:00, 7234.20it/s]



Example Student Ranking
 ['07X427', '13K595', '08X367', '11X455', '29Q265', '19K764', '24Q485', '17K646', '02M427', '15K667', '31R600', '02M418']


In [6]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 12.0


In [7]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage1.npy", student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage1.npy")

saved at /Data/Generated/student_rankings_stage1.npy


## Subtask) Deriving School Popularity via True Applicant Rate

In [8]:
""" Data: https://github.com/KoraSHughes/DataLife/blob/main/BackEnd/Data/school_directory.xlsx
DV-EF = applicants
EG-ER = applicant/seats
MN-MY = total seats

TODO parse csv, used tommaso's structure - derive popularity for stage 2
"""
# placeholder data
school_to_pop = dict([[sid, random.randint(1, 5)] for sid in school_ids])
school_to_pop  # Note: dict is more efficient than dataframe for sorting

{'02M411': 5,
 '02M376': 3,
 '02M316': 3,
 '02M438': 5,
 '01M448': 3,
 '02M298': 5,
 '03M402': 1,
 '01M696': 4,
 '02M412': 4,
 '02M416': 2,
 '02M294': 5,
 '01M292': 4,
 '25Q281': 2,
 '02M543': 2,
 '02M418': 4,
 '02M422': 3,
 '05M362': 2,
 '02M551': 5,
 '02M414': 5,
 '02M439': 5,
 '04M435': 1,
 '02M288': 3,
 '01M539': 3,
 '01M450': 5,
 '03M541': 2,
 '02M605': 3,
 '02M400': 2,
 '02M615': 3,
 '02M600': 5,
 '32K168': 5,
 '03M479': 4,
 '02M407': 3,
 '02M139': 5,
 '02M545': 1,
 '27Q309': 2,
 '02M282': 4,
 '02M308': 5,
 '15K684': 2,
 '02M413': 5,
 '07X625': 3,
 '30Q580': 1,
 '02M580': 3,
 '02M305': 3,
 '24Q560': 4,
 '13K605': 2,
 '02M534': 2,
 '02M135': 2,
 '21K690': 5,
 '06M540': 5,
 '02M489': 3,
 '02M392': 3,
 '01M515': 4,
 '06M346': 5,
 '04M555': 3,
 '27Q260': 1,
 '02M374': 3,
 '02M630': 4,
 '02M546': 1,
 '22K555': 3,
 '04M610': 4,
 '10X439': 3,
 '05M369': 2,
 '24Q299': 4,
 '20K445': 2,
 '02M419': 1,
 '02M399': 5,
 '02M533': 2,
 '02M260': 2,
 '13K439': 1,
 '32K556': 2,
 '13K527': 3,
 '13K3

In [10]:
# save generated popularity
translation = [[sid, pop] for sid, pop in school_to_pop.items()]
np.save(cwd+"/Data/Generated/school_demographics.npy", pd.DataFrame(translation, columns=["Name", "Popularity"]))
print("saved at", "/Data/Generated/school_demographics.npy")

saved at /Data/Generated/school_demographics.npy


In [16]:
# add popularity-ordering to ranking
s2_student_rankings = {}
for sid, ranks in tqdm(student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: school_to_pop[x], reverse=True)  # sort by dict vals
    s2_student_rankings[sid] = new_ranks  # append new rank
s2_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 302157.93it/s]


['08X530',
 '26Q435',
 '08X293',
 '15K429',
 '14K685',
 '17K543',
 '28Q284',
 '13K605',
 '10X351',
 '29Q492',
 '17K745',
 '16K498']

In [17]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage2.npy", s2_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage2.npy")

saved at /Data/Generated/student_rankings_stage2.npy


In [22]:
# add popularity weighted choice
s3_student_rankings = {}
weighted_schools = []
for school, prio in tqdm(school_to_pop.items()):  # generate school list weighted by popularity
    for i in range(prio):
        weighted_schools.append(school)
print("Note: number of schools is", len(weighted_schools), "which should be <", len(school_to_pop), "* 5")
        
for sid, ranks in tqdm(student_rankings.items()):  # choose from weighted list
    new_ranks = np.random.choice(weighted_schools, size=student_school_counts[sid], replace=False).tolist()
    s3_student_rankings[sid] = new_ranks
    
s3_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████████| 437/437 [00:00<00:00, 700386.26it/s]


Note: number of schools is 1283 which should be < 437 * 5


100%|████████████████████████████████████████| 71250/71250 [00:24<00:00, 2942.13it/s]


['11X270',
 '03M479',
 '12X271',
 '04M680',
 '21K540',
 '07X223',
 '08X348',
 '19K659',
 '15K448',
 '27Q650',
 '09X263',
 '07X600']

In [23]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage3.npy", s3_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage3.npy")

saved at /Data/Generated/student_rankings_stage3.npy


In [24]:
# add popularity weighted choice AND popularity-ordering to ranking
s4_student_rankings = {}
for sid, ranks in tqdm(s3_student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: school_to_pop[x], reverse=True)  # sort by dict vals
    s4_student_rankings[sid] = new_ranks  # append new rank
    
s4_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 179233.55it/s]


['02M531',
 '06M467',
 '20K485',
 '32K168',
 '10X565',
 '17K546',
 '02M459',
 '10X237',
 '25Q241',
 '02M305',
 '02M303',
 '16K688']

In [25]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage4.npy", s4_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage4.npy")

saved at /Data/Generated/student_rankings_stage4.npy


In [26]:
import uuid
# adding lotterly number
lnums = [uuid.uuid4().hex for i in range(len(student_df))]
student_df["Lottery"] = lnums
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000,96281f752bc944df923f8dc38ae8e4fb
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000,0cefb26e60b14ea18e9186b4c4586fd3
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425,7b2d4b810b584e2faa3d02d951c86f3b
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818,d4ac0c53efd448dda087d29f015e5efd
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000,9b310761b7e94132b7e7395b8de8f73b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603,91a8484bd2f0449cbb69fb8c798f7694
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000,e1ef6dd72e404360bbce80418e68d77f
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303,69f0d7c5562a4ebdbe166c75c84c37b7
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000,a0ce84c7e4be4ef7b9675d97eab1ec36


In [27]:
# adding gpa, seat, and screen attributes

def add_fake_gpa(students):
    """ adds a gpa attribute based on the students ELA and Math scores """
    students["fake_gpa"] = students["Math_score"] + students["ELA_score"]
    max_score = max(students["fake_gpa"])
    students["fake_gpa"] = students["fake_gpa"].apply(lambda score: round(4*score/max_score, 2))  # gpa-normalized
    return students
    
def add_edopt(students):
    """ adds gpa percentile in 3rds for ed opt schools """
    lower = students["fake_gpa"].quantile(1/3)
    upper = students["fake_gpa"].quantile(2/3)
    seated = lambda x: 1 if x >= upper else (2 if x >= lower else 3)
    students["seat"] = students["fake_gpa"].apply(seated)
    return students

def add_screen(students):
    """ adds gpa percentile in 5ths for screen schools """
    lower1 = students["fake_gpa"].quantile(1/5)
    lower2 = students["fake_gpa"].quantile(2/5)
    upper1 = students["fake_gpa"].quantile(3/5)
    upper2 = students["fake_gpa"].quantile(4/5)
    def screened(gpa):
        if gpa >= upper2:
            return 1
        elif gpa >= upper1:
            return 2
        elif gpa >= lower2:
            return 3
        elif gpa >= lower1:
            return 4
        else:
            return 5
        
    students["screen"] = students["fake_gpa"].apply(lambda x: screened(x))
    return students

student_df = add_fake_gpa(student_df)
student_df = add_edopt(student_df)
student_df = add_screen(student_df)
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,...,0,0,0,30Q127,2.000000,3.125000,96281f752bc944df923f8dc38ae8e4fb,2.29,2,3
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,...,0,0,0,27Q137,2.153846,2.750000,0cefb26e60b14ea18e9186b4c4586fd3,2.19,2,4
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,...,0,0,0,24Q061,2.846154,1.753425,7b2d4b810b584e2faa3d02d951c86f3b,2.05,3,4
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,...,0,0,0,02M114,4.275862,4.181818,d4ac0c53efd448dda087d29f015e5efd,3.77,1,1
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,...,0,0,0,02M312,4.017241,3.125000,9b310761b7e94132b7e7395b8de8f73b,3.19,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,...,0,0,0,01M450,2.000000,1.972603,91a8484bd2f0449cbb69fb8c798f7694,1.77,3,5
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,...,1,0,0,01M184,3.000000,3.125000,e1ef6dd72e404360bbce80418e68d77f,2.73,2,2
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,...,0,0,0,01M539,2.615385,4.030303,69f0d7c5562a4ebdbe166c75c84c37b7,2.96,1,2
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,...,1,0,0,01M188,3.083333,2.000000,a0ce84c7e4be4ef7b9675d97eab1ec36,2.27,2,3


In [28]:
# saving school demographics with updated attributes
np.save(cwd+"/Data/Generated/student_demographics.npy", student_df)
print("saved at", "/Data/Generated/student_demographics.npy")

saved at /Data/Generated/student_demographics.npy


## Step 3.5) Add profiles for school rankings

In [29]:
schools_criterion = ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "fake_gpa", "seat", "screen"]

# BASIC SCHOOL PROFILES (intuition)
# Note: these weights are in powers of 10 so they are closer to discrete ordering than pure ratios
district_school = [3, 1, 0, 0, 0, 0, 0, 0]
balanced_school = [3, 1, 2, 2, 1, 0, 0, 0]
stem_school =     [3, 1, 2, 5, 1, 0, 0, 0]  # Note: most schools weight ELA and math equivalently (less realistic)
lottery_school =  [1, 5, 0, 0, 0, 0, 0, 0]
elite_school =    [0, 1, 4, 4, 0, 0, 0, 0]
basic_school_profiles = [district_school, balanced_school, stem_school, lottery_school, elite_school]

def basic_rand_pref():
    """ generate a random school preference profile """
    return random.choice(basic_school_profiles)

In [30]:
"""
Open schools: exlcusively lottery
Ed Opt schools: 1/3rd seat for high/medium/low gpa (known) + lottery for inter-group tiebreaker
Screen schools: 5 group distribution of gpa but highest to lowest (known) + lottery tie breaker
Audition/Assessment: some sort of examination with ranking (school provides a score) + lottery tiebreaker

Note: most have set-asides & zoning regulations
Note: careful of skew (students with high seats disproportionally apply more to more popular schools)
# Open, Ed Opt, Screened, Audtion/Assessment
"""
#  ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "gpa", "seat", "screen"]
open_school = ["Lottery"]  # [0, 1, 0, 0, 0, 0, 0, 0]
edopt_school = ["seat", "Lottery"]  # [0, 1, 0, 0, 0, 0, 2, 0]
screen_school = ["screen", "Lottery"]  # [0, 0, 0, 0, 0, 0, 0, 1]
# audition_school = [0, 1, 0, 0, 0, 0, 0]
complex_school_profiles = [open_school, edopt_school, screen_school]

student_df.sort_values(edopt_school, ascending=True)

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
20764,student_49577,Residential District 25,0,0,0,0,0.520,0,0,0,...,1,0,0,25Q025,2.461538,4.000000,000779c2be8f4dfa81731c7c41766c69,2.88,1,2
34962,student_39529,Residential District 20,1,0,1,0,0.335,0,0,0,...,0,0,0,20K686,4.137931,4.030303,0007ca2929134b08b65a4ab96471079e,3.64,1,1
64939,student_31780,Residential District 06,0,1,0,1,0.774,0,1,0,...,0,0,0,06M209,3.416667,4.060606,000c7ab71db549aebba4e6b945013673,3.34,1,1
7053,student_64612,Residential District 30,0,1,1,0,0.609,0,0,0,...,1,0,0,30Q141,4.241379,3.812500,00112d165e4542c3b7cf74b0aef34f79,3.59,1,1
22757,student_65381,Residential District 24,0,1,0,0,0.545,0,0,0,...,0,0,0,24Q087,4.068966,3.812500,00173265d38f4505b3b449fe1850a090,3.52,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56684,student_18836,Residential District 10,0,1,1,0,0.935,0,1,0,...,0,0,0,10X363,2.384615,1.986301,fff6ebf929f6493e9e8f41b0d3e03deb,1.95,3,4
47682,student_44614,Residential District 13,0,1,1,0,0.902,1,0,0,...,0,0,0,13K265,1.840000,2.562500,fffbfccfa795470a8c7e53dd5b69e594,1.96,3,4
55045,student_25370,Residential District 10,1,1,1,0,0.929,0,1,0,...,0,0,0,10X243,1.933333,1.808219,fffd95cb67cc4d2dbf19069a03d0ac92,1.67,3,5
22454,student_6685,Residential District 25,0,1,0,0,0.520,0,0,0,...,0,0,0,25Q194,1.986667,1.616438,fffe552e3d234a359253377196845ad6,1.61,3,5


In [43]:
students_open = student_df.sort_values(open_school, ascending=True)["Student_Id"].tolist()
students_edopt = student_df.sort_values(edopt_school, ascending=True)["Student_Id"].tolist()
students_screen = student_df.sort_values(screen_school, ascending=True)["Student_Id"].tolist()
student_ordering_choices = [students_open, students_edopt, students_screen]

def gen_school_choices(ids, choice_weights):
    assert len(choice_weights) == len(student_ordering_choices), "choices should follow schema:"+str(len(student_ordering_choices))
    weighted_choices = ""  # creates a string representing indicies of the choices
    for i, weight in enumerate(choice_weights):
        weighted_choices += str(i)*weight
    print(weighted_choices)
            
    school_choices = {}  # generate student choices
    for sid in ids:
        choice_type = int(weighted_choices[np.random.randint(len(weighted_choices))])
        school_choices[sid] = student_ordering_choices[choice_type]  # add corresponding choice
    return school_choices

In [None]:
# TODO: ensure order by discrete --> think SQL 'order_by' (next attribute is only looked at for tie-breakers)
# prioirity students always take presidence
# Note: recall you can affect capacity of school
# TODO: add student preferences proportional to school policy
# TODO: add distance from school's popularity
# TODO: take a better look at student preference generation
# TODO: add school district
# get_district = dict([[sid, random.choice(student_df["Residential_District"].unique())] for sid in school_ids])
# get_district[random.choice(school_ids)]

# Next steps: make one-shot function & seperate python file 

## Task 3: Generate School Rankings from profiles
### Subgoal) Create randomized school profiles
#### Case 1: all open school (lottery only)
#### Case 2: add screen and or ed opt policies
#### Case 3: add policy-weighted randomness for variation between schools

In [44]:
# generate profile
all_open = gen_school_choices(school_ids, [1,0,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_open.npy", all_open)
print("saved at", "/Data/Generated/school_rankings_open.npy")

0
saved at /Data/Generated/school_rankings_open.npy


In [45]:
# generate profile
all_edopt = gen_school_choices(school_ids, [0,1,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_edopt.npy", all_edopt)
print("saved at", "/Data/Generated/school_rankings_edopt.npy")

1
saved at /Data/Generated/school_rankings_edopt.npy


In [46]:
# generate profile
all_screen = gen_school_choices(school_ids, [0,0,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_screen.npy", all_screen)
print("saved at", "/Data/Generated/school_rankings_screen.npy")

2
saved at /Data/Generated/school_rankings_screen.npy


In [47]:
# generate profile
all_combo = gen_school_choices(school_ids, [1,1,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_combo.npy", all_combo)
print("saved at", "/Data/Generated/school_rankings_combo.npy")

012
saved at /Data/Generated/school_rankings_combo.npy


In [48]:
print(len(student_df), "students ranked alongside", len(all_combo), "schools")

71250 students ranked alongside 437 schools


In [None]:
def insert_student(student_data, df):
    """ insert a student into a ranked category """
    # TODO: find a more efficient way of doing this
    df.append(student_data, ignore_index=True)
    return gen_lottery_ranking(df)

In [None]:
# TODO collinearity matrix between ranking and student attributes like test scores/lottery number
# TODO: add gale-shapley priority => 'set asside'

# INTENDED OUTPUT

### student_rankings = {student_id : [school_ids_ranked]}

### school_rankings = {school ids : [student_ids_ranked]}