# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

## Task 1: Load student demographics and preferences and join

In [2]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [3]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


In [4]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:400]

unknown_val = "nan"  # placeholder for no rankings
num_student_ranks = 12  # maximum number of rankings a student can have
student_rankings = dict([ [sid, [unknown_val for i in range(num_student_ranks)]] for sid in student_ids])  # initializing dict

cleaner = {}  # keeps track of the maximum ranking of a given student for the cleaning step
for i, row in tqdm(ranking_df.iterrows()):  # Note: this is slower for testing purposes but faster for total runtime
    if not testing or row["Student_Id"] in student_ids:
        student_rankings[row["Student_Id"]][int(row["Rank"])] = row["School"]
        if row["Student_Id"] not in cleaner or cleaner[row["Student_Id"]] < row["Rank"]:
            cleaner[row["Student_Id"]] = row["Rank"]  #

# optional cleaning step
if testing:
    print("Cleaning...")
for sid, rank in tqdm(cleaner.items()):  # more efficient than remove()
    if rank < num_student_ranks:
        student_rankings[sid] = student_rankings[sid][:rank+1]  # +1 due to exclusivity of slicing

print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

492368it [00:26, 18339.91it/s]
100%|█████████████████████████████████████| 71250/71250 [00:00<00:00, 1562706.41it/s]


Example Student Ranking
 ['01M696', '21K540', '22K405']





In [6]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 6.910428070175438


In [6]:
# saving student rankings
np.save(cwd+"/Data/student_rankings.npy", student_rankings)
print("saved at", "/Data/student_rankings.npy")

saved at /Data/student_rankings.npy


## Task 2: Aggregate School Codes & Generate Random Rankings

In [5]:
school_ids = list(ranking_df["School"].unique())
school_rankings = {}
for sid in tqdm(school_ids):
    np.random.shuffle(student_ids)  # randomize id's
    school_rankings[sid] = student_ids
    
print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)][:5], "...")

100%|█████████████████████████████████████████████| 437/437 [00:01<00:00, 406.79it/s]

Example School Ranking
 ['student_66203', 'student_62810', 'student_40718', 'student_16842', 'student_26971'] ...





In [8]:
# saving school rankings
np.save(cwd+"/Data/school_rankings.npy", school_rankings)
print("saved at", "/Data/school_rankings.npy")

saved at /Data/school_rankings.npy


#### INTENDED OUTPUT
student_rankings = {student_id : [school_ids_ranked]}

school_rankings = {school ids : [student_ids_ranked]}

In [9]:
print(len(student_rankings), "students ranked alongside", len(school_rankings), "schools")

71250 students ranked alongside 437 schools


## Task 3: Create more Complex School Ranking

### Subtask 3.1: Derive typical phenotypes for school preferences based on scoring (ei schools that have gpa buckets/cutoffs and schools that prefer certain subjects over others)
> Note: the goal is to simply be more accurate than a random lottery

### Subtask 3.2 Create create model for top 3 phenotypes and recalculate school_rankings

### Subtask 3.3 Add student lottery number (with hex conversion) and weight school-preference lottery by it

In [10]:
def rank_students(school_ids, school_preferences, students):
    """ creates school rankings structure based on phenotypical criteria:
    Criterion A) District
    Criterion B) Lottery (some should be lottery only)
    Criterion C) Math+ELA Score
    Criterion D) Poverty Index (proxy for race/gender)
    """
    
    # basic randomizer variant
    student_ids = list(students["Student_Id"].unique())
    school_rankings = {}
    for sid in tqdm(school_ids):
        np.random.shuffle(student_ids)  # randomize id's
        school_rankings[sid] = student_ids
    
    print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)][:5], "...")
    return school_rankings

test_school_rankings = rank_students(list(ranking_df["School"].unique()), {}, student_df)

100%|█████████████████████████████████████████████| 437/437 [00:00<00:00, 491.09it/s]

Example School Ranking
 ['student_18678', 'student_35755', 'student_61449', 'student_60378', 'student_3546'] ...





In [None]:
# using low-income as a proxy for race