# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

## Task 1: Load student demographics and preferences and join

In [2]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [3]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


In [4]:
# MAKING STUDENT_RANKINGS STRUCTURE

student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:400]

unknown_val = "nan"  # placeholder for no rankings
num_student_ranks = 12  # maximum number of rankings a student can have
student_rankings = dict([ [sid, [unknown_val for i in range(num_student_ranks)]] for sid in student_ids])  # initializing dict

cleaner = {}  # keeps track of the maximum ranking of a given student for the cleaning step
for i, row in tqdm(ranking_df.iterrows()):  # Note: this is slower for testing purposes but faster for total runtime
    if not testing or row["Student_Id"] in student_ids:
        student_rankings[row["Student_Id"]][int(row["Rank"])] = row["School"]
        if row["Student_Id"] not in cleaner or cleaner[row["Student_Id"]] < row["Rank"]:
            cleaner[row["Student_Id"]] = row["Rank"]  #

# optional cleaning step
if testing:
    print("Cleaning...")
for sid, rank in tqdm(cleaner.items()):  # more efficient than remove()
    if rank < num_student_ranks:
        student_rankings[sid] = student_rankings[sid][:rank+1]  # +1 due to exclusivity of slicing

print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

492368it [00:26, 18339.91it/s]
100%|█████████████████████████████████████| 71250/71250 [00:00<00:00, 1562706.41it/s]


Example Student Ranking
 ['01M696', '21K540', '22K405']





In [6]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 6.910428070175438


In [6]:
# saving student rankings
np.save(cwd+"/Data/student_rankings.npy", student_rankings)
print("saved at", "/Data/student_rankings.npy")

saved at /Data/student_rankings.npy


### Subtask 1.1: Generate Student Lottery Number

In [4]:
import uuid

lnums = [uuid.uuid4().hex for i in range(len(student_df))]
student_df["Lottery"] = lnums
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000,36d1cbb370bf43aa914ee144acce51cc
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000,295ac2021fbd4d138ad23288138febe7
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425,b32f5f11a7aa4411adab9b2a9173dc03
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818,97693a92afa340ce9185731bbcb2b0d6
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000,3ee697c0a26847719af6314166a0a0bd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603,9b11f9898c164ec08f874a33a40a6c40
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000,f0635e2ae3764941b73b98d5b270f7e0
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303,c493a883596844feadea5e4bcf2efa5d
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000,277b51c8e60a47f8b229844c719e044a


## Task 2: Aggregate School Codes & Generate Preferences

In [5]:
""" IDEA:
    - generate a spreadsheet of what preferences schools have and rank the metrics
        ex: (District=1, Math=2, ELA=2, Lottery=3, Poverty=0) ==> District > Math = ELA > Lottery
"""
school_ids = list(ranking_df["School"].unique())

In [6]:
import random

schools_criterion = ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index"]
district_school = [5, 1, 0, 0, 0]
balanced_school = [3, 1, 2, 2, 1]
stem_school =     [3, 1, 2, 5, 1]
lottery_school =  [1, 5, 0, 0, 0]
elite_school =    [0, 1, 4, 4, 0]
school_profiles = [district_school, balanced_school, stem_school, lottery_school, elite_school]

def rand_pref():
    """ generate a random school preference profile """
    return random.choice(school_profiles)

school_preferences = dict([[school, rand_pref()] for school in school_ids])
school_preferences[random.choice(school_ids)]

[1, 5, 0, 0, 0]

In [7]:
# adding district (TODO: make more accurate)
get_district = dict([[sid, random.choice(student_df["Residential_District"].unique())] for sid in school_ids])
get_district[random.choice(school_ids)]

'Residential District 07'

## Task 3: Create more Complex School Ranking

### Subtask 3.1: Derive typical phenotypes for school preferences based on scoring (ei schools that have gpa buckets/cutoffs and schools that prefer certain subjects over others)
> Note: the goal is to simply be more accurate than a random lottery

### Subtask 3.2 Create create model for top 3 phenotypes and recalculate school_rankings

### Subtask 3.3 Add student lottery number (with hex conversion) and weight school-preference lottery by it

In [22]:
bounds = [0.25, 0.75]  # upper and lower bound for categorical properties like poverty & district (arbitrarily top 25% and bottom 75%)

def load_static_student_rankings(students, verbose=True):
    """ creates a dict of normalized rankings for each trait per school:
     {student_id: [criteria_a_ranking, criteria_b_ranking, ...], ...}
     """
    student_ids = list(students["Student_Id"].unique())
    
    # preload static rankings
    students = students.sample(frac=1).reset_index(drop=True)  # shuffle indicies
    if verbose:
        print("Preloading student rankings...")
    student_lottery = students[["Student_Id", "Lottery"]].copy(deep=True).sort_values("Lottery", ascending=True).reset_index(drop=True)
    student_ela = students[["Student_Id", "ELA_score"]].copy(deep=True).sort_values("ELA_score", ascending=False).reset_index(drop=True)
    student_math = students[["Student_Id", "Math_score"]].copy(deep=True).sort_values("Math_score", ascending=False).reset_index(drop=True)
    
    # merge rankings into a dict for quicker access
    if verbose:
        print("Merging student rankings...")
    student_placements = dict([[sid, [0, 0, 0, 0, 0]] for sid in student_ids])
    for i in tqdm(range(len(students))):  # add normalized ranking to placement
        # Note: district is dynamic and will be done during ranking function below
        student_placements[student_lottery.iloc[i]["Student_Id"]][1] = i/len(students)
        student_placements[student_ela.iloc[i]["Student_Id"]][2] = i/len(students)
        student_placements[student_math.iloc[i]["Student_Id"]][3] = i/len(students)
        if students.iloc[i]["poverty"] == 1:  # categorical
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[0]
        else:
            student_placements[students.iloc[i]["Student_Id"]][4] = bounds[1]
    return student_placements

student_places = load_static_student_rankings(student_df)
student_places[random.choice(list(student_df["Student_Id"].unique()))]

Preloading student rankings...
Merging student rankings...


100%|████████████████████████████████████████| 71250/71250 [00:27<00:00, 2559.83it/s]


[0, 0.9957754385964912, 0.1521543859649123, 0.046287719298245614, 0.25]

In [None]:
# Note: higher data-rate needed to complete: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10 

def rank_students(school_ids, school_prefs, students, student_placements, verbose=True):
    """ creates school rankings structure based on phenotypical criteria:
    Criterion A) District
    Criterion B) Lottery (some should be lottery only)
    Criterion C) ELA+Math Score
    Criterion D) Poverty Index (proxy for race/gender)
    Criterion E) ...
    
    school_ids = list of all schools
    school_prefs = {school_id: preference list} (only if they have them)
    students = dataframe of student info (only need dynamic rankings and id)
    student_placements = {student_id: inverse_percentile_per_criterion }
    
    - derive unique formula for each school to rate preferences
        ex: total_rank = District_rank*10^2, Math_rank*10^1, ELA_rank*10^1, Lottery_rank*10^0
    - TODO: generate buckets for scores based on distribution
    
    Note: priority seats passed to gale-shapley code
    Note: O(school*students) runtime (aka order of 10mil calculations aka slow af)
    """
    
    school_rankings = {}
    student_ids = list(students["Student_Id"].unique())
    
    # create preferences per school
    if verbose:
        print("Loading rankings per school...")
    test_out = 0
    for school in tqdm(school_ids):
        test_out += 1
        if school in school_prefs:  # if preference specified
            if school_prefs[school][0] != 0:  # handle dynamic rankings (district)
                my_dist = lambda dist: bounds[0] if dist == get_district[school] else bounds[1]
                for i, row in students.iterrows():
                    student_placements[row["Student_Id"]][0] = my_dist(row["Residential_District"])
                
            # creating combined ranking
            combined_df = students[["Student_Id"]].copy(deep=True)
            combined = []
            for i, row in combined_df.iterrows():  # for each student
                score = 0
                for j, prio in enumerate(school_prefs[school]):  # for each preference
                    if prio > 0:
                        score += student_placements[row["Student_Id"]][j] * (10**prio)
                if (test_out%100)==2 and verbose and i == 5:  # testing
                    print("Score =", score, "for...")
                    print("Student:", student_placements[row["Student_Id"]])
                    print("School:", school_prefs[school])
                combined.append(score)
            combined_df["Combination_Rank"] = combined
            combined_df = combined_df.sort_values("Combination_Rank", ascending=True).reset_index(drop=True)
            school_rankings[school] = list(combined_df["Student_Id"])
            
        else:  # if no preference specified, randomize
            np.random.shuffle(student_ids)  # randomize id's
            school_rankings[sid] = student_ids

    print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)][:5], "...")
    return school_rankings, student_placements

school_rankings, placements = rank_students(school_ids, school_preferences, student_df, student_places)

Loading rankings per school...


  0%|                                                | 1/437 [00:03<22:46,  3.13s/it]

Score = 83562.58771929824 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [1, 5, 0, 0, 0]


 23%|██████████▋                                   | 101/437 [09:32<32:58,  5.89s/it]

Score = 75008.35550877193 for...
Student: [0.75, 0.8355508771929825, 0.2503157894736842, 0.47577543859649124, 0.25]
School: [5, 1, 0, 0, 0]


 30%|█████████████▉                                | 132/437 [12:20<31:36,  6.22s/it]

In [None]:
# saving school rankings
np.save(cwd+"/Data/school_rankings.npy", school_rankings)
print("saved at", "/Data/school_rankings.npy")

In [None]:
print(len(student_rankings), "students ranked alongside", len(school_rankings), "schools")

# INTENDED OUTPUT

### student_rankings = {student_id : [school_ids_ranked]}

### school_rankings = {school ids : [student_ids_ranked]}