# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [36]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

cwd = os.getcwd()  # current working directory; the "/Data/..." file paths in this notebook are appended to it
testing = True  # when True, the student-ranking cell keeps only the first 100 student ids

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accommodations
  - Admission Preferences
    - gpa cutoff
    - zone
    - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

## Task 1: Load student demographics and preferences and join

In [37]:
# Per-student table (demographic flags, assigned school, Math/ELA scores),
# read from the Data folder under the working directory.
student_info_path = cwd + "/Data/student_info_with_demographics.csv"
student_df = pd.read_csv(student_info_path)
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [38]:
# Student preference table with columns Student_Id, School, Rank and Rating,
# read from the Data folder under the working directory.
student_prefs_path = cwd + "/Data/student_prefs.csv"
ranking_df = pd.read_csv(student_prefs_path)
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


In [49]:
# Build {student_id: [school ranked 0, school ranked 1, ...]} from ranking_df.
student_ids = list(ranking_df["Student_Id"].unique())  # every student that appears in the preferences
if testing:  # subset of data for testing
    student_ids = student_ids[:100]

num_student_ranks = 12  # maximum number of rankings a student can have

# One fixed-length preference list per student; the string "nan" marks a slot
# the student left unranked.
student_rankings = {sid: ["nan"] * num_student_ranks for sid in student_ids}

# Single pass over the preference rows. Looking the student up in the dict is
# O(1) per row (the previous list membership test was O(len(student_ids)) per
# row), and zipping the three columns avoids building a Series for every row
# the way iterrows() does. Rows whose student is not a key (only possible when
# `testing` trimmed student_ids) are skipped.
ranking_rows = zip(ranking_df["Student_Id"], ranking_df["Rank"], ranking_df["School"])
for sid, rank, school in tqdm(ranking_rows, total=len(ranking_df)):
    slots = student_rankings.get(sid)
    if slots is not None:
        # Rank is the 0-based position; a Rank >= num_student_ranks raises IndexError.
        slots[int(rank)] = school

print("Example Student Ranking\n", student_rankings[np.random.choice(student_ids)])

492368it [00:17, 27893.72it/s]

Example Student Ranking
 ['05M362', '02M600', '02M407', '02M411', '02M605', '01M450', '02M533', '02M416', '02M432', '30Q501', '02M615', 'nan']





In [21]:
# Persist the student rankings dict next to the input data.
# NOTE: np.save stores a dict as a pickled 0-d object array, so read it back
# with np.load(path, allow_pickle=True).item().
student_rankings_file = "/Data/student_rankings.npy"
np.save(cwd + student_rankings_file, student_rankings)
print("saved at", student_rankings_file)

saved at /Data/student_rankings.npy


## Task 2: Aggregate School Codes & Generate Random Rankings

In [45]:
# Give every school its own independent random ordering of the students.
school_ids = list(ranking_df["School"].unique())  # every school that appears in the preferences
school_rankings = {}
for school_id in tqdm(school_ids):
    # Shuffle a fresh copy for each school. np.random.shuffle works in place,
    # so storing `student_ids` itself would make every school share one list
    # object (all rankings identical, equal to the last shuffle) and would
    # also scramble the global `student_ids` order.
    shuffled_students = list(student_ids)
    np.random.shuffle(shuffled_students)
    school_rankings[school_id] = shuffled_students

print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)])

100%|██████████████████████████████████████████| 437/437 [00:00<00:00, 238940.27it/s]

Example School Ranking
 ['student_22462', 'student_2835', 'student_45124', 'student_15686', 'student_11209', 'student_58911', 'student_1143', 'student_37786', 'student_21897', 'student_54391', 'student_59638', 'student_33046', 'student_8083', 'student_67953', 'student_32415', 'student_7251', 'student_18662', 'student_37571', 'student_63350', 'student_13290', 'student_14481', 'student_39830', 'student_35343', 'student_56592', 'student_63688', 'student_21813', 'student_60964', 'student_51818', 'student_10194', 'student_12469', 'student_27971', 'student_66654', 'student_26311', 'student_28177', 'student_54954', 'student_61161', 'student_45928', 'student_25011', 'student_47272', 'student_6930', 'student_8017', 'student_57258', 'student_11550', 'student_47663', 'student_32259', 'student_52540', 'student_54004', 'student_16775', 'student_17797', 'student_9202', 'student_66008', 'student_56082', 'student_63062', 'student_3391', 'student_8242', 'student_8419', 'student_63496', 'student_27599',




In [34]:
# Persist the school rankings dict next to the input data.
# NOTE: np.save stores a dict as a pickled 0-d object array, so read it back
# with np.load(path, allow_pickle=True).item().
school_rankings_file = "/Data/school_rankings.npy"
np.save(cwd + school_rankings_file, school_rankings)
print("saved at", school_rankings_file)

saved at /Data/school_rankings.npy


### Subtask 2.1: Export

#### INTENDED OUTPUT
student_rankings = {student_id : [school_ids_ranked]}

school_rankings = {school_id : [student_ids_ranked]}

[student_rankings, school_rankings]