# Main Document to Load and Traverse Data

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [36]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

cwd = os.getcwd()
testing = True  # ******

## Schema:
Nutrition Label:
- School Label:
  - Name
  - ID
  - AP courses offered
  - Location
  - Subject Specialization
  - Accomodations
  - Admission Preferences
   - gpa cutoff
   - zone
   - subject specification


- Student Label:
  - Name
  - Location
  - Zone
  - School ID
  - Subject preference
  - AP course preferences

## Task 1: Load student demographics and preferences and join

In [37]:
student_df = pd.read_csv(cwd + "/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [38]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


In [40]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:100]

student_rankings = {}
for sid in tqdm(student_ids):  # use database filtering and sorting to get individual student's ranks
    temp = ranking_df[ranking_df["Student_Id"]==sid]
    temp.sort_values("Rank", inplace=True, ascending=True)
    student_rankings[sid] = list(temp["School"])

print("Example Student Ranking\n", student_rankings[np.random.choice(school_ids)])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.sort_values("Rank", inplace=True, ascending=True)
100%|██████████████████████████████████████████████| 100/100 [00:02<00:00, 33.91it/s]


{'student_68963': ['02M411',
  '02M376',
  '02M316',
  '02M438',
  '01M448',
  '02M298',
  '03M402'],
 'student_32259': ['01M696',
  '02M412',
  '02M416',
  '02M411',
  '02M294',
  '01M448',
  '01M292',
  '02M298'],
 'student_57833': ['25Q281',
  '02M411',
  '02M543',
  '02M418',
  '02M298',
  '02M438',
  '01M696',
  '02M422',
  '05M362'],
 'student_30857': ['02M543',
  '02M551',
  '02M414',
  '02M439',
  '04M435',
  '02M288',
  '01M539',
  '01M450',
  '03M541',
  '02M294'],
 'student_22462': ['02M605',
  '02M400',
  '03M541',
  '02M418',
  '02M294',
  '02M615',
  '02M439'],
 'student_21897': ['05M362',
  '02M615',
  '01M448',
  '02M414',
  '01M539',
  '02M376',
  '01M696',
  '02M600'],
 'student_31089': ['32K168', '03M479', '01M539', '02M407', '02M139', '02M298'],
 'student_63688': ['01M450',
  '02M615',
  '01M448',
  '02M418',
  '02M605',
  '02M414',
  '02M416',
  '02M545',
  '27Q309',
  '02M282'],
 'student_17797': ['02M308', '01M539', '01M292', '15K684', '02M413', '07X625'],
 'stud

In [21]:
# saving student rankings
np.save(cwd+"/Data/student_rankings.npy", student_rankings)
print("saved at", "/Data/student_rankings.npy")

saved at /Data/student_rankings.npy


## Task 2: Aggregate School Codes & Generate Random Rankings

In [43]:
school_ids = list(ranking_df["School"].unique())
school_rankings = {}
for sid in tqdm(school_ids):
    np.random.shuffle(student_ids)  # randomize id's
    school_rankings[sid] = student_ids
    
print("Example School Ranking\n", school_rankings[np.random.choice(school_ids)])

100%|██████████████████████████████████████████| 437/437 [00:00<00:00, 286392.32it/s]


In [34]:
# saving school rankings
np.save(cwd+"/Data/school_rankings.npy", school_rankings)
print("saved at", "/Data/school_rankings.npy")

saved at /Data/school_rankings.npy


### Subtask 2.1: Export

#### INTENDED OUTPUT
student_rankings = {student_id : [school_ids_ranked]}

school_rankings = {school ids : [student_ids_ranked]}

[student_rankings, school_rankings]