# Weighted Projection
Calculate the weights of proportions of student registering for each course in each semester.

In [1]:
# importing the required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt

os.chdir( os.path.join("..", "..", "..") )

# importing custom modules
from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *

# initializing the DataManager
DM = DataManager()

In [2]:
# importing the data for analysis
df = DM.get_data('EnrollmentFinalStatus', 'pkl', 'processed')
db = DM.get_data('EnrollmentFinalStatus', 'db', 'processed')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59409 entries, 0 to 323686
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   rec_id               59409 non-null  int64         
 1   rec_ext_date         59409 non-null  datetime64[ns]
 2   file_name            59409 non-null  object        
 3   file_index           59409 non-null  int64         
 4   reg_term_code        59409 non-null  object        
 5   reg_term_year        59409 non-null  int64         
 6   reg_term_name        59409 non-null  category      
 7   reg_term_desc        59409 non-null  object        
 8   stu_id               59408 non-null  object        
 9   stu_deg_level        59409 non-null  category      
 10  stu_college          59409 non-null  category      
 11  stu_res              59409 non-null  category      
 12  stu_visa             59409 non-null  category      
 13  stu_bam              59409 non

# Simple Weighted Projection
Taking all students, and calculating weights of proportions of student registering for each course in each semester.

How to improve the model:
- We can filter this data for international students and domestic students separately.
- We can calculate weights for Spring and Fall semesters separately.

## Data

- Admission Term: $\mathscr{at}$
- Registration Term: $\mathscr{rt}$
- Course: $\mathscr{crs}$
- Program: $\mathscr{prog}$
- Number of Students, in a prog, admitting in a semester: $\mathscr{ns_{prog, at}}$
- Weights: $\mathscr{w_{at, rt, crs, prog}}$

In [5]:
df.columns

Index(['rec_id', 'rec_ext_date', 'file_name', 'file_index', 'reg_term_code',
       'reg_term_year', 'reg_term_name', 'reg_term_desc', 'stu_id',
       'stu_deg_level', 'stu_college', 'stu_res', 'stu_visa', 'stu_bam',
       'stu_new_ret', 'stu_dept', 'stu_dept_desc', 'stu_prog_code',
       'stu_prog_level', 'stu_prog_desc', 'stu_admit_term_code',
       'stu_admit_term_year', 'stu_admit_term_name', 'stu_admit_term_desc',
       'crs', 'crs_type', 'crs_credits', 'crs_hours', 'crs_sect',
       'crs_sect_clg', 'crs_sect_modality', 'crs_sect_wiley_ind', 'reg_status',
       'reg_status_date', 'stu_act_reg_ind'],
      dtype='object')

In [32]:
df.stu_prog_desc.value_counts()

MS Data Analytics Engineering     19553
MS Computer Science               15617
MS Applied Info Technology         4387
MS Software Engineering            2761
MS Civil & Infrastructure Engr     2643
MS Electrical Engineering          2375
MS Computer Engineering            2075
MS Information Systems             1745
MS Telecommunications              1314
MS Computer Forensics              1309
MS Infrmatn Security & Assrnce     1306
MS Systems Engineering             1260
MS Statistical Science              908
MS Operations Research              763
MS Cyber Security Engineering       451
MS Digital Forensics                401
MS Bioengineering                   237
MS Biostatistics                    214
MENG GeoConStruct Engineering        90
Name: stu_prog_desc, dtype: int64

In [50]:
# ns_prog,at: Number of Students from a particular program, taking admission on a particular semester, and registering/waitlisting for courses
StudentInputData = db.runQuery("""--sql
    SELECT stu_prog_desc, stu_admit_term_desc, COUNT(DISTINCT stu_id) AS ns_prog_at
    FROM EnrollmentFinalStatus
    WHERE reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**")
    GROUP BY stu_prog_desc, stu_admit_term_desc
    ORDER BY stu_prog_desc, stu_admit_term_code;
""")
StudentInputData.sample(10)

Unnamed: 0,stu_prog_desc,stu_admit_term_desc,ns_prog_at
155,MS Data Analytics Engineering,Fall 2019,170
304,MS Systems Engineering,Spring 2019,5
101,MS Computer Forensics,Spring 2016,5
50,MS Biostatistics,Fall 2021,2
284,MS Statistical Science,Fall 2018,17
265,MS Software Engineering,Fall 2018,25
326,MS Telecommunications,Fall 2019,11
130,MS Computer Science,Fall 2021,247
11,MS Applied Info Technology,Spring 2014,1
104,MS Computer Forensics,Spring 2017,8


In [61]:
# w_at,rt,crs,prog Caclulating the weights for the projection
# admit semester, registration semester, course, student count
weights = db.runQuery("""--sql
    SELECT
        stu_prog_desc, stu_admit_term_desc, crs, reg_term_desc, COUNT(DISTINCT stu_id) AS w_prog_at_crs_rt
    FROM EnrollmentFinalStatus
    WHERE reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**")
    GROUP BY stu_prog_desc, stu_admit_term_desc, crs, reg_term_desc
    ORDER BY stu_prog_desc, stu_admit_term_code, crs, reg_term_code;
""")

weights.head(10)

Unnamed: 0,stu_prog_desc,stu_admit_term_desc,crs,reg_term_desc,w_prog_at_crs_rt
0,MENG GeoConStruct Engineering,Fall 2013,CEIE 526,Spring 2019,1
1,MENG GeoConStruct Engineering,Fall 2013,CEIE 573,Spring 2019,1
2,MENG GeoConStruct Engineering,Fall 2013,CEIE 623,Fall 2018,1
3,MENG GeoConStruct Engineering,Fall 2013,CEIE 639,Fall 2018,1
4,MENG GeoConStruct Engineering,Fall 2013,CEIE 690,Spring 2018,1
5,MENG GeoConStruct Engineering,Fall 2013,CEIE 795,Spring 2018,1
6,MENG GeoConStruct Engineering,Fall 2013,CEIE 795,Fall 2018,1
7,MENG GeoConStruct Engineering,Fall 2014,CEIE 513,Fall 2019,1
8,MENG GeoConStruct Engineering,Fall 2014,CEIE 524,Fall 2017,1
9,MENG GeoConStruct Engineering,Fall 2014,CEIE 524,Spring 2020 - COVID-19,1


To make projections, we multiply the weights (the proportion of students who registered for each course in each semester) by the number of student enrolling in cohorts.

In [None]:
sem_dict ={
    'Spring 2018'   : ['Spring 2018',   'Fall 2018',    'Spring 2019',  'Fall 2019'],
    'Fall 2018'     : ['Fall 2018',     'Spring 2019',  'Fall 2019',    'Spring 2020'],
    'Spring 2019'   : ['Spring 2019',   'Fall 2019',    'Spring 2020',  'Fall 2020'],
    'Fall 2019'     : ['Fall 2019',     'Spring 2020',  'Fall 2020',    'Spring 2021'],
    'Spring 2020'   : ['Spring 2020',   'Fall 2020',    'Spring 2021',  'Fall 2021'],
    'Fall 2020'     : ['Fall 2020',     'Spring 2021',  'Fall 2021',    'Spring 2022'],
    'Spring 2021'   : ['Spring 2021',   'Fall 2021',    'Spring 2022',  'Fall 2022'],
    'Fall 2021'     : ['Fall 2021',     'Spring 2022',  'Fall 2022',    'Spring 2023']
}

In [3]:
df.reg_status.value_counts()

**Web Registered**                50631
Wait Listed                        4212
**Registered**                     2981
Drop-Course Cancelled               959
Drop/Delete                         485
Web Drop (Liability)                115
Web Withdrawal                       20
Registered for Audit                  2
Selective Withdrawal Exception        2
Withdrawal from Course                2
Name: reg_status, dtype: int64

In [59]:
StudentInputData[StudentInputData.stu_admit_term_desc == "Fall 2021"] \
[StudentInputData.stu_prog_desc == "MS Data Analytics Engineering"]

  StudentInputData[StudentInputData.stu_admit_term_desc == "Fall 2021"] \


Unnamed: 0,stu_prog_desc,stu_admit_term_desc,ns_prog_at
161,MS Data Analytics Engineering,Fall 2021,235
