# Weighted Projection
Calculate the weights of proportions of student registering for each course in each semester.

In [2]:
# importing the required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt

os.chdir( os.path.join("..", "..", "..") )

# importing custom modules
from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *
from Code.src.modules.db_ops import *

# initializing the DataManager
DM = DataManager()

In [3]:
# importing the data for analysis
df = DM.get_data('EnrollmentFinalStatus', 'pkl', 'processed')
db = DM.get_data('EnrollmentFinalStatus', 'db', 'processed')

# Simple Weighted Projection
Taking all students, and calculating weights of proportions of student registering for each course in each semester.

How to improve the model:
- We can filter this data for international students and domestic students separately.
- We can calculate weights for Spring and Fall semesters separately.

## Data

- Admission Term: $\mathscr{at}$
- Registration Term: $\mathscr{rt}$
- Course: $\mathscr{crs}$
- Program: $\mathscr{prog}$
- Modality: $\mathscr{crs_sect_modality}$
- Student VISA Status: $\mathscr{stu_visa}$
- Number of Students, in a prog, admitting in a semester: $\mathscr{ns_{prog, at}}$
- Weights: $\mathscr{w_{at, rt, crs, prog}}$

### Points to note:
- `prog`, `modality`, and `VISA` can take any of the following values:
    - "Combined": Give the combined demand of all programs/modality/VISA status.
    - "All": Give the demand of all programs/modality/VISA status individually.
    - `@Specific`: Give the demand of a specific program/modality/VISA status.
- `course` can take any of the following values:
    - "All": Give the demand of all courses.
    - `@Specific`: Give the demand of a specific course.

# Algorithm

- Take User Inputs:
    - Following are the user inputs:
        - `eat` : Exactly 1 Expected Admission Term
            - Filtered, Aggregated, and defined in input as well
            - Set lower limit for options to MIN(reg_term_code) + 2 semesters

        - `prog` : Either exactly 1 Program or All the Programs
            - Filtered, Aggregated, and defined in input as well
        - `crs` : Either 1 course, OR  a List of Courses, OR All the Courses
            - Just Filtered
        - `mod` : Either of ["combined", "individual"]
            - Aggregated (if "individual")
        - `visa` : Either of ["combined", "F1 Visa", "Not Relevant"]
            - Filtered (if not "combined")
        - `Expected Enrollment` : Exactly 1 Number defined by `at`, `prog`
    - Example:
        - `eat`: "Fall 2021"
        - `prog`: "MS Data Analytics Engineering"
        - `mod`: 'combined'
        - `visa`: 'combined'
        - `crs`: 'All'
        - `Expected Enrollment for Fall 2021 Cohort in MS Data Analytics Engineering`: 100
- Calculate `at` as follows:
    - Query unique reg_term_code from `FinalEnrollmentStatus` where reg_term_code <= `eat`.
    - Sort the reg_term_codes in descending order.
    - Limit the result to 4 OR 7 based on `visa`.
- Calculate predicted enrollment for each course in subsequent semesters:
    - Define a function and calculate with `at` as argument:
        - Calculate EnrollmentChangeRatio as:
            - $\mathscr{rat_{at, prog}} = \frac{\mathscr{ExpN_{eat, prog}}}{\mathscr{N_{at, prog}}}$
        - Filter the `CrsDemandHistData` table for the following:
            - `stu_admit_term_desc` = (`eat`- 4) OR (`eat`- 7) based on `visa`
            - `stu_prog_desc` = `prog`
            - `crs_sect_modality` = `mod`
            - `stu_visa` = `visa`
            - `crs` IN [`crs`]
        - Group BY on `CrsDemandHistData` table for the following:
            - `reg_term_desc`
            - `crs_sect_modality` [if `mod` != 'combined']
            - `stu_visa` [if `visa` != 'combined']
            - `crs`
        - Calculate `N` by COUNT (DISTINCT `stu_id`) for each group.
    - Create a new column `ExpN` in the table, which is the product of `rat` and `N`
    - Use the function and calculate with `eat` as argument, and store it in a new column `RealN`.
    - Create a new column `AbsErr` which is the absolute difference between `RealN` and `ExpN`.

Prediction Calculation:

$$\mathscr{Prediction}_{eat,rt,prog,crs,mod,visa} = {\frac{\mathscr{CrsDemandHist}_{at,rt,prog,crs,mod, visa}}{\mathscr{NewAdmissions}_{at, prog}} * {\mathscr{ExpectedNewAdmissions}_{eat, prog}}}$$

In [4]:
# # prog should be "Combined", or a specific program
# prog        = "MS Data Analytics Engineering"
# # modality should be "Combined", "individual"
# modality    = "combined"
# # VISA should be "Combined", "individual"
# VISA        = "all"
# # course should be "All" or specific course
# course      = "All"

# prog_cond           = "--" if prog.lower() == "all"     else ""
# prog_filt_cond      = "--" if prog.lower() == "combined" else ""
# course_cond         = "--" if course.lower() == "all"   else ""
# modality_cond       = "--" if modality.lower() == "combined" else ""
# modality_fil_cond   = "--" if modality.lower() == "all"  else ""
# VISA_cond           = "--" if VISA.lower() == "all"    else ""
# VISA_filt_cond      = "--" if VISA.lower() == "combined" else ""

In [5]:
new_admissions = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc,
        COUNT(DISTINCT stu_id) AS num_stu
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_year > 2017
        AND stu_admit_term_name != 'Summer'
    GROUP BY
        stu_admit_term_desc
    ORDER BY
        stu_admit_term_code
    ;
""")
new_admissions

Unnamed: 0,stu_admit_term_desc,num_stu
0,Spring 2018,231
1,Fall 2018,427
2,Spring 2019,271
3,Fall 2019,486
4,Spring 2020 - COVID-19,286
5,Fall 2020,416
6,Spring 2021,293
7,Fall 2021,716
8,Spring 2022,407
9,Fall 2022,884


In [4]:
# Query new_admissions dataframe by stu_admit_term_desc, stu_prog_desc, stu_visa
# and return the number of students
def get_num_stu(df, term, prog, visa):
    return df.loc[(df.stu_admit_term_desc == term) & (df.stu_prog_desc == prog) & (df.stu_visa == visa), 'num_stu'].values[0]

In [5]:
crs_demand_hist = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc, reg_term_desc, stu_prog_desc, crs, crs_sect_modality, stu_visa,
        COUNT (DISTINCT stu_id) AS stu_count
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_name != "Summer" AND
        reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**") AND
        stu_admit_term_year - reg_term_year <= 5 AND
        stu_admit_term_year > 2017
    GROUP BY
        stu_admit_term_desc, reg_term_desc, crs, stu_prog_desc, crs_sect_modality, stu_visa
    ORDER BY
        stu_admit_term_code, reg_term_code, stu_prog_desc, crs, stu_visa
    ;
""")
crs_demand_hist

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,stu_count
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,1
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,1
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,1
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,2
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,1
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,2
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,1
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,2


In [6]:
crs_demand_hist.iloc[0]

stu_admit_term_desc                      Spring 2018
reg_term_desc                            Spring 2018
stu_prog_desc          MENG GeoConStruct Engineering
crs                                         CEIE 512
crs_sect_modality                                F2F
stu_visa                                     F1 Visa
stu_count                                          1
Name: 0, dtype: object

In [7]:
weights = pd.DataFrame(
    columns=['stu_admit_term_desc', 'reg_term_desc', 'stu_prog_desc', 'crs', 'crs_sect_modality', 'stu_visa', 'weight']
)

# iterating through the rows of the dataframe
for n_row in range(len(crs_demand_hist)):
    # fetching the record of the current row
    rec = crs_demand_hist.iloc[n_row]
    new_rec = rec[:-1]
    new_rec['weight'] = float(rec.stu_count) / float(new_admissions.loc[
            (new_admissions.stu_admit_term_desc == rec.stu_admit_term_desc) &
            (new_admissions.stu_prog_desc == rec.stu_prog_desc) &
            (new_admissions.stu_visa == rec.stu_visa),
            'num_stu'
        ].values[0])
    # weights.iloc[n_row] = new_rec
    weights = weights.append(new_rec, ignore_index=True)
    # pd.concat([weights, pd.DataFrame(new_rec)], axis=1)

weights

  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights 

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,weight
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,0.5
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,0.5
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,0.5
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,1.0
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,0.5
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,1.0
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,0.5
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1.0
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,1.0


In [8]:
# Saving the weights to a new sqlite database
db_weights = ConnectDB(
    os.path.join("Data", "02_processed", "weights.db")
)
weights.to_sql('weights', db_weights.connection, if_exists='replace', index=False)
db_weights.commitDB()

# Saving the weights to a csv file
weights.to_csv(os.path.join("Data", "02_processed", "weights.csv"), index=False)

# Saving the weights to a pickle file
weights.to_pickle(os.path.join("Data", "02_processed", "weights.pkl"))