# Weighted Projections
### What are Weighted Projections?
### How are the weights calculated?
$$\mathrm{Prediction}_{eat,rt,prog,crs,mod,visa} = \frac{\mathrm{CrsDemandHist}_{at,rt,prog,crs,mod,visa}}{\mathrm{NewAdmissions}_{at,prog}} \times \mathrm{ExpectedNewAdmissions}_{eat,prog}$$
### What are the benefits of using Weighted Projections?
### What are the drawbacks of using Weighted Projections?

---
# Code: Weighted Projections
## Initialization

In [1]:
# importing the required libraries
import os
import pandas as pd

# Move the working directory three levels up (presumably to the project root, so that
# the `Code.*` imports below resolve -- confirm against the repository layout).
# Guarded so the cell is idempotent: once the `Code` package directory is visible from
# the current working directory, re-running this cell no longer climbs further up.
if not os.path.isdir("Code"):
    os.chdir( os.path.join("..", "..", "..") )

# importing custom modules
from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *
from Code.src.modules.db_ops import *

# initializing the DataManager
# (one shared instance; the cells below load their datasets through DM.get_data)
DM = DataManager()

In [2]:
# Loading the processed EnrollmentFinalStatus dataset in its two stored forms:
#   'pkl' -> df_finalEnrollment (presumably a DataFrame -- confirm against DataManager)
#   'db'  -> db_finalEnrollment (handle whose runQuery() is used for the SQL below)
DATASET_NAME = 'EnrollmentFinalStatus'
df_finalEnrollment = DM.get_data(DATASET_NAME, 'pkl', 'processed')
db_finalEnrollment = DM.get_data(DATASET_NAME, 'db', 'processed')

### Configuring the model

In [3]:
def get_term_desc(term_code):
    """
    Translate a numeric term code into its description, e.g. 202110 -> 'Spring 2021'.

    The last two digits select the season (70 is 'Fall', anything else is treated
    as 'Spring'); the remaining leading digits are the year. The code 202010 is
    special-cased and described as 'Spring 2020 - COVID-19'.
    """
    if term_code == 202010:
        return 'Spring 2020 - COVID-19'

    year, suffix = divmod(term_code, 100)
    season = 'Spring' if suffix != 70 else 'Fall'
    return f"{season} {year}"

def get_term_code(term_desc):
    """
    Translate a term description into its numeric code, e.g. 'Fall 2023' -> 202370.

    The description is split on single spaces: the first token is the season
    ('Fall' maps to suffix 70, anything else to suffix 10) and the second token
    is the year. 'Spring 2020 - COVID-19' is special-cased and maps to 202010.
    """
    if term_desc == 'Spring 2020 - COVID-19':
        return 202010

    tokens = term_desc.split(' ')
    year = int(tokens[1])
    suffix = 10 if tokens[0] != 'Fall' else 70
    return 100 * year + suffix

# Quick check of the helper: term code 202110 is described as 'Spring 2021'
get_term_desc(202110)

'Spring 2021'

In [4]:
# Distinct registration terms (description + code) with reg_term_year after 2017,
# ordered by term code
semesters = db_finalEnrollment.runQuery("""--sql
    SELECT reg_term_desc, reg_term_code
    FROM EnrollmentFinalStatus
    WHERE
        reg_term_year > 2017
    GROUP BY reg_term_desc, reg_term_code
    ORDER BY reg_term_code
""")
semesters = semesters.astype({'reg_term_code': int})

# Upcoming terms that are not in the enrollment data yet, described with get_term_desc
future_term_codes = [202370, 202410, 202470, 202510]
future_semesters = pd.DataFrame({'reg_term_code': future_term_codes})
future_semesters['reg_term_desc'] = [get_term_desc(code) for code in future_term_codes]

### Taking inputs

In [5]:
# User-facing inputs of the projection model
inputs = dict(
    eat_desc="Fall 2023",                       # expected admit term (description)
    prog_desc="MS Data Analytics Engineering",  # program; "All" disables the program filter
    courses="All",                              # "All" disables the course filter
    mod="combined",                             # "combined" disables grouping by modality
    visa="combined",                            # "combined" disables the visa filter
    ExpN_eat=150,                               # expected new admissions for the `eat` term
)

In [6]:
# Deriving the model parameters from the raw inputs
model_params = dict(
    eat_desc=inputs['eat_desc'],
    eat_code=get_term_code(inputs['eat_desc']),
    prog_desc=inputs['prog_desc'],
    courses=inputs['courses'],
    mod=inputs['mod'],
    visa=inputs['visa'],
    ExpN_eat=inputs['ExpN_eat'],
    # Number of semesters of enrollment history to use: 4 for 'F1 Visa', 8 otherwise
    enr_hist=8 if inputs['visa'] != 'F1 Visa' else 4,
    # A filter is switched on only when its input differs from the catch-all value
    filters=dict(
        prog_filter=inputs['prog_desc'] != 'All',
        course_filter=inputs['courses'] != 'All',
        visa_filter=inputs['visa'] != 'combined',
    ),
    # Modality becomes a grouping column only when it is not 'combined'
    groupby=dict(
        mod_groupby=inputs['mod'] != 'combined',
    ),
)

### Model Calculations

- $at$        : Latest suitable Academic Term (the admit term used as the historical reference)
- $ExpN$      : Expected New Admissions for the $eat$ Academic Term
    - $rat$   : Ratio of $ExpN_{eat}$ to $N_{at}$ (the new admissions in the $at$ Academic Term)

In [7]:
# Getting the latest suitable term code for the model
def get_latest_at(eat_code, enr_hist=None, terms=None):
    """
    Pick the most recent admit term (`at`) usable as historical reference for `eat_code`.

    The returned term is the largest code that
      * is not earlier than the first recorded term with the same season
        (same last two digits) as `eat_code`,
      * still leaves `enr_hist` recorded terms of history from it onwards, and
      * lies at least one year (two terms) before `eat_code`.

    Parameters
    ----------
    eat_code : int
        Expected admit term code, e.g. 202370.
    enr_hist : int, optional
        Number of semesters of enrollment history required.
        Defaults to `model_params['enr_hist']`.
    terms : DataFrame or mapping, optional
        Object whose `'reg_term_code'` entry lists the recorded term codes in
        ascending order. Defaults to the notebook-level `semesters` frame.

    Returns
    -------
    int
        The best admit term code for the given criteria.

    Raises
    ------
    ValueError
        If no recorded term shares the season of `eat_code`, or if fewer terms
        are recorded than `enr_hist` requires.
    """
    # Fall back to the notebook-level state only when the caller does not pass it in
    if enr_hist is None:
        enr_hist = model_params['enr_hist']
    if terms is None:
        terms = semesters

    term_codes = [int(code) for code in terms['reg_term_code']]
    season = eat_code % 100

    # List of eligible semesters (same season as the expected admit term)
    same_season_terms = [code for code in term_codes if code % 100 == season]
    if not same_season_terms:
        raise ValueError(
            f"No recorded term shares the season of eat_code {eat_code}."
        )

    # Earliest Data Point Available
    at_code_min = same_season_terms[0]

    # Last enr_hist semesters before the last reg_term.
    # The season check uses the `eat_code` argument (not the global model_params),
    # so the function is correct for any expected admit term passed in.
    # When the latest recorded term has the same season as `eat_code`, one extra
    # step back is taken (presumably so the result lands on that season again --
    # this assumes alternating terms and an even enr_hist; confirm).
    latest_has_same_season = max(term_codes) % 100 == season
    position = len(term_codes) - enr_hist - (1 if latest_has_same_season else 0)
    if not 0 <= position < len(term_codes):
        raise ValueError(
            f"Cannot take {enr_hist} semesters of history from "
            f"{len(term_codes)} recorded terms."
        )
    at_code_max_1 = term_codes[position]

    # At least 2 terms (one year) before the Expected Admit Term
    at_code_max_2 = eat_code - 100

    # Actual stu_admit_term_code (computed once, reported and returned)
    best_at = max(at_code_min, min(at_code_max_1, at_code_max_2))

    print(f"at_code_min \t: {at_code_min}  |  (Earliest Data Point Available)",
        f"\nat_code_max_1 \t: {at_code_max_1}  |  (Latest reg_term_code for the given enr_hist)",
        f"\nat_code_max_2 \t: {at_code_max_2}  |  (Atleast 2 terms before the Expected Admit Term)",
        f"\nBest_at \t: {best_at}  |  (Best admit_term for the given criteria.)"
    )
    return best_at

# Resolve the reference admit term once, then store both its code and its description
at_code = get_latest_at(model_params['eat_code'])
model_params.update(at_code=at_code, at_desc=get_term_desc(at_code))

at_code_min 	: 201870  |  (Earliest Data Point Available) 
at_code_max_1 	: 201970  |  (Latest reg_term_code for the given enr_hist) 
at_code_max_2 	: 202270  |  (Atleast 2 terms before the Expected Admit Term) 
Best_at 	: 201970  |  (Best admit_term for the given criteria.)


In [8]:
def list_to_str(l):
    """
    Create a string in the format of "('a', 'b', 'c', 'd')" from list ['a', 'b', 'c', 'd'],
    suitable for use after a SQL `IN` operator.

    Differences from a plain `str(tuple(l))`:
      * a single-element list gives "('a')" instead of "('a',)" -- the trailing
        comma of a Python 1-tuple is not valid SQL;
      * a bare string is treated as one value instead of being split into characters;
      * single quotes inside string values are escaped by doubling them.

    Non-string values (e.g. integer term codes) are rendered with `str()`.
    """
    if isinstance(l, str):
        l = [l]

    def _sql_literal(value):
        # Render one value as a SQL literal
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return str(value)

    return "(" + ", ".join(_sql_literal(value) for value in l) + ")"

# Building the SQL fragments that are spliced into the queries below.
# A disabled filter/grouping becomes a SQL comment, so the query text stays valid.

# Program Filter
prog_cond = (
    f"AND stu_prog_desc = '{model_params['prog_desc']}'"
    if model_params['filters']['prog_filter']
    else "-- No program filters"
)

# Course Filter
course_cond = (
    f"AND crs IN {list_to_str(model_params['courses'])}"
    if model_params['filters']['course_filter']
    else "-- No course filters"
)

# Visa Filter
visa_cond = (
    f"AND stu_visa = '{model_params['visa']}'"
    if model_params['filters']['visa_filter']
    else "-- No visa filters"
)

# Mod Groupby
mod_groupby = (
    ",crs_sect_modality"
    if model_params['groupby']['mod_groupby']
    else "-- No modality grouping"
)

In [9]:
# Querying the database for the number of new enrollments in `at` Academic Term
# (distinct students whose admit term is `at`, optionally restricted to the program)
query = f"""--sql
    SELECT COUNT(DISTINCT stu_id) AS new_enrollments
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_code = {model_params['at_code']}
        {prog_cond}
    GROUP BY
        stu_admit_term_code
"""

# Calculating number of new enrollments in `at` Academic Term.
# Because of the GROUP BY, the query returns no row at all when nobody matches;
# fail with a clear message instead of an opaque lookup error on row 0.
new_enrollments = db_finalEnrollment.runQuery(query).new_enrollments
if new_enrollments.empty:
    raise ValueError(
        f"No students found with stu_admit_term_code = {model_params['at_code']} "
        "for the selected program; cannot compute the admissions ratio."
    )
model_params['N_at'] = new_enrollments.iloc[0]

# Calculating the ratio of Expected new enrollments in `eat` Academic Term to `at` Academic Term
model_params['rat_eat_at'] = model_params['ExpN_eat'] / model_params['N_at']


print(query, model_params['N_at'], model_params['rat_eat_at'], sep = '\n\n')

--sql
    SELECT COUNT(DISTINCT stu_id) AS new_enrollments
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_code = 201970
        AND stu_prog_desc = 'MS Data Analytics Engineering'
    GROUP BY
        stu_admit_term_code


173

0.8670520231213873


In [10]:
# Window of `enr_hist` consecutive recorded terms, starting at the `at` term,
# over which the historical enrollment demand is calculated
all_term_codes = semesters.reg_term_code.tolist()
window_start = all_term_codes.index(model_params['at_code'])
term_list = all_term_codes[window_start : window_start + model_params['enr_hist']]

In [11]:
# Building the CourseDemandHistory query for the `at` cohort:
# distinct students per course and registration term (plus modality when requested)
mod_select = ',crs_sect_modality' if model_params['groupby']['mod_groupby'] else '--'
term_filter = list_to_str(term_list)

query = f"""--sql
    SELECT crs, reg_term_code
        {mod_select}
        ,COUNT(DISTINCT stu_id) AS demand
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_code = {model_params['at_code']}
        AND reg_term_code IN {term_filter}
        {prog_cond}
        {course_cond}
        {visa_cond}
    GROUP BY
        crs, reg_term_code
        {mod_groupby}
"""

print(query)

--sql
    SELECT crs, reg_term_code
        --
        ,COUNT(DISTINCT stu_id) AS demand
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_code = 201970
        AND reg_term_code IN (201970, 202010, 202070, 202110, 202170, 202210, 202270, 202310)
        AND stu_prog_desc = 'MS Data Analytics Engineering'
        -- No course filters
        -- No visa filters
    GROUP BY
        crs, reg_term_code
        -- No modality grouping



In [12]:
# Run the CourseDemandHistory query and pivot its long result (one row per
# crs / reg_term_code) into a wide table: one row per course, one column per
# semester, with 0 wherever a course had no demand in a semester.
demand_wide = (
    db_finalEnrollment.runQuery(query)
    .set_index(['crs', 'reg_term_code'])
    .unstack(fill_value=0)
    .reset_index()
)

# Flatten the two-level column header left by unstack: drop the outer level,
# clear the leftover axis name, and give the (now blank) course column its name back.
demand_wide.columns = demand_wide.columns.droplevel(0).rename(None)
df_CrsDemHist = demand_wide.rename(columns={'': 'crs'})
df_CrsDemHist

Unnamed: 0,crs,201970,202010,202070,202110,202170,202210,202270,202310
0,AIT 512,0,0,0,1,0,0,0,0
1,AIT 524,14,42,17,5,5,0,2,1
2,AIT 542,0,0,0,1,0,0,0,0
3,AIT 580,103,28,8,3,0,0,1,0
4,AIT 582,15,27,14,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...
92,SYST 588,0,5,0,0,0,1,0,0
93,SYST 618,0,0,0,0,0,1,0,0
94,SYST 664,0,6,0,9,0,1,0,0
95,SYST 688,1,0,0,0,0,0,1,0


In [13]:
# Predicted demand for the `eat` cohort: every historical demand count of the
# `at` cohort is scaled by rat_eat_at; the course column is carried over unchanged.
scaled_demand = df_CrsDemHist.iloc[:, 1:].mul(model_params['rat_eat_at'])
df_Pred = df_CrsDemHist.iloc[:, :1].join(scaled_demand)

# Relabel the semester columns: shift each term code by the distance between `eat` and `at`
term_shift = model_params['eat_code'] - model_params['at_code']
df_Pred.columns = ['crs'] + [int(term) + term_shift for term in df_Pred.columns[1:]]
df_Pred

Unnamed: 0,crs,202370,202410,202470,202510,202570,202610,202670,202710
0,AIT 512,0.000000,0.000000,0.000000,0.867052,0.00000,0.000000,0.000000,0.000000
1,AIT 524,12.138728,36.416185,14.739884,4.335260,4.33526,0.000000,1.734104,0.867052
2,AIT 542,0.000000,0.000000,0.000000,0.867052,0.00000,0.000000,0.000000,0.000000
3,AIT 580,89.306358,24.277457,6.936416,2.601156,0.00000,0.000000,0.867052,0.000000
4,AIT 582,13.005780,23.410405,12.138728,1.734104,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
92,SYST 588,0.000000,4.335260,0.000000,0.000000,0.00000,0.867052,0.000000,0.000000
93,SYST 618,0.000000,0.000000,0.000000,0.000000,0.00000,0.867052,0.000000,0.000000
94,SYST 664,0.000000,5.202312,0.000000,7.803468,0.00000,0.867052,0.000000,0.000000
95,SYST 688,0.867052,0.000000,0.000000,0.000000,0.00000,0.000000,0.867052,0.000000


In [14]:
# Difference between prediction and history, semester by semester.
# Both tables get positional labels (Sem1, Sem2, ...) so columns line up by
# position rather than by their differing term codes.
df_Diff = df_Pred.set_axis(
    ['crs'] + [f'Sem{n}' for n in range(1, df_Pred.shape[1])], axis=1
)
df_temp = df_CrsDemHist.set_axis(
    ['crs'] + [f'Sem{n}' for n in range(1, df_CrsDemHist.shape[1])], axis=1
)

# Subtracting df_temp from df_Diff (course column kept as-is)
df_Diff = df_Diff.iloc[:, :1].join(df_Diff.iloc[:, 1:] - df_temp.iloc[:, 1:])
df_Diff

Unnamed: 0,crs,Sem1,Sem2,Sem3,Sem4,Sem5,Sem6,Sem7,Sem8
0,AIT 512,0.000000,0.000000,0.000000,-0.132948,0.00000,0.000000,0.000000,0.000000
1,AIT 524,-1.861272,-5.583815,-2.260116,-0.664740,-0.66474,0.000000,-0.265896,-0.132948
2,AIT 542,0.000000,0.000000,0.000000,-0.132948,0.00000,0.000000,0.000000,0.000000
3,AIT 580,-13.693642,-3.722543,-1.063584,-0.398844,0.00000,0.000000,-0.132948,0.000000
4,AIT 582,-1.994220,-3.589595,-1.861272,-0.265896,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
92,SYST 588,0.000000,-0.664740,0.000000,0.000000,0.00000,-0.132948,0.000000,0.000000
93,SYST 618,0.000000,0.000000,0.000000,0.000000,0.00000,-0.132948,0.000000,0.000000
94,SYST 664,0.000000,-0.797688,0.000000,-1.196532,0.00000,-0.132948,0.000000,0.000000
95,SYST 688,-0.132948,0.000000,0.000000,0.000000,0.00000,0.000000,-0.132948,0.000000
