# Weighted Projection
Calculate the weights of proportions of student registering for each course in each semester.

In [1]:
# importing the required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt

os.chdir( os.path.join("..", "..", "..") )

# importing custom modules
from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *
from Code.src.modules.db_ops import *

# initializing the DataManager
DM = DataManager()

In [2]:
# importing the data for analysis
df = DM.get_data('EnrollmentFinalStatus', 'pkl', 'processed')
db = DM.get_data('EnrollmentFinalStatus', 'db', 'processed')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59409 entries, 0 to 323686
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   rec_id               59409 non-null  int64         
 1   rec_ext_date         59409 non-null  datetime64[ns]
 2   file_name            59409 non-null  object        
 3   file_index           59409 non-null  int64         
 4   reg_term_code        59409 non-null  object        
 5   reg_term_year        59409 non-null  int64         
 6   reg_term_name        59409 non-null  category      
 7   reg_term_desc        59409 non-null  object        
 8   stu_id               59408 non-null  object        
 9   stu_deg_level        59409 non-null  category      
 10  stu_college          59409 non-null  category      
 11  stu_res              59409 non-null  category      
 12  stu_visa             59409 non-null  category      
 13  stu_bam              59409 non

In [3]:
df.stu_prog_desc.value_counts()

MS Data Analytics Engineering     19553
MS Computer Science               15617
MS Applied Info Technology         4387
MS Software Engineering            2761
MS Civil & Infrastructure Engr     2643
MS Electrical Engineering          2375
MS Computer Engineering            2075
MS Information Systems             1745
MS Telecommunications              1314
MS Computer Forensics              1309
MS Infrmatn Security & Assrnce     1306
MS Systems Engineering             1260
MS Statistical Science              908
MS Operations Research              763
MS Cyber Security Engineering       451
MS Digital Forensics                401
MS Bioengineering                   237
MS Biostatistics                    214
MENG GeoConStruct Engineering        90
Name: stu_prog_desc, dtype: int64

In [4]:
df.columns

Index(['rec_id', 'rec_ext_date', 'file_name', 'file_index', 'reg_term_code',
       'reg_term_year', 'reg_term_name', 'reg_term_desc', 'stu_id',
       'stu_deg_level', 'stu_college', 'stu_res', 'stu_visa', 'stu_bam',
       'stu_new_ret', 'stu_dept', 'stu_dept_desc', 'stu_prog_code',
       'stu_prog_level', 'stu_prog_desc', 'stu_admit_term_code',
       'stu_admit_term_year', 'stu_admit_term_name', 'stu_admit_term_desc',
       'crs', 'crs_type', 'crs_credits', 'crs_hours', 'crs_sect',
       'crs_sect_clg', 'crs_sect_modality', 'crs_sect_wiley_ind', 'reg_status',
       'reg_status_date', 'stu_act_reg_ind'],
      dtype='object')

# Simple Weighted Projection
Taking all students, and calculating weights of proportions of student registering for each course in each semester.

How to improve the model:
- We can filter this data for international students and domestic students separately.
- We can calculate weights for Spring and Fall semesters separately.

## Data

- Admission Term: $\mathscr{at}$
- Registration Term: $\mathscr{rt}$
- Course: $\mathscr{crs}$
- Program: $\mathscr{prog}$
- Modality: $\mathscr{crs_sect_modality}$
- Student VISA Status: $\mathscr{stu_visa}$
- Number of Students, in a prog, admitting in a semester: $\mathscr{ns_{prog, at}}$
- Weights: $\mathscr{w_{at, rt, crs, prog}}$

### Points to note:
- `prog`, `modality`, and `VISA` can take any of the following values:
    - "Combined": Give the combined demand of all programs/modality/VISA status.
    - "All": Give the demand of all programs/modality/VISA status individually.
    - `@Specific`: Give the demand of a specific program/modality/VISA status.
- `course` can take any of the following values:
    - "All": Give the demand of all courses.
    - `@Specific`: Give the demand of a specific course.

In [5]:
# # prog should be "Combined", "All" or specific program
# prog        = "MS Data Analytics Engineering"
# # modality should be "Combined", "All" or specific modality
# modality    = "combined"
# # VISA should be "Combined", "All" or specific VISA status
# VISA        = "all"
# # course should be "All" or specific course
# course      = "All"

# prog_cond           = "--" if prog.lower() == "all"     else ""
# prog_filt_cond      = "--" if prog.lower() == "combined" else ""
# course_cond         = "--" if course.lower() == "all"   else ""
# modality_cond       = "--" if modality.lower() == "combined" else ""
# modality_fil_cond   = "--" if modality.lower() == "all"  else ""
# VISA_cond           = "--" if VISA.lower() == "all"    else ""
# VISA_filt_cond      = "--" if VISA.lower() == "combined" else ""

In [6]:
new_admissions = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc, stu_prog_desc, stu_visa,
        COUNT(DISTINCT stu_id) AS num_stu
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_year > 2017
        AND stu_admit_term_name != 'Summer'
    GROUP BY
        stu_admit_term_desc, stu_prog_desc, stu_visa
    ORDER BY
        stu_admit_term_code
    ;
""")
new_admissions

Unnamed: 0,stu_admit_term_desc,stu_prog_desc,stu_visa,num_stu
0,Spring 2018,MENG GeoConStruct Engineering,F1 Visa,2
1,Spring 2018,MS Applied Info Technology,F1 Visa,7
2,Spring 2018,MS Applied Info Technology,Not Relevent,37
3,Spring 2018,MS Civil & Infrastructure Engr,F1 Visa,8
4,Spring 2018,MS Civil & Infrastructure Engr,Not Relevent,2
...,...,...,...,...
312,Spring 2023,MS Software Engineering,Not Relevent,5
313,Spring 2023,MS Systems Engineering,F1 Visa,1
314,Spring 2023,MS Systems Engineering,Not Relevent,3
315,Spring 2023,MS Telecommunications,F1 Visa,1


In [7]:
# Query new_admissions dataframe by stu_admit_term_desc, stu_prog_desc, stu_visa
# and return the number of students
def get_num_stu(df, term, prog, visa):
    return df.loc[(df.stu_admit_term_desc == term) & (df.stu_prog_desc == prog) & (df.stu_visa == visa), 'num_stu'].values[0]

In [8]:
crs_demand_hist = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc, reg_term_desc, stu_prog_desc, crs, crs_sect_modality, stu_visa,
        COUNT (DISTINCT stu_id) AS stu_count
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_name != "Summer" AND
        reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**") AND
        stu_admit_term_year - reg_term_year <= 5 AND
        stu_admit_term_year > 2017
    GROUP BY
        stu_admit_term_desc, reg_term_desc, crs, stu_prog_desc, crs_sect_modality, stu_visa
    ORDER BY
        stu_admit_term_code, reg_term_code, stu_prog_desc, crs, stu_visa
    ;
""")
crs_demand_hist

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,stu_count
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,1
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,1
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,1
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,2
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,1
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,2
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,1
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,2


$$\mathscr{Weights}_{at,rt,prog,crs,mod,visa} = \frac{\mathscr{CrsDemandHist}_{at,rt,prog,crs,mod, visa}}{{\mathscr{NewAdmissions}_{at,prog, visa}}}$$

In [9]:
crs_demand_hist.iloc[0]

stu_admit_term_desc                      Spring 2018
reg_term_desc                            Spring 2018
stu_prog_desc          MENG GeoConStruct Engineering
crs                                         CEIE 512
crs_sect_modality                                F2F
stu_visa                                     F1 Visa
stu_count                                          1
Name: 0, dtype: object

In [10]:
weights = pd.DataFrame(
    columns=['stu_admit_term_desc', 'reg_term_desc', 'stu_prog_desc', 'crs', 'crs_sect_modality', 'stu_visa', 'weight']
)

# iterating through the rows of the dataframe
for n_row in range(len(crs_demand_hist)):
    # fetching the record of the current row
    rec = crs_demand_hist.iloc[n_row]
    new_rec = rec[:-1]
    new_rec['weight'] = float(rec.stu_count) / float(new_admissions.loc[
            (new_admissions.stu_admit_term_desc == rec.stu_admit_term_desc) &
            (new_admissions.stu_prog_desc == rec.stu_prog_desc) &
            (new_admissions.stu_visa == rec.stu_visa),
            'num_stu'
        ].values[0])
    # weights.iloc[n_row] = new_rec
    weights = weights.append(new_rec, ignore_index=True)
    # pd.concat([weights, pd.DataFrame(new_rec)], axis=1)

weights

  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights 

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,weight
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,0.5
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,0.5
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,0.5
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,1.0
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,0.5
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,1.0
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,0.5
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1.0
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,1.0


In [13]:
# Saving the weights to a new sqlite database
db_weights = ConnectDB(
    os.path.join("Data", "02_processed", "weights.db")
)
weights.to_sql('weights', db_weights.connection, if_exists='replace', index=False)
db_weights.commitDB()

# Saving the weights to a csv file
weights.to_csv(os.path.join("Data", "02_processed", "weights.csv"), index=False)

# Saving the weights to a pickle file
weights.to_pickle(os.path.join("Data", "02_processed", "weights.pkl"))

# Use Cases:
- 

In [None]:
def get_predictions(ExpEnroll, prog_flag = 'comb', visa_flag = 'comb',
                    prog_comb = False, visa_comb = False, mod_comb = False,
                    prog_filt = None, visa_file = None, mod_filt = None, crs_filt = None):
    

In [28]:
# Get nth row of crs_demand_hist
crs_demand_hist.iloc[14122]

stu_admit_term_desc              Spring 2023
reg_term_desc                    Spring 2023
stu_prog_desc          MS Telecommunications
crs                                 TCOM 616
crs_sect_modality                     Online
stu_visa                             F1 Visa
stu_count                                  1
Name: 14122, dtype: object

In [15]:
_ = [
    crs_demand_hist.stu_admit_term_desc.nunique(),
    crs_demand_hist.reg_term_desc.nunique(),
    crs_demand_hist.stu_prog_desc.nunique(),
    crs_demand_hist.crs.nunique(),
    crs_demand_hist.crs_sect_modality.nunique(),
    crs_demand_hist.stu_visa.nunique()
]
c = 1
for i in _:
    c *= i
print(f"Total number of combinations: {c}")

Total number of combinations: 8400546


$$\mathscr{Prediction}_{at,rt,prog,crs,mod} = \mathscr{CrsDemandHist}_{at,rt,prog,crs,mod} * \frac{\mathscr{ExpectedEnrollment}}{\mathscr{NewAdmissions}_{at,prog}}$$

In [None]:
prog        = "MS Data Analytics Engineering" # "All" or specify program
course      = "All" # "All" or specify course
modality    = "combined" # "combined" or "separate"
VISA        = "all" # "all" or specify VISA status

prog_cond       = "--" if prog.lower() == "all"     else ""
course_cond     = "--" if course.lower() == "all"   else ""
modality_cond   = "--" if modality.lower() == "combined" else ""
VISA_cond       = "--" if VISA.lower() == "all"    else ""

In [148]:
EXPECTED_ENROLLMENT = 300

cols = ["stu_admit_term_desc", "reg_term_desc", "stu_prog_desc", "crs", "crs_sect_modality", "stu_count"]

predictions = pd.DataFrame(
    columns = ["stu_admit_term_desc", "reg_term_desc", "stu_prog_desc", "crs", "crs_sect_modality", "prediction"]
)

predictions

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,prediction


In [50]:
# ns_prog,at: Number of Students from a particular program, taking admission on a particular semester, and registering/waitlisting for courses
StudentInputData = db.runQuery("""--sql
    SELECT stu_prog_desc, stu_admit_term_desc, COUNT(DISTINCT stu_id) AS ns_prog_at
    FROM EnrollmentFinalStatus
    WHERE reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**")
    GROUP BY stu_prog_desc, stu_admit_term_desc
    ORDER BY stu_prog_desc, stu_admit_term_code;
""")
StudentInputData.sample(10)

Unnamed: 0,stu_prog_desc,stu_admit_term_desc,ns_prog_at
155,MS Data Analytics Engineering,Fall 2019,170
304,MS Systems Engineering,Spring 2019,5
101,MS Computer Forensics,Spring 2016,5
50,MS Biostatistics,Fall 2021,2
284,MS Statistical Science,Fall 2018,17
265,MS Software Engineering,Fall 2018,25
326,MS Telecommunications,Fall 2019,11
130,MS Computer Science,Fall 2021,247
11,MS Applied Info Technology,Spring 2014,1
104,MS Computer Forensics,Spring 2017,8


In [61]:
# w_at,rt,crs,prog Caclulating the weights for the projection
# admit semester, registration semester, course, student count
weights = db.runQuery("""--sql
    SELECT
        stu_prog_desc, stu_admit_term_desc, crs, reg_term_desc, COUNT(DISTINCT stu_id) AS w_prog_at_crs_rt
    FROM EnrollmentFinalStatus
    WHERE reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**")
    GROUP BY stu_prog_desc, stu_admit_term_desc, crs, reg_term_desc
    ORDER BY stu_prog_desc, stu_admit_term_code, crs, reg_term_code;
""")

weights.head(10)

Unnamed: 0,stu_prog_desc,stu_admit_term_desc,crs,reg_term_desc,w_prog_at_crs_rt
0,MENG GeoConStruct Engineering,Fall 2013,CEIE 526,Spring 2019,1
1,MENG GeoConStruct Engineering,Fall 2013,CEIE 573,Spring 2019,1
2,MENG GeoConStruct Engineering,Fall 2013,CEIE 623,Fall 2018,1
3,MENG GeoConStruct Engineering,Fall 2013,CEIE 639,Fall 2018,1
4,MENG GeoConStruct Engineering,Fall 2013,CEIE 690,Spring 2018,1
5,MENG GeoConStruct Engineering,Fall 2013,CEIE 795,Spring 2018,1
6,MENG GeoConStruct Engineering,Fall 2013,CEIE 795,Fall 2018,1
7,MENG GeoConStruct Engineering,Fall 2014,CEIE 513,Fall 2019,1
8,MENG GeoConStruct Engineering,Fall 2014,CEIE 524,Fall 2017,1
9,MENG GeoConStruct Engineering,Fall 2014,CEIE 524,Spring 2020 - COVID-19,1


To make projections, we multiply the weights (the proportion of students who registered for each course in each semester) by the number of student enrolling in cohorts.

In [None]:
sem_dict ={
    'Spring 2018'   : ['Spring 2018',   'Fall 2018',    'Spring 2019',  'Fall 2019'],
    'Fall 2018'     : ['Fall 2018',     'Spring 2019',  'Fall 2019',    'Spring 2020'],
    'Spring 2019'   : ['Spring 2019',   'Fall 2019',    'Spring 2020',  'Fall 2020'],
    'Fall 2019'     : ['Fall 2019',     'Spring 2020',  'Fall 2020',    'Spring 2021'],
    'Spring 2020'   : ['Spring 2020',   'Fall 2020',    'Spring 2021',  'Fall 2021'],
    'Fall 2020'     : ['Fall 2020',     'Spring 2021',  'Fall 2021',    'Spring 2022'],
    'Spring 2021'   : ['Spring 2021',   'Fall 2021',    'Spring 2022',  'Fall 2022'],
    'Fall 2021'     : ['Fall 2021',     'Spring 2022',  'Fall 2022',    'Spring 2023']
}

In [150]:
df.stu_visa.value_counts()

F1 Visa         33199
Not Relevent    26210
Name: stu_visa, dtype: int64

In [59]:
StudentInputData[StudentInputData.stu_admit_term_desc == "Fall 2021"] \
[StudentInputData.stu_prog_desc == "MS Data Analytics Engineering"]

  StudentInputData[StudentInputData.stu_admit_term_desc == "Fall 2021"] \


Unnamed: 0,stu_prog_desc,stu_admit_term_desc,ns_prog_at
161,MS Data Analytics Engineering,Fall 2021,235
