# Weighted Projection
Calculate the weights of proportions of student registering for each course in each semester.

In [1]:
# importing the required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt

os.chdir( os.path.join("..", "..", "..") )

# importing custom modules
from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *
from Code.src.modules.db_ops import *

# initializing the DataManager
DM = DataManager()

In [2]:
# importing the data for analysis
df = DM.get_data('EnrollmentFinalStatus', 'pkl', 'processed')
db = DM.get_data('EnrollmentFinalStatus', 'db', 'processed')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59409 entries, 0 to 323686
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   rec_id               59409 non-null  int64         
 1   rec_ext_date         59409 non-null  datetime64[ns]
 2   file_name            59409 non-null  object        
 3   file_index           59409 non-null  int64         
 4   reg_term_code        59409 non-null  object        
 5   reg_term_year        59409 non-null  int64         
 6   reg_term_name        59409 non-null  category      
 7   reg_term_desc        59409 non-null  object        
 8   stu_id               59408 non-null  object        
 9   stu_deg_level        59409 non-null  category      
 10  stu_college          59409 non-null  category      
 11  stu_res              59409 non-null  category      
 12  stu_visa             59409 non-null  category      
 13  stu_bam              59409 non

# Simple Weighted Projection
Taking all students, and calculating weights of proportions of student registering for each course in each semester.

How to improve the model:
- We can filter this data for international students and domestic students separately.
- We can calculate weights for Spring and Fall semesters separately.

## Data

- Admission Term: $\mathscr{at}$
- Registration Term: $\mathscr{rt}$
- Course: $\mathscr{crs}$
- Program: $\mathscr{prog}$
- Modality: $\mathscr{crs_sect_modality}$
- Student VISA Status: $\mathscr{stu_visa}$
- Number of Students, in a prog, admitting in a semester: $\mathscr{ns_{prog, at}}$
- Weights: $\mathscr{w_{at, rt, crs, prog}}$

### Points to note:
- `prog`, `modality`, and `VISA` can take any of the following values:
    - "Combined": Give the combined demand of all programs/modality/VISA status.
    - "All": Give the demand of all programs/modality/VISA status individually.
    - `@Specific`: Give the demand of a specific program/modality/VISA status.
- `course` can take any of the following values:
    - "All": Give the demand of all courses.
    - `@Specific`: Give the demand of a specific course.

In [5]:
# # prog should be "Combined", "All" or specific program
# prog        = "MS Data Analytics Engineering"
# # modality should be "Combined", "All" or specific modality
# modality    = "combined"
# # VISA should be "Combined", "All" or specific VISA status
# VISA        = "all"
# # course should be "All" or specific course
# course      = "All"

# prog_cond           = "--" if prog.lower() == "all"     else ""
# prog_filt_cond      = "--" if prog.lower() == "combined" else ""
# course_cond         = "--" if course.lower() == "all"   else ""
# modality_cond       = "--" if modality.lower() == "combined" else ""
# modality_fil_cond   = "--" if modality.lower() == "all"  else ""
# VISA_cond           = "--" if VISA.lower() == "all"    else ""
# VISA_filt_cond      = "--" if VISA.lower() == "combined" else ""

In [6]:
new_admissions = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc, stu_prog_desc, stu_visa,
        COUNT(DISTINCT stu_id) AS num_stu
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_year > 2017
        AND stu_admit_term_name != 'Summer'
    GROUP BY
        stu_admit_term_desc, stu_prog_desc, stu_visa
    ORDER BY
        stu_admit_term_code
    ;
""")
new_admissions

Unnamed: 0,stu_admit_term_desc,stu_prog_desc,stu_visa,num_stu
0,Spring 2018,MENG GeoConStruct Engineering,F1 Visa,2
1,Spring 2018,MS Applied Info Technology,F1 Visa,7
2,Spring 2018,MS Applied Info Technology,Not Relevent,37
3,Spring 2018,MS Civil & Infrastructure Engr,F1 Visa,8
4,Spring 2018,MS Civil & Infrastructure Engr,Not Relevent,2
...,...,...,...,...
312,Spring 2023,MS Software Engineering,Not Relevent,5
313,Spring 2023,MS Systems Engineering,F1 Visa,1
314,Spring 2023,MS Systems Engineering,Not Relevent,3
315,Spring 2023,MS Telecommunications,F1 Visa,1


In [7]:
# Query new_admissions dataframe by stu_admit_term_desc, stu_prog_desc, stu_visa
# and return the number of students
def get_num_stu(df, term, prog, visa):
    return df.loc[(df.stu_admit_term_desc == term) & (df.stu_prog_desc == prog) & (df.stu_visa == visa), 'num_stu'].values[0]

In [8]:
crs_demand_hist = db.runQuery(f""" --sql
    SELECT
        stu_admit_term_desc, reg_term_desc, stu_prog_desc, crs, crs_sect_modality, stu_visa,
        COUNT (DISTINCT stu_id) AS stu_count
    FROM EnrollmentFinalStatus
    WHERE
        stu_admit_term_name != "Summer" AND
        reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**") AND
        stu_admit_term_year - reg_term_year <= 5 AND
        stu_admit_term_year > 2017
    GROUP BY
        stu_admit_term_desc, reg_term_desc, crs, stu_prog_desc, crs_sect_modality, stu_visa
    ORDER BY
        stu_admit_term_code, reg_term_code, stu_prog_desc, crs, stu_visa
    ;
""")
crs_demand_hist

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,stu_count
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,1
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,1
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,1
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,2
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,1
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,2
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,1
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,2


$$\mathscr{Weights}_{at,rt,prog,crs,mod,visa} = \frac{\mathscr{CrsDemandHist}_{at,rt,prog,crs,mod, visa}}{{\mathscr{NewAdmissions}_{at,prog, visa}}}$$

In [9]:
crs_demand_hist.iloc[0]

stu_admit_term_desc                      Spring 2018
reg_term_desc                            Spring 2018
stu_prog_desc          MENG GeoConStruct Engineering
crs                                         CEIE 512
crs_sect_modality                                F2F
stu_visa                                     F1 Visa
stu_count                                          1
Name: 0, dtype: object

In [10]:
weights = pd.DataFrame(
    columns=['stu_admit_term_desc', 'reg_term_desc', 'stu_prog_desc', 'crs', 'crs_sect_modality', 'stu_visa', 'weight']
)

# iterating through the rows of the dataframe
for n_row in range(len(crs_demand_hist)):
    # fetching the record of the current row
    rec = crs_demand_hist.iloc[n_row]
    new_rec = rec[:-1]
    new_rec['weight'] = float(rec.stu_count) / float(new_admissions.loc[
            (new_admissions.stu_admit_term_desc == rec.stu_admit_term_desc) &
            (new_admissions.stu_prog_desc == rec.stu_prog_desc) &
            (new_admissions.stu_visa == rec.stu_visa),
            'num_stu'
        ].values[0])
    # weights.iloc[n_row] = new_rec
    weights = weights.append(new_rec, ignore_index=True)
    # pd.concat([weights, pd.DataFrame(new_rec)], axis=1)

weights

  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights = weights.append(new_rec, ignore_index=True)
  weights 

Unnamed: 0,stu_admit_term_desc,reg_term_desc,stu_prog_desc,crs,crs_sect_modality,stu_visa,weight
0,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 512,F2F,F1 Visa,0.5
1,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 527,F2F,F1 Visa,0.5
2,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 575,F2F,F1 Visa,0.5
3,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 613,F2F,F1 Visa,1.0
4,Spring 2018,Spring 2018,MENG GeoConStruct Engineering,CEIE 639,F2F,F1 Visa,0.5
...,...,...,...,...,...,...,...
14118,Spring 2023,Spring 2023,MS Telecommunications,TCOM 500,F2F,Not Relevent,1.0
14119,Spring 2023,Spring 2023,MS Telecommunications,TCOM 514,F2F,Not Relevent,0.5
14120,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,F1 Visa,1.0
14121,Spring 2023,Spring 2023,MS Telecommunications,TCOM 535,F2F,Not Relevent,1.0


In [13]:
# Saving the weights to a new sqlite database
db_weights = ConnectDB(
    os.path.join("Data", "02_processed", "weights.db")
)
weights.to_sql('weights', db_weights.connection, if_exists='replace', index=False)
db_weights.commitDB()

# Saving the weights to a csv file
weights.to_csv(os.path.join("Data", "02_processed", "weights.csv"), index=False)

# Saving the weights to a pickle file
weights.to_pickle(os.path.join("Data", "02_processed", "weights.pkl"))