# Data Shitery

In [1]:
# importing the required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt

os.chdir( os.path.join("..", "..") )

from Code.src.modules.db_ops import *
from Code.src.modules.dataManager import DataManager
from Code.src.modules.eda import *

DM = DataManager()

In [2]:
# Importing the data
df_finalEnrollment = DM.get_data('EnrollmentFinalStatus', 'pkl', 'processed')
db_finalEnrollment = DM.get_data('EnrollmentFinalStatus', 'db', 'processed')

In [3]:
db_enrollment = DM.get_data('Enrollment', 'db', 'processed')
df_enrollment = DM.get_data('Enrollment', 'pkl', 'processed')

# Testing

## Take 2: Getting Final Status from latest snapshot only

In [4]:
# Final Snapshots

df = df_enrollment \
    .loc[
        ((df_enrollment['rec_ext_date'].dt.month.isin([2,9])) &
        (df_enrollment['rec_ext_date'].dt.day == 1)) |
        ((df_enrollment['rec_ext_date'].dt.month.isin([1,8])) &
        (df_enrollment['rec_ext_date'].dt.day == 15))
    ]

df.head().T
df[['reg_term_code', 'stu_id', 'crs']]
# df.to_csv( os.path.join('Data', '02_processed', 'final_snapshot.csv') )
# df.to_pickle( os.path.join('Data', '02_processed', 'final_snapshot.pkl') )

# db = ConnectDB( os.path.join("Data", "02_processed", "final_snapshot.db") )
# df.to_sql("FinalSnapshot", db.connection, if_exists="replace", index=False)

Unnamed: 0,reg_term_code,stu_id,crs
13924,201770,CEC2640,CS 504
13925,201770,CEC2640,DAEN 690
13926,201770,CEC3286,INFS 640
13927,201770,CEC3289,SWE 619
13928,201770,CEC865,CEIE 639
...,...,...,...
318574,202310,CEC30889,AIT 614
318575,202310,CEC11679,CEIE 605
318576,202310,CEC11679,CEIE 639
318577,202310,CEC11679,CEIE 795


In [5]:
# Getting the latest re_ext_date for each Registration Term
db_enrollment.runQuery(""" --sql
    SELECT reg_term_desc, rec_ext_date
    FROM enrollment4EDA
    WHERE
        reg_term_desc IN ('Fall 2021', 'Spring 2022')
    GROUP BY reg_term_desc, rec_ext_date
    ORDER BY reg_term_code, rec_ext_date
""")

Unnamed: 0,reg_term_desc,rec_ext_date
0,Fall 2021,2021-05-01 00:00:00
1,Fall 2021,2021-05-15 00:00:00
2,Fall 2021,2021-06-01 00:00:00
3,Fall 2021,2021-06-15 00:00:00
4,Fall 2021,2021-07-01 00:00:00
5,Fall 2021,2021-07-15 00:00:00
6,Fall 2021,2021-08-01 00:00:00
7,Fall 2021,2021-08-15 00:00:00
8,Fall 2021,2021-09-01 00:00:00
9,Fall 2021,2021-09-15 00:00:00


In [6]:
# Not a reliable approach for fetching the latest enrollment info

# # Fetching the final snapshot of registration data for each semester
# df = db_enrollment.runQuery(""" --sql
#     SELECT *
#     FROM enrollment4EDA
#     WHERE
#         rec_ext_date IN (
#             SELECT MAX(rec_ext_date) AS LatestSnapshot
#             FROM enrollment4EDA
#             GROUP BY reg_term_desc
#             ORDER BY reg_term_code)
# """)

# # Exporting the final snapshot
# df.to_csv( os.path.join('Data', '02_processed', 'final_snapshot.csv') )
# df.to_pickle( os.path.join('Data', '02_processed', 'final_snapshot.pkl') )

# db = ConnectDB( os.path.join("Data", "02_processed", "final_snapshot.db") )
# df.to_sql("FinalSnapshot", db.connection, if_exists="replace", index=False)

# df.info()

In [7]:
# Checking if data is consistent
df \
    .loc[
        (df['stu_prog_desc'] == "MS Data Analytics Engineering") &
        #(df['stu_admit_term_desc'] == "Fall 2021") &
        (df['stu_visa'] == "F1 Visa") &
        (df['reg_status'].isin(["**Web Registered**", "Wait Listed", "**Registered**"]))
    ] \
        .groupby(['stu_new_ret', 'stu_id', 'reg_term_code', 'stu_admit_term_desc', 'reg_term_desc']) \
            .agg({'rec_id': 'count', 'crs': 'unique'}) \
                .reset_index() \
                    .sort_values(['rec_id']).head(60)

Unnamed: 0,stu_new_ret,stu_id,reg_term_code,stu_admit_term_desc,reg_term_desc,rec_id,crs
0,N,CEC10012,201770,Fall 2016,Fall 2017,0,
3443284,R,CEC20850,202010,Spring 2021,Fall 2021,0,
3443283,R,CEC20850,202010,Spring 2021,Fall 2020,0,
3443282,R,CEC20850,202010,Spring 2021,Fall 2019,0,
3443281,R,CEC20850,202010,Spring 2021,Fall 2018,0,
3443280,R,CEC20850,202010,Spring 2021,Fall 2017,0,
3443279,R,CEC20850,202010,Spring 2020 - COVID-19,Spring 2023,0,
3443278,R,CEC20850,202010,Spring 2020 - COVID-19,Spring 2022,0,
3443277,R,CEC20850,202010,Spring 2020 - COVID-19,Spring 2021,0,
3443276,R,CEC20850,202010,Spring 2020 - COVID-19,Spring 2020 - COVID-19,0,


In [8]:
# Getting total course registration for each student
df \
    .loc[
        (df['stu_prog_desc'] == "MS Data Analytics Engineering") &
        (df['stu_admit_term_desc'] == "Fall 2021") &
        (df['stu_visa'] == "F1 Visa") &
        (df['reg_status'] in ["**Web Registered**", "Wait Listed", "**Registered**"])
    ] \
        .groupby('stu_id', as_index=False) \
            .agg({'crs':'count'}) \
                .sort_values('crs', ascending=True) \
                    .groupby('crs', as_index=False) \
                        .agg({'stu_id':'count'}) \
                            .rename(columns={'stu_id':'count'})

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Getting the latest snapshot for each Registration Term
df = db_enrollment.runQuery(""" --sql
    SELECT stu_id, COUNT(*) AS Count
    FROM enrollment4EDA
    WHERE
        rec_ext_date IN (
            SELECT MAX(rec_ext_date) AS LatestSnapshot
            FROM enrollment4EDA
            GROUP BY reg_term_desc
            ORDER BY reg_term_code)
        AND stu_prog_desc = "MS Data Analytics Engineering"
        AND stu_admit_term_desc = "Fall 2021"
        AND stu_visa = "F1 Visa"
    GROUP BY stu_id
    ORDER BY Count
""")

df.head(60)

Unnamed: 0,stu_id,Count
0,CEC27550,1
1,CEC28239,1
2,CEC26131,2
3,CEC26913,2
4,CEC27189,2
5,CEC27486,2
6,CEC27628,2
7,CEC27747,2
8,CEC23829,3
9,CEC23919,3


## Getting Final Enrollment Data

In [None]:
# This Student is nuts. What's wrong with them? 3-3-3-1 pattern??
db_enrollment.runQuery(""" --sql
    SELECT rec_id, stu_admit_term_desc, stu_id, stu_prog_desc, reg_status_date, reg_term_desc, crs, crs_sect, reg_status, rec_ext_date
FROM enrollment4EDA
WHERE
    stu_id = "CEC25006"
    AND reg_term_desc = "Fall 2022"
ORDER BY
    reg_term_code, rec_ext_date, crs, reg_status_date ;
""").head(60)

NameError: name 'db_enrollment' is not defined

In [74]:
#  Final Enrollment for the student
db_finalEnrollment.runQuery(""" --sql
    SELECT rec_id, stu_admit_term_desc, stu_id, stu_prog_desc, reg_status_date, reg_term_desc, crs, reg_status, rec_ext_date
    FROM enrollmentFinalStatus
    WHERE
        stu_id = "CEC28311"
""")

Unnamed: 0,rec_id,stu_admit_term_desc,stu_id,stu_prog_desc,reg_status_date,reg_term_desc,crs,reg_status,rec_ext_date
0,216223,Spring 2022,CEC28311,MS Computer Science,2021-11-03 00:00:00,Spring 2022,CS 531,**Web Registered**,2022-01-01 00:00:00
1,216754,Spring 2022,CEC28311,MS Computer Science,2021-11-07 00:00:00,Spring 2022,CS 504,**Web Registered**,2022-01-01 00:00:00
2,216755,Spring 2022,CEC28311,MS Computer Science,2021-11-07 00:00:00,Spring 2022,CS 530,**Web Registered**,2022-01-01 00:00:00
3,218234,Spring 2022,CEC28311,MS Computer Science,2022-01-14 00:00:00,Spring 2022,CS 540,**Web Registered**,2022-01-15 00:00:00
4,218235,Spring 2022,CEC28311,MS Computer Science,2022-01-14 00:00:00,Spring 2022,CS 571,Wait Listed,2022-01-15 00:00:00
5,218236,Spring 2022,CEC28311,MS Computer Science,2022-01-14 00:00:00,Spring 2022,CS 583,Wait Listed,2022-01-15 00:00:00
6,222921,Spring 2022,CEC28311,MS Computer Science,2022-01-22 00:00:00,Spring 2022,SWE 622,**Web Registered**,2022-02-01 00:00:00
7,223242,Spring 2022,CEC28311,MS Computer Science,2022-01-25 00:00:00,Spring 2022,CS 550,**Web Registered**,2022-02-01 00:00:00
8,223637,Spring 2022,CEC28311,MS Computer Science,2022-01-30 00:00:00,Spring 2022,SWE 621,Wait Listed,2022-02-01 00:00:00
9,232635,Spring 2022,CEC28311,MS Computer Science,2021-11-03 00:00:00,Spring 2022,CS 580,**Web Registered**,2021-11-04 00:00:00


In [46]:
# Testing if the way we created the final enrollment status is correct
db_enrollment.runQuery(""" --sql
    SELECT reg_term_desc, stu_admit_term_desc, stu_id, crs, MAX(reg_status_date) as final_reg_date, reg_status
    FROM enrollment4EDA
    WHERE
        stu_visa = "F1 Visa"
    GROUP BY reg_term_desc, stu_id, crs
    ORDER BY reg_term_code DESC, stu_id, final_reg_date, crs;
""").head(60)


Unnamed: 0,reg_term_desc,stu_admit_term_desc,stu_id,crs,final_reg_date,reg_status
0,Spring 2023,Fall 2021,CEC10635,DAEN 690,2022-12-20 00:00:00,**Web Registered**
1,Spring 2023,Spring 2021,CEC11382,SYST 514,2022-11-14 00:00:00,**Web Registered**
2,Spring 2023,Spring 2021,CEC11382,SYST 699,2023-01-04 00:00:00,**Web Registered**
3,Spring 2023,Fall 2021,CEC14339,ECE 555,2023-01-15 00:00:00,**Web Registered**
4,Spring 2023,Fall 2021,CEC14339,ECE 615,2023-01-15 00:00:00,**Web Registered**
5,Spring 2023,Fall 2021,CEC14339,ECE 612,2023-01-24 00:00:00,**Web Registered**
6,Spring 2023,Fall 2021,CEC14339,ECE 698,2023-01-30 00:00:00,**Web Registered**
7,Spring 2023,Fall 2021,CEC14339,ECE 795,2023-01-30 00:00:00,**Web Registered**
8,Spring 2023,Fall 2021,CEC14339,ECE 797,2023-01-30 00:00:00,**Web Registered**
9,Spring 2023,Spring 2022,CEC15381,CS 663,2022-11-01 00:00:00,**Web Registered**


In [11]:
# Checking number of classes registered by International Students
df = db_finalEnrollment.runQuery(""" --sql
    SELECT stu_id, COUNT(DISTINCT crs) as num_crs
    FROM enrollmentFinalStatus
    WHERE
                stu_visa = "F1 Visa"
        AND     stu_prog_desc = "MS Data Analytics Engineering"
        AND     reg_status IN ("**Web Registered**", "**Registered**")
        AND     stu_admit_term_year = reg_term_year
        AND     stu_admit_term_year > 2020
    GROUP BY stu_id
    ORDER BY num_crs DESC;
""")

df
# df.groupby('num_crs').count()
# df.plot.hist(bins=100, figsize=(10, 5))

Unnamed: 0,stu_id,num_crs
0,CEC24702,12
1,CEC22693,12
2,CEC21786,12
3,CEC28750,11
4,CEC22896,11
...,...,...
654,CEC28058,1
655,CEC27607,1
656,CEC27550,1
657,CEC27460,1


## Enrollment Numbers for International Students

In [17]:
query1 = db_finalEnrollment.runQuery(""" --sql
    SELECT
        stu_admit_term_desc, reg_term_desc,
        COUNT(DISTINCT stu_id) AS 'Total International Students Admitted',
        SUM(CASE WHEN crs = 'AIT 580' THEN 1 ELSE 0 END) AS 'AIT 580',
        SUM(CASE WHEN crs = 'CS 504' THEN 1 ELSE 0 END) AS 'CS 504',
        SUM(CASE WHEN crs = 'STAT 515' THEN 1 ELSE 0 END) AS 'STAT 515',
        SUM(CASE WHEN crs = 'OR 531' THEN 1 ELSE 0 END) AS 'OR 531'
    FROM EnrollmentFinalStatus
    WHERE
                stu_visa = 'F1 Visa'
        AND     stu_prog_desc = 'MS Data Analytics Engineering'
        AND     stu_admit_term_year > 2017
        AND     reg_status IN ("**Web Registered**", "Wait Listed", "**Registered**")
    GROUP BY
        stu_admit_term_desc, reg_term_desc
    ORDER BY
        stu_admit_term_code, reg_term_code
""")

query1

Unnamed: 0,stu_admit_term_desc,reg_term_desc,Total International Students Admitted,AIT 580,CS 504,STAT 515,OR 531
0,Spring 2018,Spring 2018,40,21,24,22,18
1,Spring 2018,Fall 2018,41,5,12,2,3
2,Spring 2018,Spring 2019,36,1,8,0,1
3,Spring 2018,Fall 2019,13,0,1,0,0
4,Spring 2018,Spring 2020 - COVID-19,1,0,0,0,0
5,Fall 2018,Fall 2018,83,63,41,58,51
6,Fall 2018,Spring 2019,78,11,50,8,11
7,Fall 2018,Fall 2019,79,1,12,2,2
8,Fall 2018,Spring 2020 - COVID-19,59,0,0,0,0
9,Fall 2018,Fall 2020,3,0,0,0,0


In [26]:
db_finalEnrollment.runQuery(""" --sql
    SELECT stu_admit_term_desc, COUNT(DISTINCT stu_id) AS 'Incoming Students',3*COUNT(DISTINCT stu_id) AS 'Expected Course Registrations'
    FROM EnrollmentFinalStatus
    WHERE
                stu_visa = 'F1 Visa'
        AND     stu_prog_desc = 'MS Data Analytics Engineering'
        AND     stu_admit_term_year > 2017
        AND     reg_status IN ("**Web Registered**", "**Registered**")
    GROUP BY stu_admit_term_desc
    ORDER BY stu_admit_term_code
""")

Unnamed: 0,stu_admit_term_desc,Incoming Students,Expected Course Registrations
0,Spring 2018,44,132
1,Fall 2018,89,267
2,Spring 2019,50,150
3,Fall 2019,83,249
4,Spring 2020 - COVID-19,69,207
5,Fall 2020,54,162
6,Spring 2021,85,255
7,Fall 2021,148,444
8,Spring 2022,118,354
9,Fall 2022,190,570


In [25]:
db_finalEnrollment.runQuery(""" --sql
    SELECT stu_admit_term_desc, reg_term_desc, COUNT(stu_id) AS 'Total Course Registrations'
    FROM EnrollmentFinalStatus
    WHERE
                stu_visa = 'F1 Visa'
        AND     stu_prog_desc = 'MS Data Analytics Engineering'
        AND     stu_admit_term_year > 2017
        AND     reg_status IN ("**Web Registered**", "**Registered**")
    GROUP BY stu_admit_term_desc, reg_term_desc
    ORDER BY stu_admit_term_code, reg_term_code
""")

Unnamed: 0,stu_admit_term_desc,reg_term_desc,Total Course Registrations
0,Spring 2018,Spring 2018,133
1,Spring 2018,Fall 2018,187
2,Spring 2018,Spring 2019,119
3,Spring 2018,Fall 2019,37
4,Spring 2018,Spring 2020 - COVID-19,2
5,Fall 2018,Fall 2018,317
6,Fall 2018,Spring 2019,315
7,Fall 2018,Fall 2019,324
8,Fall 2018,Spring 2020 - COVID-19,154
9,Fall 2018,Fall 2020,4
