In [22]:
# importing required libraries
import os
import glob
import pandas as pd
import sqlite3

In [23]:
# Merging the data from all the files in the folder
_ = os.sep
files = glob.glob(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"01_raw"+_+"*.csv")

# Reading each file, adding filename as a column and appending to a pandas dataframe
df_enrollment = pd.read_csv(files[0])
df_enrollment['file_name'] = files[0]
for file in files[1:]:
    data = pd.read_csv(file)
    data['file_name'] = file
    df_enrollment = pd.concat([df_enrollment, data], ignore_index=True)

# Shape of the final dataframe
df_enrollment.shape

(344119, 29)

In [24]:
# renaming all the columns in the dataframe
rename_dict = {
    'Unnamed: 0'                        : 'file_index',
    'Stu Term'                          : 'reg_term_code',
    'Unnamed: 1'                        : 'reg_term_desc',
    'Stu Admit Term'                    : 'stu_admit_term_code',
    'Unnamed: 3'                        : 'stu_admit_term_desc',
    'Record ID'                         : 'stu_id',
    'Stu Visa Type'                     : 'stu_visa',
    'Stu Attribute BAM'                 : 'stu_bam',
    'Stu Admit Type'                    : 'stu_prog_level',
    'Stu Primary Degree Level'          : 'stu_deg_level',
    'Stu Primary Major 1 College'       : 'stu_college',
    'Stu Primary Program'               : 'stu_prog_desc',
    'Stu Primary Program Code'          : 'stu_prog_code',
    'Stu Primary Department'            : 'stu_dept',
    'Unnamed: 14'                       : 'stu_dept_desc',
    'Stu New/Returning Ind'             : 'stu_new_ret',
    'Stu Residency Group'               : 'stu_res',
    'Course Sect College'               : 'crs_sect_clg',
    'Stu Registered Ind'                : 'stu_act_reg_ind',
    'Registration Status'               : 'reg_status',
    'Registration Status Date'          : 'reg_status_date',
    'Course Desc'                       : 'crs',
    'Course Section'                    : 'crs_sect',
    'Course Sect Schedule Type'         : 'crs_type',
    'Course Sect Wiley Courses Ind'     : 'crs_sect_wiley_ind',
    'Course Sect Credits'               : 'crs_credits',
    'Stu Course Registered Hours'       : 'crs_hours',
    'Course Sect Instruction Delivery Method Group' : 'crs_sect_modality',
}
df_enrollment.rename(columns=rename_dict, inplace=True)

In [25]:
# Extracting features from the dataframe
df_enrollment['reg_term_name'] = df_enrollment['reg_term_desc'].str.split(' ').str[0]
df_enrollment['reg_term_year'] = df_enrollment['reg_term_desc'].str.split(' ').str[1]
df_enrollment['stu_admit_term_name'] = df_enrollment['stu_admit_term_desc'].str.split(' ').str[0]
df_enrollment['stu_admit_term_year'] = df_enrollment['stu_admit_term_desc'].str.split(' ').str[1]

df_enrollment['rec_ext_date'] = df_enrollment['file_name'].str.split("_").str[-2]
df_enrollment['rec_ext_date'] = df_enrollment['rec_ext_date'].str.replace(".", "/", regex=False)

In [26]:
# Cleaning the data

df_enrollment['stu_id'].dropna(inplace=True)
df_enrollment['stu_visa'].fillna("Not Relevent", inplace=True)

df_enrollment['stu_bam'].replace("'--", "Not BAM", inplace=True)
df_enrollment['stu_dept'].replace("'-----", "No Value", inplace=True)
df_enrollment['crs_sect_wiley_ind'].replace("'--", "No Value", inplace=True)

df_enrollment['crs_credits'] = df_enrollment['crs_credits'].astype(str)

df_enrollment['crs_credits'].replace("6-Jan", "1-6", inplace=True)
df_enrollment['crs_credits'].replace("3-Jan", "1-3", inplace=True)
df_enrollment['crs_credits'].replace("18-Jan", "1-18", inplace=True)
df_enrollment['crs_credits'].replace("4-Jan", "1-4", inplace=True)
df_enrollment['crs_credits'].replace("0,3", "0-3", inplace=True)

In [27]:
# sorting the dataframe by term code and registration status date
df_enrollment = df_enrollment.sort_values(
    by=['reg_term_code', 'rec_ext_date', 'reg_status_date']
)

# resetting the index after sorting
df_enrollment = df_enrollment.reset_index(drop=True)

# creating a new rec_id column for identifying each record
df_enrollment['rec_id'] = df_enrollment.index

In [28]:
# Changing the data type of the columns
df_enrollment['rec_ext_date']           = pd.to_datetime(df_enrollment['rec_ext_date'])
df_enrollment['reg_term_code']          = df_enrollment['reg_term_code'].astype(str)
df_enrollment['reg_term_year']          = df_enrollment['reg_term_year'].astype(int)
df_enrollment['reg_term_name']          = df_enrollment['reg_term_name'].astype('category')
df_enrollment['stu_new_ret']            = df_enrollment['stu_new_ret'].astype('category')
df_enrollment['stu_deg_level']          = df_enrollment['stu_deg_level'].astype('category')
df_enrollment['stu_college']            = df_enrollment['stu_college'].astype('category')
df_enrollment['stu_prog_desc']          = df_enrollment['stu_prog_desc'].astype('category')
df_enrollment['stu_prog_level']         = df_enrollment['stu_prog_level'].astype('category')
df_enrollment['stu_dept']               = df_enrollment['stu_dept'].astype('category')
df_enrollment['stu_dept_desc']          = df_enrollment['stu_dept_desc'].astype('category')
df_enrollment['stu_admit_term_code']    = df_enrollment['stu_admit_term_code'].astype(str)
df_enrollment['stu_admit_term_year']    = df_enrollment['stu_admit_term_year'].astype(int)
df_enrollment['stu_admit_term_name']    = df_enrollment['stu_admit_term_name'].astype('category')
df_enrollment['stu_res']                = df_enrollment['stu_res'].astype('category')
df_enrollment['stu_visa']               = df_enrollment['stu_visa'].astype('category')
df_enrollment['stu_bam']                = df_enrollment['stu_bam'].astype('category')
df_enrollment['crs_sect_clg']           = df_enrollment['crs_sect_clg'].astype('category')
df_enrollment['crs_type']               = df_enrollment['crs_type'].astype('category')
df_enrollment['crs_sect_modality']      = df_enrollment['crs_sect_modality'].astype('category')
df_enrollment['crs_sect_wiley_ind']     = df_enrollment['crs_sect_wiley_ind'].astype('category')
df_enrollment['crs_credits']            = df_enrollment['crs_credits'].astype('category')
df_enrollment['crs_hours']              = df_enrollment['crs_hours'].astype('category')
df_enrollment['stu_act_reg_ind']        = df_enrollment['stu_act_reg_ind'].astype('category')
df_enrollment['reg_status']             = df_enrollment['reg_status'].astype('category')
df_enrollment['reg_status_date']        = pd.to_datetime(df_enrollment['reg_status_date'])

In [29]:
# reordering the columns
df_enrollment = df_enrollment[[
    # Records Info
    'rec_id', 'rec_ext_date', 'file_name', 'file_index',
    # Registration Term/Semester Info
    'reg_term_code', 'reg_term_year', 'reg_term_name', 'reg_term_desc',
    # Student Info
    'stu_id', 'stu_deg_level', 'stu_college', 'stu_res', 'stu_visa', 'stu_bam', 'stu_new_ret',
    'stu_dept', 'stu_dept_desc', 'stu_prog_code', 'stu_prog_level', 'stu_prog_desc',
    'stu_admit_term_code', 'stu_admit_term_year', 'stu_admit_term_name', 'stu_admit_term_desc',
    # Course Info
    'crs', 'crs_type', 'crs_credits', 'crs_hours',
    'crs_sect', 'crs_sect_clg', 'crs_sect_modality', 'crs_sect_wiley_ind',
    # Registration Status Info
    'reg_status', 'reg_status_date', 'stu_act_reg_ind'
]]

In [30]:
df_enrollment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344119 entries, 0 to 344118
Data columns (total 35 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   rec_id               344119 non-null  int64         
 1   rec_ext_date         344119 non-null  datetime64[ns]
 2   file_name            344119 non-null  object        
 3   file_index           344119 non-null  int64         
 4   reg_term_code        344119 non-null  object        
 5   reg_term_year        344119 non-null  int32         
 6   reg_term_name        344119 non-null  category      
 7   reg_term_desc        344119 non-null  object        
 8   stu_id               344118 non-null  object        
 9   stu_deg_level        344119 non-null  category      
 10  stu_college          344119 non-null  category      
 11  stu_res              344119 non-null  category      
 12  stu_visa             344119 non-null  category      
 13  stu_bam       

In [31]:
df_enrollment.sample(5).T

Unnamed: 0,271448,147124,177293,128475,247428
rec_id,271448,147124,177293,128475,247428
rec_ext_date,2022-07-15 00:00:00,2020-07-01 00:00:00,2020-12-01 00:00:00,2019-11-06 00:00:00,2022-05-01 00:00:00
file_name,..\..\Data\EnrollmentData\01_raw\CEC Graduate ...,..\..\Data\EnrollmentData\01_raw\CEC Graduate ...,..\..\Data\EnrollmentData\01_raw\CEC Graduate ...,..\..\Data\EnrollmentData\01_raw\CEC Graduate ...,..\..\Data\EnrollmentData\01_raw\CEC Graduate ...
file_index,2846,1688,413,1178,215
reg_term_code,202270,202070,202110,202010,202270
reg_term_year,2022,2020,2021,2020,2022
reg_term_name,Fall,Fall,Spring,Spring,Fall
reg_term_desc,Fall 2022,Fall 2020,Spring 2021,Spring 2020 - COVID-19,Fall 2022
stu_id,CEC28097,CEC15059,CEC6503,CEC17100,CEC17985
stu_deg_level,Master,Master,Master,Master,Master


In [32]:
df_enrollment['file_name'][0]

'..\\..\\Data\\EnrollmentData\\01_raw\\CEC Graduate Registrations for DAEN Capstone_05.01.2017_Fall17.csv'

In [33]:
# Saving the dataframe as a pickle file
df_enrollment.to_pickle(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"02_intermediate"+_+"enrollment.pkl")

In [14]:
# Saving the dataframe to a csv file
df_enrollment.to_csv(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"02_intermediate"+_+"enrollment.csv", index=False)

In [13]:
# Saving the dataframe to a new sqlite database

conn = sqlite3.connect(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"02_intermediate"+_+"enrollment4EDA.db")
cur = conn.cursor()

df_enrollment.to_sql('enrollment4EDA', conn, if_exists='replace', index=False)

344119

# Normalizing the data
Will work on it later, once the Web Scraping is done on the Course Data
Add an image below:

![EER_Enrollment](Code\DataEngineering\EER_Enrollment.png)

![EER_Enrollment](..\..\Code\DataEngineering\EER_Enrollment.png)

In [13]:
# # Creating a new SQLite database
# conn = sqlite3.connect(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"02_intermediate"+_+"enrollment.db")
# cur = conn.cursor()

In [None]:
# # Creating the tables in the database
# conn.execute('''
#     CREATE TABLE IF NOT EXISTS

# EDA

In [15]:
# getting the number of unique values in each column for the entire dataframe
for col in df_enrollment.columns.to_list():
    unique_values = df_enrollment[col].nunique()
    if unique_values < 50:
        print(" "*10, col)
        print(df_enrollment[col].value_counts())
        print("-"*50)
    else:
        print(" "*10, col)
        print(f"Skipping '{col}' column as it has {unique_values} unique values.")
        print("-"*50)

           rec_id
Skipping 'rec_id' column as it has 344119 unique values.
--------------------------------------------------
           rec_ext_date
Skipping 'rec_ext_date' column as it has 106 unique values.
--------------------------------------------------
           file_name
Skipping 'file_name' column as it has 107 unique values.
--------------------------------------------------
           file_index
Skipping 'file_index' column as it has 6960 unique values.
--------------------------------------------------
           reg_term_code
202270    55420
202310    44755
202170    33471
202210    30433
201970    29295
202070    26223
201870    24396
201770    24376
202010    21603
201910    20067
201810    17266
202110    16814
Name: reg_term_code, dtype: int64
--------------------------------------------------
           reg_term_year
2022    85853
2021    50285
2019    49362
2020    47826
2023    44755
2018    41662
2017    24376
Name: reg_term_year, dtype: int64
-------------------

In [21]:
# getting the number of unique values in each column and for each 'stu_id'
for col in df_enrollment.columns.to_list():
    unique_values = df_enrollment[col].nunique()
    if unique_values < 50:
        print(
            df_enrollment[[col, 'stu_id']].drop_duplicates().value_counts(col).sort_index().rename(f"Unique number of Students in '{col}' column.")
        )
        print("-"*50)
    else:
        print(" "*10, col)
        print(f"Skipping '{col}' column as it has {unique_values} unique values.")
        print("-"*50)

           rec_id
Skipping 'rec_id' column as it has 344119 unique values.
--------------------------------------------------
           rec_ext_date
Skipping 'rec_ext_date' column as it has 106 unique values.
--------------------------------------------------
           file_name
Skipping 'file_name' column as it has 107 unique values.
--------------------------------------------------
           file_index
Skipping 'file_index' column as it has 6960 unique values.
--------------------------------------------------
reg_term_code
201770    1177
201810    1183
201870    1209
201910    1269
201970    1439
202010    1371
202070    1335
202110    1307
202170    1593
202210    1661
202270    2222
202310    2193
Name: Unique number of Students in 'reg_term_code' column., dtype: int64
--------------------------------------------------
reg_term_year
2017    1177
2018    1618
2019    1842
2020    1785
2021    2029
2022    2585
2023    2193
Name: Unique number of Students in 'reg_term_year' colu

We only need to look at the 2 dates to get the final course registration outcome:
- September 15th for the Fall Semester
- February 15th for the Spring Semester

In [53]:
df_enrollment[['reg_term_code', 'rec_ext_date']].value_counts().reset_index().sort_values(by=['reg_term_code', 'rec_ext_date']).tail(50)

Unnamed: 0,reg_term_code,rec_ext_date,0
75,202070,2020-06-15,2513
62,202070,2020-07-01,2664
48,202070,2020-07-15,2830
41,202070,2020-08-01,3026
39,202070,2020-08-15,3039
47,202070,2020-09-01,2835
60,202070,2020-09-15,2713
106,202110,2020-11-05,1482
98,202110,2020-11-15,2060
88,202110,2020-12-01,2298
