# Initiation

In [1]:
# importing required libraries
import os
import glob
import pandas as pd
import sqlite3
from file_paths import *

In [3]:
# Defining the path to the source folder
src = filepath("..", "..")

# Importing Data

In [6]:
# Merging the Enrollment data from all the CSV files in the EnrollmentData folder
files = glob.glob(
    filepath(src, "Data", "01_raw", "EnrollmentData", "*.csv")
)

files
# # Reading each file, adding filename as a column and appending to a pandas dataframe
# df_enrollment = pd.read_csv(files[0])
# df_enrollment['file_name'] = files[0]
# for file in files[1:]:
#     data = pd.read_csv(file)
#     data['file_name'] = file
#     df_enrollment = pd.concat([df_enrollment, data], ignore_index=True)

# # Shape of the final dataframe
# df_enrollment.shape

[]

In [3]:
# renaming all the columns in the dataframe
rename_dict = {
    'Unnamed: 0'                        : 'file_index',
    'Stu Term'                          : 'reg_term_code',
    'Unnamed: 1'                        : 'reg_term_desc',
    'Stu Admit Term'                    : 'stu_admit_term_code',
    'Unnamed: 3'                        : 'stu_admit_term_desc',
    'Record ID'                         : 'stu_id',
    'Stu Visa Type'                     : 'stu_visa',
    'Stu Attribute BAM'                 : 'stu_bam',
    'Stu Admit Type'                    : 'stu_prog_level',
    'Stu Primary Degree Level'          : 'stu_deg_level',
    'Stu Primary Major 1 College'       : 'stu_college',
    'Stu Primary Program'               : 'stu_prog_desc',
    'Stu Primary Program Code'          : 'stu_prog_code',
    'Stu Primary Department'            : 'stu_dept',
    'Unnamed: 14'                       : 'stu_dept_desc',
    'Stu New/Returning Ind'             : 'stu_new_ret',
    'Stu Residency Group'               : 'stu_res',
    'Course Sect College'               : 'crs_sect_clg',
    'Stu Registered Ind'                : 'stu_act_reg_ind',
    'Registration Status'               : 'reg_status',
    'Registration Status Date'          : 'reg_status_date',
    'Course Desc'                       : 'crs',
    'Course Section'                    : 'crs_sect',
    'Course Sect Schedule Type'         : 'crs_type',
    'Course Sect Wiley Courses Ind'     : 'crs_sect_wiley_ind',
    'Course Sect Credits'               : 'crs_credits',
    'Stu Course Registered Hours'       : 'crs_hours',
    'Course Sect Instruction Delivery Method Group' : 'crs_sect_modality',
}
df_enrollment.rename(columns=rename_dict, inplace=True)

In [4]:
# Extracting features from the dataframe
df_enrollment['reg_term_name'] = df_enrollment['reg_term_desc'].str.split(' ').str[0]
df_enrollment['reg_term_year'] = df_enrollment['reg_term_desc'].str.split(' ').str[1]
df_enrollment['stu_admit_term_name'] = df_enrollment['stu_admit_term_desc'].str.split(' ').str[0]
df_enrollment['stu_admit_term_year'] = df_enrollment['stu_admit_term_desc'].str.split(' ').str[1]

df_enrollment['rec_ext_date'] = df_enrollment['file_name'].str.split("_").str[-2]
df_enrollment['rec_ext_date'] = df_enrollment['rec_ext_date'].str.replace(".", "/", regex=False)

In [5]:
# Cleaning the data

df_enrollment['stu_id'].dropna(inplace=True)
df_enrollment['stu_visa'].fillna("Not Relevent", inplace=True)

df_enrollment['stu_bam'].replace("'--", "Not BAM", inplace=True)
df_enrollment['stu_dept'].replace("'-----", "No Value", inplace=True)
df_enrollment['crs_sect_wiley_ind'].replace("'--", "No Value", inplace=True)

df_enrollment['crs_credits'] = df_enrollment['crs_credits'].astype(str)

df_enrollment['crs_credits'].replace("6-Jan", "1-6", inplace=True)
df_enrollment['crs_credits'].replace("3-Jan", "1-3", inplace=True)
df_enrollment['crs_credits'].replace("18-Jan", "1-18", inplace=True)
df_enrollment['crs_credits'].replace("4-Jan", "1-4", inplace=True)
df_enrollment['crs_credits'].replace("0,3", "0-3", inplace=True)

In [6]:
# sorting the dataframe by term code and registration status date
df_enrollment = df_enrollment.sort_values(
    by=['reg_term_code', 'rec_ext_date', 'reg_status_date']
)

# resetting the index after sorting
df_enrollment = df_enrollment.reset_index(drop=True)

# creating a new rec_id column for identifying each record
df_enrollment['rec_id'] = df_enrollment.index

In [7]:
# Changing the data type of the columns
df_enrollment['rec_ext_date']           = pd.to_datetime(df_enrollment['rec_ext_date'])
df_enrollment['reg_term_code']          = df_enrollment['reg_term_code'].astype(str)
df_enrollment['reg_term_year']          = df_enrollment['reg_term_year'].astype(int)
df_enrollment['reg_term_name']          = df_enrollment['reg_term_name'].astype('category')
df_enrollment['stu_new_ret']            = df_enrollment['stu_new_ret'].astype('category')
df_enrollment['stu_deg_level']          = df_enrollment['stu_deg_level'].astype('category')
df_enrollment['stu_college']            = df_enrollment['stu_college'].astype('category')
df_enrollment['stu_prog_desc']          = df_enrollment['stu_prog_desc'].astype('category')
df_enrollment['stu_prog_level']         = df_enrollment['stu_prog_level'].astype('category')
df_enrollment['stu_dept']               = df_enrollment['stu_dept'].astype('category')
df_enrollment['stu_dept_desc']          = df_enrollment['stu_dept_desc'].astype('category')
df_enrollment['stu_admit_term_code']    = df_enrollment['stu_admit_term_code'].astype(str)
df_enrollment['stu_admit_term_year']    = df_enrollment['stu_admit_term_year'].astype(int)
df_enrollment['stu_admit_term_name']    = df_enrollment['stu_admit_term_name'].astype('category')
df_enrollment['stu_res']                = df_enrollment['stu_res'].astype('category')
df_enrollment['stu_visa']               = df_enrollment['stu_visa'].astype('category')
df_enrollment['stu_bam']                = df_enrollment['stu_bam'].astype('category')
df_enrollment['crs_sect_clg']           = df_enrollment['crs_sect_clg'].astype('category')
df_enrollment['crs_type']               = df_enrollment['crs_type'].astype('category')
df_enrollment['crs_sect_modality']      = df_enrollment['crs_sect_modality'].astype('category')
df_enrollment['crs_sect_wiley_ind']     = df_enrollment['crs_sect_wiley_ind'].astype('category')
df_enrollment['crs_credits']            = df_enrollment['crs_credits'].astype('category')
df_enrollment['crs_hours']              = df_enrollment['crs_hours'].astype('category')
df_enrollment['stu_act_reg_ind']        = df_enrollment['stu_act_reg_ind'].astype('category')
df_enrollment['reg_status']             = df_enrollment['reg_status'].astype('category')
df_enrollment['reg_status_date']        = pd.to_datetime(df_enrollment['reg_status_date'])

In [8]:
# reordering the columns
df_enrollment = df_enrollment[[
    # Records Info
    'rec_id', 'rec_ext_date', 'file_name', 'file_index',
    # Registration Term/Semester Info
    'reg_term_code', 'reg_term_year', 'reg_term_name', 'reg_term_desc',
    # Student Info
    'stu_id', 'stu_deg_level', 'stu_college', 'stu_res', 'stu_visa', 'stu_bam', 'stu_new_ret',
    'stu_dept', 'stu_dept_desc', 'stu_prog_code', 'stu_prog_level', 'stu_prog_desc',
    'stu_admit_term_code', 'stu_admit_term_year', 'stu_admit_term_name', 'stu_admit_term_desc',
    # Course Info
    'crs', 'crs_type', 'crs_credits', 'crs_hours',
    'crs_sect', 'crs_sect_clg', 'crs_sect_modality', 'crs_sect_wiley_ind',
    # Registration Status Info
    'reg_status', 'reg_status_date', 'stu_act_reg_ind'
]]

In [9]:
df_enrollment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344119 entries, 0 to 344118
Data columns (total 35 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   rec_id               344119 non-null  int64         
 1   rec_ext_date         344119 non-null  datetime64[ns]
 2   file_name            344119 non-null  object        
 3   file_index           344119 non-null  int64         
 4   reg_term_code        344119 non-null  object        
 5   reg_term_year        344119 non-null  int32         
 6   reg_term_name        344119 non-null  category      
 7   reg_term_desc        344119 non-null  object        
 8   stu_id               344118 non-null  object        
 9   stu_deg_level        344119 non-null  category      
 10  stu_college          344119 non-null  category      
 11  stu_res              344119 non-null  category      
 12  stu_visa             344119 non-null  category      
 13  stu_bam       

In [10]:
df_enrollment.sample(5).T

Unnamed: 0,98316,256583,84100,324258,158457
rec_id,98316,256583,84100,324258,158457
rec_ext_date,2019-07-01 00:00:00,2022-06-01 00:00:00,2018-12-15 00:00:00,2022-11-02 00:00:00,2020-09-01 00:00:00
file_name,..\TeamProphecyRawData\EnrollmentData\CEC Grad...,..\TeamProphecyRawData\EnrollmentData\CEC Grad...,..\TeamProphecyRawData\EnrollmentData\CEC Grad...,..\TeamProphecyRawData\EnrollmentData\CEC Grad...,..\TeamProphecyRawData\EnrollmentData\CEC Grad...
file_index,1405,2869,367,3654,2107
reg_term_code,201970,202270,201910,202310,202070
reg_term_year,2019,2022,2019,2023,2020
reg_term_name,Fall,Fall,Spring,Spring,Fall
reg_term_desc,Fall 2019,Fall 2022,Spring 2019,Spring 2023,Fall 2020
stu_id,CEC16807,CEC28109,CEC41,CEC31513,CEC21746
stu_deg_level,Master,Master,Master,Master,Master


In [11]:
# Saving the dataframe as a pickle file
df_enrollment.to_pickle(src+_+"Data"+_+"EnrollmentData"+_+"enrollment.pkl")

In [12]:
# Saving the dataframe to a csv file
df_enrollment.to_csv(src+_+"Data"+_+"EnrollmentData"+_+"enrollment.csv", index=False)

In [13]:
# Saving the dataframe to a new sqlite database

conn = sqlite3.connect(src+_+"Data"+_+"EnrollmentData"+_+"enrollment4EDA.db")
cur = conn.cursor()

df_enrollment.to_sql('enrollment4EDA', conn, if_exists='replace', index=False)

344119

# Normalizing the data
Will work on it later, once the Web Scraping is done on the Course Data
Add an image below:

![EER_Enrollment](Code\DataEngineering\EER_Enrollment.png)

![EER_Enrollment](..\..\Code\DataEngineering\EER_Enrollment.png)

In [14]:
# # Creating a new SQLite database
# conn = sqlite3.connect(".."+_+".."+_+"Data"+_+"EnrollmentData"+_+"02_intermediate"+_+"enrollment.db")
# cur = conn.cursor()

In [15]:
# # Creating the tables in the database
# conn.execute('''
#     CREATE TABLE IF NOT EXISTS