# Data Engineering
This notebook merges the data from the different sources into a single SQLite database. The database is normalized to 3NF.

In [1]:
# importing required libraries
import os
import pandas as pd

# Resolve the project root (two levels above the working directory at first run)
# only once per kernel session and chdir to that absolute path. A bare relative
# chdir('../..') is not idempotent: re-running this cell would climb two more
# directories each time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join('..', '..'))
os.chdir(PROJECT_ROOT)

# Project-local imports stay after the chdir, as in the original cell
# (presumably the `Code` package is resolved relative to the new working
# directory -- confirm).
from Code.src.modules.dataManager   import DataManager
from Code.src.modules.db_ops        import ConnectDB

# Shared data-access helper used by the loading cells below.
DM = DataManager()

In [2]:
# Load every dataset through the shared DataManager: the 'db' kind first,
# then the 'pkl' kind, in the same request order as before.
db_enrollment, db_professor, db_course, db_enrollmentFinalStatus = [
    DM.get_data(source_name, 'db')
    for source_name in ('enrollment', 'professor', 'course', 'enrollmentFinalStatus')
]

# No 'pkl' counterpart is requested for 'course'.
df_enrollment, df_professor, df_enrollmentFinalStatus = [
    DM.get_data(source_name, 'pkl')
    for source_name in ('enrollment', 'professor', 'enrollmentFinalStatus')
]

## Data Prep for merging and normalizing

The sections below list all the variables found in the different databases, grouped into entities and their attributes.
The variables are formatted as follows:
#### Entities
- `var_name`                            - `Primary Key`
- **var_name** - **FK to** `var_name`   - Foreign Key to another entity
- ~~var_name~~                          - Unnecessary variable

### Semesters / Terms
- `term_code`
- term_year
- term_name
- term_desc

### Degree Levels (Only 1 level in this dataset)
- `deg_level`

### Campus
- `campus_id`

### Colleges
- `col_id`

### Departments
- `dept_id`
- dept_desc

### Programs
- `prog_code`
- prog_url
- prog_type
- prog_school (Is this the same as college?)
- prog_desc
- prog_level

### Instructors
- `inst_name`
- instr_home_org

### Courses
- `crs_id`
- crs_type
- crs_credits
- crs_hours

### Program Course Offerings
- `prog_req_id`            - `Surrogate Key`
- **`prog_code`**          - **FK to** `prog_code`
- **`crs_id`**             - **FK to** `crs_id`
- prog_req_crs

### Course Prerequisites
- `prereq_id`               - `Surrogate Key`
- **`crs`**                 - **FK to** `crs_id`
- **`crs_prereq_req`**      - **FK to** `crs_id`

### Course Sections
- `sect_id`
- **sect_crs**              - **FK to** `crs_id`
- **sect_clg**              - **FK to** `col_id`
- **sect_dept**             - **FK to** `dept_id`
- **sect_camp**             - **FK to** `campus_id`
- sect_mode
- sect_status
- sect_lvl
- **sect_instr**            - **FK to** `inst_name`
- ~~sect_wiley~~

### Student Details
- `stu_id`
- stu_name
- stu_admit_term_code
- **stu_deg_level**         - **FK to** `deg_level`
- **stu_college**           - **FK to** `col_id`
- **stu_dept**              - **FK to** `dept_id`
- **stu_prog**              - **FK to** `prog_code`
- stu_res
- stu_visa
- ~~stu_bam~~

### Registration Status
- `reg_id`                  - `Surrogate Key`
- **`reg_term_code`**       - **FK to** `term_code`
- **`reg_stu_id`**          - **FK to** `stu_id`
- **`sect_id`**             - **FK to** `sect_id`
- reg_status
- reg_status_date
- reg_final
- ~~reg_sct_reg_ind~~
- ~~stu_new_ret~~

### Course Cumulative Enrollment
- `cum_id`                  - `Surrogate Key`
- **`cum_term_code`**       - **FK to** `term_code`
- **`cum_sect_id`**         - **FK to** `sect_id`
- cum_seat_enroll
- cum_seat_wait
- cum_seat_avail

### Registration Log Record (A Registration Record is recorded at frequent intervals for each student)
- `rec_id`
- **reg_id**                - **FK to** `reg_id`
- rec_ext_date
- ~~file_name~~
- ~~file_index~~

In [48]:
# Short aliases consumed by the exploratory cells below:
# `df` -> the professor 'pkl' data, `db` -> the course 'db' handle.
df, db = df_professor, db_course

In [27]:
# Frequency of each distinct value in the 'Course Sect Term Count' column.
# NOTE(review): this cell's execution count (In [27]) is lower than that of the
# cell binding `df` (In [48]), so `df` may have pointed at a different frame when
# the saved output was produced -- confirm on a fresh Restart & Run All.
df['Course Sect Term Count'].value_counts()

1    25601
Name: Course Sect Term Count, dtype: int64

In [20]:
# Distinct values present in the crs_sect_camp column.
# NOTE(review): executed as In [20], i.e. before `df` was bound in In [48];
# confirm which frame this was actually run against.
df.crs_sect_camp.unique()

['Fairfax', 'Online', 'SciTech - Prince William', 'Off-campus/Other', 'Arlington', 'Loudoun', 'GMU Korea', 'Study Abroad', 'Front Royal']
Categories (9, object): ['Arlington', 'Fairfax', 'Front Royal', 'GMU Korea', ..., 'Off-campus/Other', 'Online', 'SciTech - Prince William', 'Study Abroad']

In [49]:
# List the lookup_class_to_course rows where STAT554 is flagged required_class = 1
# (display capped at the first 60 rows).
db.runQuery("""
    SELECT *
    FROM lookup_class_to_course
    WHERE
        course_name_fk = 'STAT554' AND
        required_class = 1
""").head(60)

Unnamed: 0,lookup_ctc_id,program_name_fk,course_name_fk,required_class
0,415,biology-ms,STAT554,1
1,471,biostatistics-ms,STAT554,1
2,658,civil-infrastructure-engineering-ms,STAT554,1
3,695,civil-infrastructure-engineering-ms,STAT554,1
4,1261,data-analytics-engineering-ms,STAT554,1
5,1265,data-analytics-engineering-ms,STAT554,1
6,1300,data-analytics-engineering-ms,STAT554,1
7,1796,environmental-science-policy-ms,STAT554,1
8,1834,environmental-science-policy-ms,STAT554,1
9,2506,information-systems-ms,STAT554,1


In [50]:
# For every course flagged required_class = 1, count the distinct programs that
# reference it, ordered from the highest count down.
db.runQuery("""
    SELECT course_name_fk, COUNT(DISTINCT program_name_fk) AS prog_count
    FROM lookup_class_to_course
    WHERE required_class = 1
    GROUP BY course_name_fk
    ORDER BY prog_count DESC;
""")

Unnamed: 0,course_name_fk,prog_count
0,STAT554,8
1,GGS553,8
2,GGS579,7
3,STAT544,6
4,PUAD630,6
...,...,...
1197,ACCT624,1
1198,ACCT623,1
1199,ACCT621,1
1200,ACCT611,1


In [52]:
# For every program, count the distinct courses flagged required_class = 1,
# ordered from the highest count down.
db.runQuery("""
    SELECT program_name_fk, COUNT(DISTINCT course_name_fk) AS course_count
    FROM lookup_class_to_course
    WHERE required_class = 1
    GROUP BY program_name_fk
    ORDER BY course_count DESC;
""")

Unnamed: 0,program_name_fk,course_count
0,data-analytics-engineering-ms,142
1,information-systems-ms,130
2,environmental-science-policy-ms,102
3,biology-ms,92
4,civil-infrastructure-engineering-ms,80
...,...,...
69,naval-ship-design-gc,4
70,accounting-government-contracts-gc,4
71,public-management-gc,3
72,nonprofit-management-gc,3


In [6]:
# Preview the first rows of the enrollment data, transposed so that each
# column is shown as a row.
df_enrollment.head().transpose()

Unnamed: 0,0,1,2,3,4
rec_id,0,1,2,3,4
rec_ext_date,2017-05-01 00:00:00,2017-05-01 00:00:00,2017-05-01 00:00:00,2017-05-01 00:00:00,2017-05-01 00:00:00
file_name,Data/01_raw/EnrollmentData/CEC Graduate Regist...,Data/01_raw/EnrollmentData/CEC Graduate Regist...,Data/01_raw/EnrollmentData/CEC Graduate Regist...,Data/01_raw/EnrollmentData/CEC Graduate Regist...,Data/01_raw/EnrollmentData/CEC Graduate Regist...
file_index,5,6,10,11,12
reg_term_code,201770,201770,201770,201770,201770
reg_term_year,2017,2017,2017,2017,2017
reg_term_name,Fall,Fall,Fall,Fall,Fall
reg_term_desc,Fall 2017,Fall 2017,Fall 2017,Fall 2017,Fall 2017
stu_id,CEC3286,CEC3289,CEC865,CEC865,CEC901
stu_deg_level,Master,Master,Master,Master,Master


# Normalizing the data
The merged data is normalized to 3NF according to the EER diagram below.

![EER_Enrollment](../../Code/01_DataEngineering/EER_Enrollment.png)

In [3]:
# New SQLite database for the merged data; the path is relative to the
# current working directory.
merged_db_path = os.path.join("Data", "02_processed", "merged.db")
db_merged = ConnectDB(merged_db_path)

In [None]:
# # Creating the tables in the database
# conn.execute('''
#     CREATE TABLE IF NOT EXISTS