Note: This notebook merges in new data that may not yet be present in our enrollment database.

In [102]:
import json
import os
import re

import requests
#import pandas as pd
import sqlite3
import copy

from bs4 import BeautifulSoup

In [103]:
# Hand-maintained map: program code -> course codes treated as core (required) for that program.
# Program codes are compared with the "Banner Code" scraped from each catalog page, and course
# codes use the spaceless "<SUBJECT><NUMBER>" form that the scraped rows are checked against.
core_courses ={
    'VS-MENG-GCS' :
        ['CEIE524', 'CEIE525', 'CEIE531', 'CEIE575', 'CEIE605'],
    'EC-MS-AIT' :
        ['AIT524', 'AIT542', 'AIT580', 'AIT664'],
    'EC-MS-BIOE' :
        ['BENG520', 'BENG521', 'BENG541', 'BENG526', 'BENG537', 'BENG538', 'BENG501', 'BENG514', 'BENG575', 'BENG601', 'BENG602', 'BENG535', 'BENG560'],
    'EC-MS-BSTA' :
        ['STAT544','STAT554','STAT654','GCH712','BENG501','BINF630','BENG538','STAT560','STAT634','STAT798'],
    'EC-MS-CEIE' :
        ['CEIE601', 'CEIE605'],
    'EC-MS-CPE' :
        ['ECE505', 'ECE508', 'ECE511', 'ECE516', 'ECE527', 'ECE528', 'ECE531', 'ECE535', 'ECE542', 'ECE545',
         'ECE554', 'ECE556', 'ECE580', 'ECE611', 'ECE646', 'ECE799'],
    # Computer Forensics not available
    'EC-MS-CS' :
        ['CS550', 'CS580', 'CS584', 'CS540', 'SWE619', 'SWE621', 'CS555', 'CS571', 'ISA562', 'CS583', 'CS551'],
    'EC-MS-CYSE' :
        ['CYSE550', 'CYSE570', 'CYSE580', 'CYSE610', 'CYSE587', 'CYSE690', 'CYSE799'],
    'EC-MS-DAEN' :
        ['AIT580', 'BENG515', 'OR531', 'CS504', 'OR541', 'BENG554', 'CS584','STAT515',"STAT554", "DAEN690"],
    'EC-MS-DFOR' :
        ['DFOR510','DFOR660','DFOR661','DFOR663','DFOR664','DFOR670','DFOR671','DFOR672','DFOR790'],
    'VS-MS-CFRS' :
        ['DFOR510','DFOR660','DFOR661','DFOR663','DFOR664','DFOR670','DFOR671','DFOR672','DFOR790'],
    'EC-MS-ELEN' :
        ['ECE511', 'ECE521', 'ECE527', 'ECE528', 'ECE535', 'ECE539', 'ECE542', 'ECE552', 'ECE580', 
         'ECE584', 'ECE586', 'ECE587', 'ECE621', 'ECE630', 'ECE799'],
    'EC-MS-ISYS' :
        ['COMP502', 'CS550', 'INFS622', 'INFS580', 'INFS611'],
    'EC-MS-ISA' :
        ['ISA562', 'ISA656', 'INFS612', 'CS555'],
    'EC-MS-OPRS' :
        ['OR541', 'OR542', 'OR568', 'OR635'],
    'EC-MS-SWE' :
        ['SWE619', 'SWE621', 'SWE632', 'SWE637'],
    'EC-MS-STAT' :
        ['STAT544', 'STAT554', 'STAT634', 'STAT652', 'STAT654'],
    'EC-MS-SYST' :
        ['SYST505', 'SYST510', 'SYST520', 'SYST530', 'SYST611'],
    'EC-MS-TCOM' :
        ['TCOM500', 'TCOM514', 'TCOM515', 'TCOM535', 'TCOM570', 'TCOM610', 'TCOM750']
}

# Per program, a list of alternative groups: each inner list names courses of which
# at least ONE can be taken (presumably to satisfy that slot of the core list — confirm).
variables_one_courses = {
    'EC-MS-ISA' : 
        [['INFS612','CS555']],
    'EC-MS-DFOR' :
        [['DFOR663','DFOR664'],['DFOR670','DFOR671']],
    'VS-MS-CFRS' :
        [['DFOR663','DFOR664'],['DFOR670','DFOR671']],
    'EC-MS-BIOE' :
        [['BENG521','BENG541'],['BENG537','BENG538']],
    'EC-MS-BSTA' :
        [['GCH712','BENG501'],['BINF630','BENG538'],['STAT634','STAT798']]
}

# Per program, a single flat pool of course codes (not a list of groups).
# Presumably TWO courses from the pool must be taken, by analogy with
# variables_one_courses — TODO confirm.
# NOTE(review): core_courses['EC-MS-BIOE'] lists BENG535/BENG560 where this pool lists
# STAT535/STAT560 — confirm which subject prefix is intended.
variables_two_courses = {
    'EC-MS-BIOE':
        ['BENG501','BENG514','BENG575','BENG601', 'BENG602', 'STAT535', 'STAT560']
}

In [104]:
# Absolute location of the processed (intermediate) SQLite database.
# NOTE(review): the path is machine-specific (/home/jupyter/...); consider a configurable data root.
process_path = os.path.join(
    os.sep, "home", "jupyter", "Team-Prophecy", "Data", "02_processed", "intermediate.db"
)
print(process_path)

/home/jupyter/Team-Prophecy/Data/02_processed/intermediate.db


In [105]:
# Absolute location of the raw course-data SQLite database.
# NOTE(review): the path is machine-specific (/home/jupyter/...); consider a configurable data root.
raw_path = os.path.join(
    os.sep, "home", "jupyter", "Team-Prophecy", "Data", "01_raw", "CourseData", "raw.db"
)
print(raw_path)

/home/jupyter/Team-Prophecy/Data/01_raw/CourseData/raw.db


In [106]:
# Connection to the raw database at raw_path; used below for the programs and
# program_course_offerings tables. It stays open for the rest of the notebook.
raw_connection = sqlite3.connect(raw_path)

In [107]:
# Connection to the processed database at process_path; used below for the degree_level
# and optional_req_courses tables. It stays open for the rest of the notebook.
process_connection = sqlite3.connect(process_path)

In [108]:
# Root of the catalog site; the listing and course URLs below hang off it.
base_catalog = "https://catalog.gmu.edu"
# Program listing page; the URL fragment carries the filter used for the masters view.
base_masters_url = f"{base_catalog}/programs/#filter=.filter_23"
#base_phd_url = "https://catalog.gmu.edu/programs/#filter=.filter_23&.filter_28"
# Prefix for per-subject course pages (the subject code is appended by callers).
base_courses_url = f"{base_catalog}/courses/"

In [109]:
class CourseInfo:
    """One catalog course together with the program it was recorded under.

    The catalog distinguishes REQUIRED and RECOMMENDED prerequisites; only the
    required ones are stored here (the recommended attribute is disabled).
    """

    def __init__(self, code, program, credits, required_prereq_classes=""): #, recommended_classes=""):
        self.code, self.program = code, program
        # Coerced with int() so the stored credit count is always an integer.
        self.credits = int(credits)
        self.required_prereq_classes = required_prereq_classes
        #self.recommended_classes = recommended_classes

    def to_sql_insert(self):
        """Return the ``(code, credits)`` tuple for this course."""
        return (self.code, self.credits)

    def to_sql_insert_program_code_key(self):
        """Return the ``(program, code)`` tuple linking this course to its program."""
        return (self.program, self.code)

In [110]:
class ProgramInfo:
    """A degree program found on the catalog listing: its name, link and degree level."""

    def __init__(self, name, url, program_type):
        # program_type is the degree level, i.e. masters/phd
        self.name, self.url, self.program_type = name, url, program_type

    def to_sql_insert(self):
        """Return the ``(name, url, program_type)`` tuple for this program."""
        return (self.name, self.url, self.program_type)

In [111]:
# Download the catalog's program listing page (requires network access).
masters_html_listed = requests.get(base_masters_url)
print("Now, we process our masters...")
if masters_html_listed.status_code != 200:
    # Stop with a real exception: exit() is an interactive-shell helper and is not a
    # dependable way to abort a notebook run, while every later cell needs this page.
    raise RuntimeError(
        f"Could not retrieve list of masters classes (HTTP {masters_html_listed.status_code})"
    )
# Match using this: (\/colleges-schools\/.+?(?=\")) for masters
# str() of the bytes payload gives its repr text; the pattern below only relies on the
# "/colleges-schools/" prefix and the closing double quote of each link.
masters_content = str(masters_html_listed.content)
# Collect every "/colleges-schools/..." path up to the next double quote; the first
# match is skipped, as in the original (the reason is not documented).
masters_content_list = re.findall(r"(\/colleges-schools\/.+?(?=\"))", masters_content)[1:]
print("Masters Content List will now be filtered.")

Now, we process our masters...
Masters Content List will now be filtered.


In [112]:
# Keep only links whose text contains one of the degree markers below.
masters_content_list = [
    link
    for link in masters_content_list
    if any(marker in link for marker in ("-graduate-certificate", "-ms", "-phd"))
]

In [113]:
#print(masters_content_list)

We must now declare the values for our colleges, departments, and degree levels so that later tables can refer to them.

In [114]:
# Seed the degree_level table in the processed database with the three level names.
# NOTE(review): this is a plain INSERT, so re-running the cell adds the rows again
# unless the table enforces uniqueness — confirm against the schema.
process_connection.executemany("INSERT INTO degree_level(deg_name) VALUES(?)", [("gc",),("ms",),("phd",)])
process_connection.commit()

In [115]:
# Build {banner code -> ProgramInfo} for every masters program link found on the listing page.
master_link_dict = {}
# Now that we have all values listed, we should rework them so that they can be handled
for m_iter in masters_content_list:
    # Split the link on "/", drop the first and last pieces, and use the final
    # remaining segment as the program slug.
    cert_list = m_iter.split("/")[1:-1]
    name = cert_list[-1]
    if "graduate-certificate" in name or "phd" in name:
        # Certificates and PhD programs are not collected for now.
        continue
    # Named program_type rather than `type`, which would shadow the builtin for
    # every later cell of the notebook.
    program_type = "ms"
    try:
        program_page = requests.get(base_catalog + m_iter)
    except requests.RequestException as exc:
        # Network failures are reported instead of being silently mistaken for
        # "this page has no Banner Code".
        print(f"Skipping {m_iter}: {exc}")
        continue
    banner_match = re.search(
        r"Banner Code: ([A-Za-z]*-[A-Za-z]*-[A-Za-z]*)", str(program_page.content)
    )
    if banner_match is None:
        #If they cannot be added, it means that they do not have a formal degree associated with them
        continue
    name = banner_match.group(1)
    if name in master_link_dict:
        # The first link seen for a banner code wins.
        continue
    #school = cert_list[len(cert_list) - 2]
    master_link_dict[name] = ProgramInfo(name, m_iter, program_type)
# Rows ready for insertion into the programs table.
prog_info_list = [program.to_sql_insert() for program in master_link_dict.values()]

NOTE: Student Departments and College info will be merged later when we do enrollment

In [116]:
#print("\n".join([str(i[0]) for i in prog_info_list]))

In [117]:
# Store one (name, url, program_type) row per collected program in the raw database.
# NOTE(review): no column list is given, so this assumes the programs table has exactly
# these three columns in this order — confirm against the schema.
raw_connection.executemany("INSERT INTO programs VALUES(?,?,?)", prog_info_list)
raw_connection.commit()

In [118]:
# Shared state for the scraping cells below.
error_list = []  # not used by any of the cells visible here
masters_obj_dict = {} #This handles the extended masters list
# Live view over the ProgramInfo objects held in master_link_dict.
m_vals = master_link_dict.values()

In [119]:
# Heading keywords kept from an earlier classification approach; the loop below does not consult them.
l_req_words = ["requirements","required courses","core courses"]
l_ex_words = ["concentration","electives","msaccel","accelerated","dual","bam"]
l_acc_words = ["msaccel","accelerated","dual","bam"]
prog_list = set()               # programs for which at least one course link was found
prog_offerings_values = set()   # (program, course code, required=0) rows
for m_iter in m_vals:
    program_code = m_iter.name
    # Only programs with a hand-maintained core-course list are scraped.
    if program_code not in core_courses:
        continue
    requirements_page = requests.get(base_catalog + m_iter.url + "#requirementstext")
    if requirements_page.status_code != 200:
        print("Could not retrieve list of masters classes")
        continue
    page_soup = BeautifulSoup(requirements_page.content, 'html.parser')
    # Every course link that shares a parent element with an <h3> heading is recorded;
    # the set removes links reached through more than one heading.
    for heading in page_soup.find_all("h3"):
        for link in heading.parent.find_all("a", {"class": "bubblelink code"}):
            # The href is expected to look like "/search/?P=<subject>%20<number>".
            pieces = str(link["href"]).replace("/search/?P=", "").lower().split("%20")
            class_code = pieces[0].upper().strip() + pieces[1].strip()
            prog_list.add(program_code)
            # Everything starts as not-required (0); a later cell raises core courses to 1.
            prog_offerings_values.add((program_code, class_code, 0))

In [120]:
# Turn the scraped rows into a list, marking a row as required (1) when its course is in
# that program's core_courses entry; every other row keeps the flag it already had.
# An explicit .get() lookup replaces the former bare `except:` so that genuine errors
# are no longer swallowed.
prog_offerings_values = [
    (program, course, 1 if course in core_courses.get(program, ()) else required)
    for program, course, required in prog_offerings_values
]

In [121]:
# Store the (prog_code, crs_id, required) rows in the raw database.
# NOTE(review): plain INSERT — re-running the cell inserts the same rows again unless
# the table enforces uniqueness; confirm against the schema.
raw_connection.executemany("INSERT INTO program_course_offerings(prog_code, crs_id, required) VALUES(?,?,?)",prog_offerings_values)
raw_connection.commit()

In [122]:
# Collect the optional-requirement rows for every program that was scraped.
opt_req_values = set()
for p in prog_list:
    # Each entry of variables_one_courses[p] is a list of alternatives; it is stored as
    # its string form together with the number of alternatives in the group.
    for vo in variables_one_courses.get(p, []):
        opt_req_values.add((p, str(vo), len(vo)))
    # NOTE(review): variables_two_courses[p] is a flat list of course codes, so each `vo`
    # here is a single code and len(vo) is the length of that string, not a group size.
    # Confirm the intended row shape before relying on these rows.
    for vo in variables_two_courses.get(p, []):
        opt_req_values.add((p, str(vo), len(vo)))
# Sorted so the insertion order is reproducible from run to run.
opt_req_values = sorted(opt_req_values)
# Fix: insert the rows built above; the original passed prog_offerings_values here,
# so opt_req_values was computed but never written.
process_connection.executemany("INSERT INTO optional_req_courses VALUES(?,?,?)", opt_req_values)
process_connection.commit()

## DEPRECATED CODE ##

In [69]:
# Removes every row of program_course_offerings from the raw database.
# NOTE(review): this cell sits in the deprecated section with an out-of-sequence
# execution count; running it discards the rows inserted earlier in the notebook.
raw_connection.execute("DELETE FROM program_course_offerings")
raw_connection.commit()

In [52]:
# Read back the distinct offerings from the raw database, split by the required flag (1 vs 0).
# Despite the `_df` suffix these are plain lists of row tuples from fetchall(), not DataFrames.
required_df = raw_connection.execute("SELECT DISTINCT prog_code, crs_id, required FROM program_course_offerings WHERE required == 1").fetchall()
elective_df = raw_connection.execute("SELECT DISTINCT prog_code, crs_id, required FROM program_course_offerings WHERE required == 0").fetchall()
#,columns=["prog_code","crs_id","required"])

In [53]:
# Index both result sets as {(prog_code, crs_id): required}.
required_df = dict(((row[0], row[1]), row[2]) for row in required_df)
# Start from the elective rows and overlay the required ones, so a pair present in
# both ends up with the required flag.
correct_df = {
    **dict(((row[0], row[1]), row[2]) for row in elective_df),
    **required_df,
}
# Flatten back into (prog_code, crs_id, required) rows for insertion.
correct_df = list({(pair[0], pair[1], flag) for pair, flag in correct_df.items()})

In [54]:
# Replace the contents of program_course_offerings in the processed database with the
# merged rows: the old rows are deleted and committed first, then the new ones inserted.
process_connection.execute("DELETE FROM program_course_offerings")
process_connection.commit()
process_connection.executemany("INSERT INTO program_course_offerings(prog_code, crs_id, required) VALUES(?,?,?)",correct_df)
process_connection.commit()

In [4]:
def dontrun():
    """
    Deprecated scraping loop, kept only as quoted reference text; calling this function does nothing.

    l_req_words = ["requirements","required courses","core courses"]
    l_ex_words = ["concentration","electives","msaccel","accelerated","dual","bam"]
    l_acc_words = ["msaccel","accelerated","dual","bam"]
    for m_iter in m_vals:
        masters_url = base_catalog+m_iter.url+"#requirementstext"
        masters_courses_html = requests.get(masters_url)
        if masters_courses_html.status_code != 200:
            print("Could not retrieve list of masters classes")
            continue
        #masters_courses_types_html = str(masters_courses_html.content).replace("\\xc2\\xa0", " ").lower()
        be_assess = BeautifulSoup(masters_courses_html.content,'html.parser')
        #Either it's a required course, or it's not.
        for core_class in be_assess.find_all("h3"):
            core_text = core_class.get_text().lower()
            if any(r in core_text for r in l_req_words) \
                and (any(x in core_text for x in l_ex_words) == False):
                #print("======= REQUIRED =======")
                #print(f"classname = {m_iter.name}")
                #print(f"classtext = {core_class.get_text()}")
                #print("======= REQUIRED =======")
                core_list = core_class.parent.find_all("a",{"class":"bubblelink code"})
                #For any courses that we find, we must add all of them to our database.
                # This is IIF we haven't encountered them prior.. (Likely will need to include courses later)
                selected_classes, extend_map_list = scrape_course_info(m_iter.name,core_list,masters_obj_dict,1) #master_info,
                masters_obj_dict.update(extend_map_list)
                #core_class.parent.find_all("a", {"class": "bubblelink code"})[0]["onclick"]
            elif (any(a in core_text for a in l_acc_words) == False):
                #print("======= OPTIONAL =======")
                #print(f"classname = {m_iter.name}")
                #print(f"classtext = {core_class.get_text()}")
                #print("======= OPTIONAL =======")
                elective_list = core_class.parent.find_all("a", {"class": "bubblelink code"})
                selected_classes, extend_map_list = scrape_course_info(m_iter.name, elective_list, masters_obj_dict,0)  # master_info,
                masters_obj_dict.update(extend_map_list)
    """

In [5]:
def scrape_course_info(program,core_list,masters_obj_dict,required=0):
    """Record the courses linked from one catalog section as offerings of ``program``.

    Args:
        program: Program code written to the ``prog_code`` column.
        core_list: Iterable of link elements. Each must support ``["href"]`` and is
            expected to carry an href of the form ``/search/?P=<subject>%20<number>``.
        masters_obj_dict: Mapping of already-known courses; it is returned unchanged
            as the second element of the result.
        required: Value written to the ``required`` column. Defaults to 0, the value
            the function previously hard-coded.

    Returns:
        A ``(selected_course_info, masters_obj_dict)`` tuple, where
        ``selected_course_info`` lists the lower-cased ``<subject><number>`` codes in
        link order.

    Raises:
        IndexError: if an href does not split into at least two ``%20``-separated parts.

    Side effects:
        Inserts the distinct ``(program, CODE, required)`` rows into
        ``program_course_offerings`` through the module-level ``raw_connection`` and
        commits. Nothing is written when ``core_list`` is empty.
    """
    selected_course_info = []
    program_course_link = set()
    for course in core_list:
        course_w_no = str(course["href"]).replace("/search/?P=", "").lower().split("%20")
        subject, number = course_w_no[0].strip(), course_w_no[1].strip()
        # The returned list keeps the lower-cased form; the database row uses the
        # upper-cased subject.
        selected_course_info.append(subject + number)
        program_course_link.add((program, subject.upper() + number, required))

    # An earlier revision also fetched each subject's catalog page here to record
    # credits and required prerequisites; that code was disabled (string-quoted) and
    # has been dropped from this deprecated helper.

    # Insert once, after the loop: inserting inside it re-sent the whole accumulated
    # set on every iteration and therefore wrote duplicate rows.
    if program_course_link:
        raw_connection.executemany(
            "INSERT INTO program_course_offerings(prog_code, crs_id, required) VALUES(?,?,?)",
            list(program_course_link),
        )
        raw_connection.commit()
    return selected_course_info, masters_obj_dict

In [None]:
# Deprecated variant of the scraping loop that classifies each course while scraping
# and prints what it found. Heading keywords are kept but not consulted by the loop.
l_req_words = ["requirements","required courses","core courses"]
l_ex_words = ["concentration","electives","msaccel","accelerated","dual","bam"]
l_acc_words = ["msaccel","accelerated","dual","bam"]
prog_offerings_values = []
for m_iter in m_vals:
    # Only programs with a hand-maintained core-course list are scraped.
    if m_iter.name not in core_courses:
        continue
    masters_url = base_catalog+m_iter.url+"#requirementstext"
    masters_courses_html = requests.get(masters_url)
    if masters_courses_html.status_code != 200:
        print("Could not retrieve list of masters classes")
        continue
    #masters_courses_types_html = str(masters_courses_html.content).replace("\\xc2\\xa0", " ").lower()
    be_assess = BeautifulSoup(masters_courses_html.content,'html.parser')
    #Either it's a required course, or it's not.
    for core_class in be_assess.find_all("h3"):
        core_list = core_class.parent.find_all("a",{"class":"bubblelink code"})
        for course in core_list:
            # The href is expected to look like "/search/?P=<subject>%20<number>".
            course_w_no = str(course["href"]).replace("/search/?P=", "").lower().split("%20")
            course_w_no[0] = course_w_no[0].upper()
            code_chk = course_w_no[0].strip() + " " + course_w_no[1].strip()   # spaced form, for display
            class_code = course_w_no[0].strip() + course_w_no[1].strip()       # spaceless form, as stored
            # Fix: core_courses holds spaceless codes, so membership must be tested with
            # class_code; testing the spaced code_chk never matched and classed every
            # course as optional.
            is_required = class_code in core_courses[m_iter.name]
            label = "REQUIRED" if is_required else "OPTIONAL"
            print(f"======= {label} =======")
            print(f"classname = {m_iter.name}")
            print(f"classtext = {code_chk}")
            print(f"======= {label} =======")
            prog_offerings_values.append((m_iter.name, class_code, 1 if is_required else 0))
            print()