# UTS Course Data Script
The goal of this python script is to scrape the UTS handbook for degree, course and subject data. This is so that it can be stored in our database and the necessary algorithms can be run on it, so that we can recommend subjects automatically.

## Library Imports
* requests library: Uses HTTP to request the webpage from a server with a given URL (in this case https://www.handbook.uts.edu.au/).
* BeautifulSoup library: Parsers HTML files so that it can be searched and filtered.
* json library: Reading, writing and handling JSON files.

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

In [2]:
baseUrl = 'https://www.handbook.uts.edu.au/'

In [3]:
def getPageHTML(url):
    return BeautifulSoup(requests.get(url).content, 'html.parser')

In [None]:
def getDegrees(courseArea, graduateType):
    degrees = []
    degreesPage = BeautifulSoup(requests.get(f"{baseUrl}{courseArea}/{graduateType}/index.html").content, 'html.parser')
    tables = degreesPage.find(id='content').find(class_='ie-images').find_all('table')
    
    if len(tables) <= 1:
        return None
    else:
        for table in tables:
            tableCells = table.find_all("td")
            for cellNumber in range(1, len(tableCells)):
                isDegreeCode = tableCells[cellNumber-1].text.strip()[0] == "C"
                degrees.append({
                    'name': tableCells[cellNumber if isDegreeCode else cellNumber-1].text.strip(),
                    'degreeCode': (tableCells[cellNumber-1 if isDegreeCode else cellNumber].text).lower().strip(),
                    'subjects': {}
                })
                
    return degrees

In [None]:
indexPage = requests.get(baseUrl)

soup = BeautifulSoup(indexPage.content, 'html.parser')
# print(soup.find(class_='toc').find_all('a', href=True)[0]['href'])

# links to the pages for undergraduate and post graduate degrees in all course areas
courseAreas = {}

for courseArea in soup.find(class_='toc').find_all('a', href=True):
    abbreviation = courseArea['href'][1:4]
    
    courseAreas[courseArea.text] = {
        'abbreviation': abbreviation,
        'url': f"https://www.handbook.uts.edu.au{courseArea['href']}",
        'ug_degrees': getDegrees(abbreviation, 'ug'),
        'pg_degrees': getDegrees(abbreviation, 'pg')
    }

## Scraping subject data seperately
Since many degrees share common subjects, I will be scraping the subject data seperately so I can normalise the json. Which in turn will reduce the data redundancy of the database.

The callback parameter allows the user of this script to create a function which processes the degree pages in different ways. The callback parameter returns the processed data from the degree page and then the getDegreeData function saves and eventually returns it.

The numDegreePerCourseArea parameter allows the user of this script to set the number of degree pages it wants to access per course area. This was added as depending on your callback function, it could take a long time to execute as there are many different combinations of courseAreas, degrees, majors, subjects etc.

In [None]:
def subjectMajorIndex(text):
    containsMajor = text.find("major")
    containsMajors = text.find("majors")
    
    if containsMajor != -1:
        return containsMajor
    if containsMajors != -1:
        return containsMajors
    
    return False

def getDegreeData(callback, numDegreePerCourseArea="length"):
    subjects = {}
    majors = {}
    
    for key in courseAreas:
        courseArea = courseAreas[key]
        if courseArea['ug_degrees'] != None:
            courseAreaPortion = courseArea["ug_degrees"] 
            
            if numDegreePerCourseArea != "length" and numDegreePerCourseArea < len(courseArea["ug_degrees"]):
                courseAreaPortion = courseArea["ug_degrees"][:numDegreePerCourseArea]
            
            for index, degree in enumerate(courseAreaPortion):
    #             print("degree", degree)
                tables = getPageHTML(f"{baseUrl}courses/{degree['degreeCode']}.html").find_all('table')
                subjectProps = {
                    "currentYear": None,
                    "currentSem": None,
                    "major": None,
                    "degree": degree['degreeCode']
                }

    #             print(len(tables))
                if len(tables) > 0:
                    for table in tables:
                        for tableCell in table:
                            updateSubjectDetails = callback(subjects, majors, subjectProps, tableCell)
#                             print("u", updateSubjectDetails != None)
                            
                            if updateSubjectDetails != None:
                                updateVar = updateSubjectDetails["updateVar"]
                            
                                if updateVar == "major":
                                    major = updateSubjectDetails["data"]
                                    majorCode = major["code"]
                                    del major["code"]
                                    
                                    majors[majorCode] = major
                                elif updateVar == "currentYear":
                                    subjectProps["currentYear"] = updateSubjectDetails["data"]
                                elif updateVar == "currentSem":
                                    subjectProps["currentSem"] = updateSubjectDetails["data"]
                                elif updateVar == "subjects":
                                    subjects.update(updateSubjectDetails["data"])
                                elif updateVar == "subjects_degreeCodes":
                                    subjects[updateSubjectDetails["data"]]['degreeCodes'][degree['degreeCode']] = True
                                else:
                                    pass
                            
    return majors, subjects
    
def getSubjectData(subjects, majors, subjectProps, tableCell):
    try:
        tableCellText = tableCell.text.strip()

        if tableCellText[0:4] == "Year":
            return {
                "updateVar": "currentYear",
                "data": tableCellText
            }
        elif tableCellText[-7:] == "session":
            return {
                "updateVar": "currentSem",
                "data": tableCellText[:-7].strip()
            }
        elif tableCell.text[0:5].strip().isdigit():
            subjectCode = tableCell.text[0:6].strip()
            degree = subjectProps["degree"]
            
            if subjectCode not in subjects:
                return {
                    "updateVar": "subjects",
                    "data": {
                        f"{subjectCode}": {
                            'name': tableCell.text.strip()[5:-3].strip(),
                            'degreeCodes': {
                                f"{degree}": True
                            }
                        }
                    }
                }
            else:
                return {
                    "updateVar": "subjects_degreeCodes",
                    "data": subjectCode
                }
    except Exception as e: 
#         print(e)
        pass

No need to process things more than once. Read from the json file it was saved to previously.

In [None]:
# subjects = getDegreeData(getSubjectData)[1]
# print(subjects)

In [None]:
def getMajorsSubjects(code):
    print("d", f"{baseUrl}directory/{code.lower()}")
    majorPage = getPageHTML(f"{baseUrl}directory/{code.lower()}")
            
    isElective = False
    subjects = {
        "core": [],
        "elective": []
    }
    
    for table in majorPage.find_all("table"):
        for tableCell in table.find_all("td"):
            tableCellText = tableCell.text.strip()

            if tableCellText[0:5].isdigit():
                subjects["elective" if isElective else "core"].append(tableCellText[0:5])
            elif tableCellText[0:6] == "Select":
                isElective = True
            elif tableCellText[0:3] == "SMJ" or tableCellText[0:3] == "STM":
#                 print(tableCellText[:8].strip())
                subjects.update(getMajorsSubjects(tableCellText[:8].strip()))
            
    return subjects
    
def getMajors(subjects, majors, subjectProps, tableCell):
    try:
        tableCellText = tableCell.text.strip()

        if tableCellText[0:3] == "MAJ" or tableCellText[0:3] == "SMJ" or tableCellText[0:3] == "STM":
            majorCode = tableCellText[:8].strip()
            majorPage = getPageHTML(f"{baseUrl}directory/{majorCode.lower()}")
            subjects = getMajorsSubjects(majorCode)
#             print(majorPage.find("h1").text.strip()[9:])

            return {
                "updateVar": "major",
                "data": {
                    "code": majorCode,
                    "name": majorPage.find("h1").text.strip()[9:],
                    "type": "major" if tableCellText[0:3] == "MAJ" else "sub-major",
                    "degreeCode": subjectProps["degree"],
                    "subjects": subjects
                }
            }
    except Exception as e:
        pass

In [None]:
majors = getDegreeData(getMajors, 4)[0]
# print(len(courseAreas))

In [None]:
print(majors)

In [None]:
print(len(subjects))

In [None]:
print(courseAreas)

## Logic error with appending subjectCode to subjects with in courseAreas (Only run below cell if you want to attempt to store subject codes in courseAreas dictionary, not necessary now though)
Not sure why but it saves a single subject code 3 times (only for some subject codes).
- course code minus the last digit
- course code with line break (\n) at the start and minus the last digit
- the correct course code

Also there are duplicates.

Cannot find the course of this error so I will just be fixing it by doing a second parse of the dictionary which removes invalid course codes.

In [None]:
# for courseName in courseAreas:
#     if courseAreas[courseName]["ug_degrees"] != None:
#         for degree_index in range(len(courseAreas[courseName]["ug_degrees"])):
#             for year in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"]:
#                 for sem in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year]:
#                     for subject in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year][sem]:
#                         print(subject.isdigit(), subject)
#                         if len(subject) < 5 or not subject.isdigit():
#                             courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year][sem].remove(subject)

In [None]:
# print(courseAreas)

### Degree Data is written to JSON file

In [None]:
with open('courseData.json', 'w') as fp:
    json.dump(courseAreas, fp, indent = 4)

In [None]:
with open('courseMajorsData.json', 'w') as fp:
    json.dump(majors, fp, indent = 4)

In [None]:
with open('subjectData.json', 'w') as fp:
    json.dump(subjects, fp, indent = 4)

In [None]:
with open('majorData.json', 'w') as fp:
    json.dump(majors, fp, indent = 4)

## Scrape Data for each subject

In [5]:
def jsonToDict(fileName):
    with open(fileName) as json_file: 
        return json.load(json_file) 

In [10]:
subjects = jsonToDict("subjectData.json")
majors = jsonToDict("majorData.json")

In [18]:
print(subjects)

{'26100': {'name': 'Integrating Business Perspectives', 'degreeCodes': {'c10026': True, 'c10235': True, 'c09071': True, 'c09084': True, 'c10020': True, 'c10125': True, 'c10162': True, 'c10163': True, 'c10169': True, 'c10219': True, 'c10278': True, 'c10326': True, 'c10411': True, 'c10416': True, 'c10172': True}}, '22107': {'name': 'Accounting for Business Decisions A', 'degreeCodes': {'c10026': True, 'c10235': True, 'c09071': True, 'c09084': True, 'c10020': True, 'c10125': True, 'c10162': True, 'c10163': True, 'c10169': True, 'c10219': True, 'c10278': True, 'c10326': True, 'c10411': True, 'c10416': True}}, '23115': {'name': 'Economics for Business', 'degreeCodes': {'c10026': True, 'c10235': True, 'c10342': True, 'c09071': True, 'c09084': True, 'c10020': True, 'c10125': True, 'c10162': True, 'c10163': True, 'c10169': True, 'c10219': True, 'c10278': True, 'c10326': True, 'c10343': True, 'c10355': True, 'c10411': True, 'c10412': True, 'c10416': True, 'c10432': True, 'c10444': True}}, '2613

In [14]:
print(majors)

{'STM90273': {'name': 'Core subjects (Business)', 'type': 'sub-major', 'degreeCode': 'c09084', 'subjects': {'core': ['22107', '21129', '24108', '23115', '25300', '26134', '26100', '22207'], 'elective': []}}, 'MAJ08437': {'name': 'Accounting', 'type': 'major', 'degreeCode': 'c10235', 'subjects': {'core': ['22320', '22321', '22420', '79014', '22522', '79017', '22421', '22319'], 'elective': []}}, 'MAJ09401': {'name': 'Business Law', 'type': 'major', 'degreeCode': 'c10235', 'subjects': {'core': ['70110', '79013', '79014', '79018', '79032'], 'elective': ['79606', '79015', '79033', '79006', '79603', '79011', '79036', '79017']}}, 'MAJ09209': {'name': 'Economics', 'type': 'major', 'degreeCode': 'c10235', 'subjects': {'core': ['23566', '23567', '23568', '23571', '23580'], 'elective': ['23572', '23005', '23999', '23569', '23504', '23418', '23570', '23592', '23021', '23002', '23565', '23022']}}, 'MAJ08440': {'name': 'Finance', 'type': 'major', 'degreeCode': 'c10235', 'subjects': {'core': ['25556'

In [23]:
length = 0

for major in majors:
    length += len(majors[major]["subjects"]["core"])
    length += len(majors[major]["subjects"]["elective"])
    
print(length)

963


In [19]:
uniqueCount = 0

for major in majors:
    for subject in majors[major]["subjects"]["core"]:
        if subject not in subjects:
            uniqueCount += 1
            
    for subject in majors[major]["subjects"]["elective"]:
        if subject not in subjects:
            uniqueCount += 1
            
print(uniqueCount)

120


In [24]:
subjectInfo = {}

for major in majors:
    for subject in majors[major]["subjects"]["core"]:
        subjectInfo[subject] = getPageHTML(f"{baseUrl}subjects/details/{subject}.html").find(id="content")
            
    for subject in majors[major]["subjects"]["elective"]:
        subjectInfo[subject] = getPageHTML(f"{baseUrl}subjects/details/{subject}.html").find(id="content")

aprint(subjectInfo)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [25]:
with open('subjectInfoData.json', 'w') as fp:
    json.dump(subjectInfo, fp, indent = 4)

TypeError: Object of type Tag is not JSON serializable