# UTS Course Data Script
The goal of this python script is to scrape the UTS handbook for degree, course and subject data. This is so that it can be stored in our database and the necessary algorithms can be run on it, so that we can recommend subjects automatically.

## Library Imports
* requests library: Uses HTTP to request the webpage from a server with a given URL (in this case https://www.handbook.uts.edu.au/).
* BeautifulSoup library: Parsers HTML files so that it can be searched and filtered.
* json library: Reading, writing and handling JSON files.

In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
baseUrl = 'https://www.handbook.uts.edu.au/'

In [3]:
def getPageHTML(url):
    return BeautifulSoup(requests.get(url).content, 'html.parser')

In [4]:
def getDegrees(courseArea, graduateType):
    degrees = []
    degreesPage = BeautifulSoup(requests.get(f"{baseUrl}{courseArea}/{graduateType}/index.html").content, 'html.parser')
    tables = degreesPage.find(id='content').find(class_='ie-images').find_all('table')
    
    if len(tables) <= 1:
        return None
    else:
        for table in tables:
            tableCells = table.find_all("td")
            for cellNumber in range(1, len(tableCells)):
                isDegreeCode = tableCells[cellNumber-1].text.strip()[0] == "C"
                degrees.append({
                    'name': tableCells[cellNumber if isDegreeCode else cellNumber-1].text.strip(),
                    'degreeCode': (tableCells[cellNumber-1 if isDegreeCode else cellNumber].text).lower().strip(),
                    'subjects': {}
                })
                
    return degrees

In [5]:
indexPage = requests.get(baseUrl)

soup = BeautifulSoup(indexPage.content, 'html.parser')
# print(soup.find(class_='toc').find_all('a', href=True)[0]['href'])

# links to the pages for undergraduate and post graduate degrees in all course areas
courseAreas = {}

for courseArea in soup.find(class_='toc').find_all('a', href=True):
    abbreviation = courseArea['href'][1:4]
    
    courseAreas[courseArea.text] = {
        'abbreviation': abbreviation,
        'url': f"https://www.handbook.uts.edu.au{courseArea['href']}",
        'ug_degrees': getDegrees(abbreviation, 'ug'),
        'pg_degrees': getDegrees(abbreviation, 'pg')
    }

## Scraping subject data seperately
Since many degrees share common subjects, I will be scraping the subject data seperately so I can normalise the json. Which in turn will reduce the data redundancy of the database.

In [26]:
def subjectMajorIndex(text):
    containsMajor = text.find("major")
    containsMajors = text.find("majors")
    
    if containsMajor != -1:
        return containsMajor
    if containsMajors != -1:
        return containsMajors
    
    return False

def getDegreeData(callback):
    subjects = {}
    majors = {}
    
    for key in courseAreas:
        courseArea = courseAreas[key]
        if courseArea['ug_degrees'] != None:
            for index, degree in enumerate(courseArea['ug_degrees']):
    #             print("degree", degree)
                tables = getPageHTML(f"{baseUrl}courses/{degree['degreeCode']}.html").find_all('table')
                subjectProps = {
                    "currentYear": None,
                    "currentSem": None,
                    "major": None,
                    "degree": degree['degreeCode']
                }

    #             print(len(tables))
                if len(tables) > 0:
                    for table in tables:
                        for tableCell in table:
                            updateSubjectDetails = callback(subjects, majors, subjectProps, tableCell)
#                             print(updateSubjectDetails)
                            
                            if updateSubjectDetails != None:
                                updateVar = updateSubjectDetails["updateVar"]
                            
                                if updateVar == "major":
#                                     major = updateSubjectDetails["data"]
#                                     subjectProps["major"] = major
#                                     majors[major] = degree['degreeCode']
                                    majors.update(updateSubjectDetails["data"])
                                elif updateVar == "currentYear":
                                    subjectProps["currentYear"] = updateSubjectDetails["data"]
                                elif updateVar == "currentSem":
                                    subjectProps["currentSem"] = updateSubjectDetails["data"]
                                elif updateVar == "subjects":
                                    subjects.update(updateSubjectDetails["data"])
                                elif updateVar == "subjects_degreeCodes":
                                    subjects[updateSubjectDetails["data"]]['degreeCodes'][degree['degreeCode']] = True
                                else:
                                    pass
                            
    return majors, subjects
    
def getSubjectData(subjects, majors, subjectProps, tableCell):
    try:
        tableCellText = tableCell.text.strip()
        majorIndex = subjectMajorIndex(tableCellText)

        if majorIndex != -1 and "commencing" in tableCellText:
            return {
                "updateVar": "major",
                "data": tableCellText[:majorIndex].strip()
            }
        elif tableCellText[0:4] == "Year":
            return {
                "updateVar": "currentYear",
                "data": tableCellText
            }
        elif tableCellText[-7:] == "session":
            return {
                "updateVar": "currentSem",
                "data": tableCellText[:-7].strip()
            }
        elif tableCell.text[0:5].strip().isdigit():
            subjectCode = tableCell.text[0:6].strip()
            degree = subjectProps["degree"]
            
            if subjectCode not in subjects:
                return {
                    "updateVar": "subjects",
                    "data": {
                        f"{subjectCode}": {
                            'name': tableCell.text.strip()[5:-3].strip(),
                            'degreeCodes': {
                                f"{degree}": True
                            }
                        }
                    }
                }
            else:
                return {
                    "updateVar": "subjects_degreeCodes",
                    "data": subjectCode
                }
    except Exception as e: 
#         print(e)
        pass

In [23]:
subjects = getDegreeData(getSubjectData)[0]

In [27]:
def getMajorData(subjects, majors, subjectProps, tableCell):
    try:
        tableCellText = tableCell.text.strip()
        
        if tableCellText[0:3] == "MAJ":
            majorCode = tableCellText[:8].strip()
            majorPage = getPageHTML(f"{baseUrl}directory/{majorCode.lower()}")
            
#             print(majorPage.find("h1"))
            
            return {
                "updateVar": "major",
                "data": {
                    f"{majorCode}": {
                        "name": majorPage.find("h1")[9:],
                        "degreeCode": subjectProps["degree"]
                    }
                }
            }
    except:
        pass
    
majors = getDegreeData(getMajorData)[0]

In [22]:
print(majors)

{'MAJ08437': 'c10278', 'MAJ09401': 'c10235', 'MAJ09209': 'c10278', 'MAJ08440': 'c10278', 'MAJ08446': 'c10278', 'MAJ02041': 'c10235', 'MAJ08442': 'c10278', 'MAJ08438': 'c10278', 'MAJ08441': 'c10278', 'MAJ08981': 'c10278', 'MAJ09402': 'c10326', 'MAJ08060': 'c10326', 'MAJ08046': 'c10326', 'MAJ08063': 'c10326', 'MAJ08443': 'c10235', 'MAJ08445': 'c10235', 'MAJ08980': 'c10432', 'MAJ03473': 'c09128', 'MAJ03025': 'c09074', 'MAJ03517': 'c09074', 'MAJ03028': 'c09074', 'MAJ03030': 'c09074', 'MAJ03505': 'c09074', 'MAJ03522': 'c09074', 'MAJ03541': 'c09074', 'MAJ03525': 'c09074', 'MAJ01079': 'c10162', 'MAJ01080': 'c10162', 'MAJ01085': 'c10162', 'MAJ01114': 'c10162', 'MAJ01115': 'c10162', 'MAJ01082': 'c10162', 'MAJ01112': 'c10162', 'MAJ01116': 'c10162', 'MAJ01168': 'c10162', 'MAJ01100': 'c10422', 'MAJ01101': 'c10422', 'MAJ01103': 'c10422', 'MAJ01104': 'c10422', 'MAJ01110': 'c10422', 'MAJ01105': 'c10422', 'MAJ01102': 'c10422', 'MAJ01111': 'c10422', 'MAJ01126': 'c10422', 'MAJ10050': 'c10379', 'MAJ10047

In [9]:
print(len(subjects))

967


In [10]:
print(courseAreas)

{'Analytics and Data Science': {'abbreviation': 'ads', 'url': 'https://www.handbook.uts.edu.au/ads/index.html', 'ug_degrees': None, 'pg_degrees': [{'name': 'Doctor of Philosophy', 'degreeCode': 'c02062', 'subjects': {}}, {'name': 'Doctor of Philosophy', 'degreeCode': 'c03064', 'subjects': {}}, {'name': 'Master of Learning Analytics (Research)', 'degreeCode': 'c03064', 'subjects': {}}, {'name': 'Master of Data Science and Innovation', 'degreeCode': 'c04372', 'subjects': {}}, {'name': 'Master of Data Science and Innovation', 'degreeCode': 'c06124', 'subjects': {}}, {'name': 'Graduate Diploma in Data Science and Innovation', 'degreeCode': 'c06124', 'subjects': {}}, {'name': 'Graduate Diploma in Data Science and Innovation', 'degreeCode': 'c11274', 'subjects': {}}, {'name': 'Graduate Certificate in Data Science and Innovation', 'degreeCode': 'c11274', 'subjects': {}}]}, 'Business': {'abbreviation': 'bus', 'url': 'https://www.handbook.uts.edu.au/bus/index.html', 'ug_degrees': [{'name': 'Bac

## Logic error with appending subjectCode to subjects with in courseAreas (Only run below cell if you want to attempt to store subject codes in courseAreas dictionary, not necessary now though)
Not sure why but it saves a single subject code 3 times (only for some subject codes).
- course code minus the last digit
- course code with line break (\n) at the start and minus the last digit
- the correct course code

Also there are duplicates.

Cannot find the course of this error so I will just be fixing it by doing a second parse of the dictionary which removes invalid course codes.

In [11]:
# for courseName in courseAreas:
#     if courseAreas[courseName]["ug_degrees"] != None:
#         for degree_index in range(len(courseAreas[courseName]["ug_degrees"])):
#             for year in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"]:
#                 for sem in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year]:
#                     for subject in courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year][sem]:
#                         print(subject.isdigit(), subject)
#                         if len(subject) < 5 or not subject.isdigit():
#                             courseAreas[courseName]["ug_degrees"][degree_index]["subjects"][year][sem].remove(subject)

In [12]:
# print(courseAreas)

### Degree Data is written to JSON file

In [13]:
with open('courseData.json', 'w') as fp:
    json.dump(courseAreas, fp, indent = 4)

In [14]:
with open('courseMajorsData.json', 'w') as fp:
    json.dump(majors, fp, indent = 4)

In [15]:
with open('subjectData.json', 'w') as fp:
    json.dump(subjects, fp, indent = 4)

## Scrape Data for each subject