# CAO Points Analysis

http://www.cao.ie/index.php?page=points&p=2021

***

In [1]:
# CSV writing with correct quoting of fields.
import csv

#Date time module in python for manipulating dates
import datetime as dt

# Creating directories on disk.
import os

# Regular expressions.
import re

# Convenient HTTP requests.
import requests as rq

In [2]:
# Fetch the CAO points URL.
# A timeout is passed so that an unresponsive server raises an exception
# instead of leaving this cell (and a "Run All") hanging indefinitely.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)


# Have a quick peek. 200 means OK.
resp


<Response [200]>

### Save a copy of the original HTML file and timestamp it
***

In [3]:
# Timestamp of "now", rendered as YYYYMMDD_HHMMSS so each saved copy gets a unique name.
timeStamp = f"{dt.datetime.now():%Y%m%d_%H%M%S}"
# Folder the snapshot is saved into.
path = "data/"
# File name of the snapshot: fixed prefix, then the timestamp, then the extension.
filename = f"Cao_2021_{timeStamp}.html"
# Show the full relative location of the file that will be written.
print(f"{path}{filename}")

data/Cao_2021_20211025_210500.html


In [4]:
# Make sure the target folder exists first; without this the open() below
# raises FileNotFoundError on a fresh checkout that has no data/ folder yet.
os.makedirs(path, exist_ok=True)

# Write the response text to the timestamped save file.
with open(path+filename,"w", encoding='utf-8') as savefile:
    savefile.write(resp.text)


<br>

### Build and compile a regex search pattern to extract the lines I want
***

In [5]:
# Compile the regular expression for matching course lines.
# Adapted from Ian's original pattern to better suit this notebook.
#
# Capture groups, in order:
#   1: two uppercase letters followed by three digits
#   2: exactly two whitespace characters
#   3: exactly 50 characters of any kind (a fixed-width field)
#   4: exactly three whitespace characters
#   5: (optional) optional '#', three digits, optional '*'
#   6: (optional) exactly five whitespace characters
#   7: (optional) optional '#', three digits, optional '#'
#
# Note: '{3}?' is a lazy quantifier on a fixed count, so it still matches
# exactly three digits.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(\s{2})(.{50})(\s{3})(\#?[0-9]{3}?\*?)?(\s{5})?(\#?[0-9]{3}?\#?)?')

#### What the `r` prefix means for regex patterns in Python

Prefixing a string literal with `r` makes it a raw string: backslashes `\` are treated literally rather than as Python escape characters, so they reach the regex compiler unchanged.
https://stackoverflow.com/questions/21104476/what-does-the-r-in-pythons-re-compiler-pattern-flags-mean

<br>

#### Loop through the lines of the response

***

In [6]:
# Keep track of how many courses we process.
no_lines = 0

# Open the CSV file for writing. newline='' is what the csv module expects
# for files it writes, so that it alone controls the line endings.
with open('data/cao-2021.csv', 'w', encoding='utf-8', newline='') as csvfile:
    # Use csv.writer rather than joining fields with commas by hand: it quotes
    # any field that itself contains a comma (or a quote), so such a course
    # title no longer spills into extra columns.
    writer = csv.writer(csvfile, lineterminator='\n')
    # Header row (column names).
    writer.writerow(['Course', 'Desc', 'Round1', 'Round2'])
    # Loop through the lines in the response.
    for line in resp.iter_lines():
        # Decode each raw line once and reuse the text below.
        text = line.decode('Windows-1252')
        # Match only the lines representing courses.
        course = re_course.match(text)
        if course is None:
            continue
        # Add one to the lines counter.
        no_lines += 1
        # Pick out the relevant parts of the matched line. Groups 5 and 7 are
        # optional in the pattern and are None when they did not take part in
        # the match, so they are replaced with empty strings.
        code, desc, round1, round2 = course.group(1, 3, 5, 7)
        # Anything on the line after the matched portion is kept at the end of
        # the last column, exactly as the earlier regex substitution left it.
        tail = text[course.end():]
        writer.writerow([code, desc, round1 or '', (round2 or '') + tail])

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


***

## End