# A Comparison Analysis of CAO points for 2019, 2020, 2021
Data source: <http://www.cao.ie/index.php?page=points&p=2021>
***

In [25]:
# Imports libraries
import csv
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ensures plots are shown
%matplotlib inline

## Analysis of 2021 CAO points for level 8 courses

In [26]:
# Imports regular expressions module
import re

# HTTP requests
import requests as rq

# Dates and times
import datetime as dt

In [27]:
# Retrieves CAO points from the webserver.
# The timeout stops this cell from hanging forever if the server never answers.
response = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail loudly on a 4xx/5xx reply so an error page is never saved or parsed as data.
response.raise_for_status()

# Response 200 signifies a successful request/response.
response

<Response [200]>

## Saving the original data
***

In [28]:
# Capture the current local date and time once.
current_time = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS using the datetime format-spec syntax,
# which delegates to strftime with the same directive string.
current_str = f'{current_time:%Y%m%d_%H%M%S}'

In [29]:
# Builds the timestamped file path for the original HTML data.
# Embedding the timestamp in the filename makes each download easy to find
# and keeps the files sorted chronologically within the folder.
path = f'data/cao2021{current_str}.html'

In [30]:
# Saves the original html file.
# Create the target folder first so open() cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

# Write the raw bytes exactly as the server sent them. Going through
# response.text would decode with the server-declared charset and then
# re-encode with the platform default, so the saved file would no longer
# be a faithful copy of the original.
with open(path, 'wb') as f:
    f.write(response.content)

<br>

## Charset error on server

The server's response header declares the character encoding as:

```
    Content-Type: text/html; charset=iso-8859-1
``` 

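However, one line of the page contains the byte `\x96`, which has no printable character in iso-8859-1.

We therefore decode the page as cp1252 instead, an encoding that is almost identical to iso-8859-1 but assigns a printable character to `\x96` (an en dash).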

In [31]:
# Keep a record of the encoding currently set on the response before it is
# overridden, so the original value can still be inspected afterwards.
orig_encoding = response.encoding

# Override the encoding with cp1252; response.text will use this value
# from now on when decoding the body.
response.encoding = 'cp1252'

<br>

## Using regular expressions to extract desired data
***

In [32]:
# Compiles the regular expression once so it is not recompiled for every line.
# The r prefix makes this a raw string, so Python does not interpret the backslash.
# Pattern, read left to right:
#   ([A-Z]{2}[0-9]{3})  group 1: two capital letters followed by three digits
#                       (presumably the course code - confirm against the page)
#   two literal spaces
#   (.*)                group 2: any run of characters, matched greedily
#   ([0-9]{3})          group 3: exactly three digits (presumably the points)
#   (\*?)               group 4: zero or one literal asterisk (\* escapes the *)
#   ' *'                zero or more trailing spaces
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

In [37]:
# Path for CSV file
path = 'data/cao2021' + current_str + '.csv'

# Create the target folder first so open() cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

# Keeping count of the courses we are processing.
course_count = 0

# Opens the CSV file for writing.
# - encoding='utf-8' makes the output independent of the platform default.
# - newline='' is what the csv module requires so it controls line endings itself.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that contains a comma or a quote, so such a
    # field cannot silently shift the columns the way ','.join() would.
    writer = csv.writer(f, lineterminator='\n')
    # Loops through the response body line by line (each line is bytes).
    for line in response.iter_lines():
        # Decoding turns bytes into code points that can be matched and written as text.
        d_line = line.decode('cp1252')
        # Keep only lines that match re_course in full, i.e. the course lines.
        if re_course.fullmatch(d_line):
            # Adds one to the course count
            course_count += 1
            # Splits the line into fields wherever there are at least two spaces
            line_split = re.split('  +', d_line)
            # Writes the fields as one properly quoted CSV row
            writer.writerow(line_split)
    print(f"Line count: {course_count}")

Line count: 922


***
# End