# CAO Points Analysis

[CAO](http://www.cao.ie/index.php?page=points&p=2021)

***

In [3]:
# CSV reading and writing.
import csv

# dates and times
import datetime as dt

# In-memory binary streams.
import io

# Regular Expressions
import re

# Convenient HTTP Requests.
import requests as rq

In [4]:
# Fetch the CAO points page.
resp = rq.get('http://www2.cao.ie/points/l8.php')
# Fail fast on an HTTP error status so that an error page is never saved or
# parsed as if it were the points data.
resp.raise_for_status()
# Have a quick peek
resp

<Response [200]>

# Save original data set

In [5]:
# Capture the current date and time once.
now = dt.datetime.now()

# Render it as a compact YYYYMMDD_HHMMSS string.
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [6]:
# Build the timestamped file path for the original HTML data.
path = f'data/cao2021_{nowstr}.html'

### Error on Server
- The server says we should decode the response as per:
    Content-Type: text/html; charset=iso-8859-1
- However, one line uses the byte \x96, which is not a printable character in iso-8859-1.
- Therefore we decode with the similar encoding cp1252 instead.
- cp1252 is very similar, but it maps \x96 to a printable character (an en dash).

In [7]:
# Server uses the incorrect encoding
# Remember the encoding currently set on the response before overriding it.
original_encoding = resp.encoding
# Change to cp1252 so that resp.text is decoded with that codec.
resp.encoding = 'cp1252'

In [8]:
# Save the original HTML file.
# An explicit encoding is given so the write does not depend on the platform's
# default codec, which may be unable to encode every character in resp.text.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [9]:
# Display the encoding now set on the response.
resp.encoding

'cp1252'

# Use regular expressions to select the lines we want

In [10]:
# Compile Reg-Ex for Matching lines

# Pattern, left to right:
#   ([A-Z]{2}[0-9]{3})   two capital letters followed by three digits
#   (two literal spaces)
#   (.*)                 any run of characters (greedy)
#   ([0-9]{3})           three digits
#   (\*?)                an optional asterisk
#   ' *'                 any number of trailing spaces
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

In [11]:
# File path for the CSV data.
path = 'data/cao2021_csv_'+ nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the CSV file for writing.
# newline='' is what the csv module expects for its file objects, and the
# explicit encoding keeps the output independent of the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes any field that contains a comma or a quote character,
    # so such a field stays in a single column instead of splitting the row.
    writer = csv.writer(f, lineterminator='\n')

    # Loop through the lines of the response (iter_lines yields bytes).
    for line in resp.iter_lines():
        # Decode with cp1252 rather than the encoding declared by the server.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Write the substrings as one CSV row.
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


In [12]:

import pandas as pd
from collections import namedtuple
import requests, PyPDF2


In [13]:
# Fetch the lvl8_19.pdf points file.
# NOTE(review): this rebinds `resp`, which previously held the l8.php response;
# the next cell downloads the same PDF again via `requests.get`.
resp = rq.get('http://www2.cao.ie/points/lvl8_19.pdf')
# Have a quick peek
resp

<Response [200]>

In [14]:
# Download the 2019 points PDF.
url = 'http://www2.cao.ie/points/lvl8_19.pdf'
response = requests.get(url)
# Stop on an HTTP error status rather than handing an error page to the PDF reader.
response.raise_for_status()
my_raw_data = response.content

# Keep a copy of the downloaded bytes on disk.
# NOTE(review): the file holds PDF bytes despite its .csv name; the name is left
# unchanged in case something else reads it.
with open("cao2019.csv", 'wb') as my_data:
    my_data.write(my_raw_data)

# Parse from an in-memory stream of the same bytes, so that no file handle on
# disk is left open for the rest of the session.
open_pdf_file = io.BytesIO(my_raw_data)
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)


In [18]:
#print(read_pdf.getPage(1).extractText())

In [19]:
# Print the extracted text of every page, in page order.
for i in range(read_pdf.getNumPages()):
    pageinfo = read_pdf.getPage(i)
    print(pageinfo.extractText())

2019 Level 8 EOS
ADMISSION DATA 2019
End of Season
Level 8
The details  given are for general information only and do not form 
part of any contract. They are not intended for use in determining 
whether any individual applicant is or is not entitled to an offer of a 
higher education place
*
Not all on this points score were offered places
#
Test / Interview / Portfolio / Audition
AQA
All qualified applicants
Course Code
INSTITUTION and COURSE
EOS
Mid
 
Athlone Institute of Technology
AL801
Software Design with Virtual Reality and Gaming
304
328
AL802
Software Design with Cloud Computing
301
306
AL803
Software Design with Mobile Apps and Connected Devices
309
337
AL805
Network Management and Cloud Infrastructure
329
442
AL810
Quantity Surveying
307
349
AL820
Mechanical and Polymer Engineering
300
358
AL830
General Nursing
410
429
AL832
Psychiatric Nursing
387
403
AL836
Nutrition and Health Science
352
383
AL837
Sports Science with Exercise Physiology
351
392
AL838
Biotechnology
302
37

In [None]:
# Presumably intended for the 2019 course lines; the pattern is currently
# identical to the one compiled for `re_course`.
# NOTE(review): in the extracted PDF text printed above, the course code, title
# and points each appear on their own line, so this single-line pattern may
# not match that text as-is — confirm before use.
re_course2019 = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')