# CAO Points Analysis

[CAO](http://www.cao.ie/index.php?page=points&p=2021)

***

In [2]:
# Writing CSV files.
import csv

# dates and times
import datetime as dt

# Regular Expressions
import re

# Convenient HTTP Requests.
import requests as rq

In [3]:
# Fetch the CAO 2021 level 8 points page.
# The timeout stops this cell from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# Fail here on an HTTP error status instead of saving and parsing an error page.
resp.raise_for_status()
# Have a quick peek
resp

<Response [200]>

# Save original data set

In [4]:
# Record the moment this notebook run fetched the data.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS so it can be embedded in file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [5]:
# Timestamped location for the untouched HTML download.
path = f'data/cao2021_{nowstr}.html'

### Error on Server
- The server says we should decode the page as per:
    Content-Type: text/html; charset=iso-8859-1
- However, one line uses the byte \x96, which has no printable character in iso-8859-1.
- Therefore we decode with the similar standard cp1252 instead.
- cp1252 is almost identical to iso-8859-1 but maps \x96 to a printable character (an en dash).

In [6]:
# Remember the encoding requests picked for the response before overriding it.
# This must run before the assignment below, which overwrites that value.
original_encoding = resp.encoding
# Decode the body as cp1252 from here on (affects resp.text).
resp.encoding = 'cp1252'

In [7]:
# Save the original HTML file.
# The encoding is given explicitly so the write does not depend on the
# platform default, which may be unable to represent every character in the
# decoded page (e.g. the en dash that the cp1252 byte \x96 decodes to).
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [8]:
# Display the encoding currently set on the response.
resp.encoding

'cp1252'

# Use regular expressions to select the lines we want

In [9]:
# Pattern that a line must match in full to be treated as a course line.
# Adjacent raw-string fragments are concatenated into a single pattern.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: two capital letters followed by three digits
    r'  '                  # two-space separator
    r'(.*)'                # group 2: everything up to the final three digits (greedy)
    r'([0-9]{3})'          # group 3: three digits
    r'(\*?)'               # group 4: optional asterisk
    r' *'                  # optional trailing spaces
)

In [10]:
# File path for the CSV output.
# Note: this rebinds `path`, which until now pointed at the saved HTML file.
path = 'data/cao2021_csv_' + nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the CSV file for writing. The encoding is explicit so the write does
# not depend on the platform default.
with open(path, 'w', encoding='utf-8') as f:
    # csv.writer quotes any field that contains a comma or a quote character,
    # so such a field can no longer break the row into the wrong number of
    # columns. Fields without such characters are written exactly as before.
    # lineterminator='\n' keeps the line ending the previous f.write(... + '\n') used.
    writer = csv.writer(f, lineterminator='\n')

    # Loop through the lines of the response (yielded as bytes).
    for line in resp.iter_lines():
        # Decode with cp1252 rather than the iso-8859-1 the server declares.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Write the substrings as one CSV row.
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


In [11]:

import pandas as pd
from collections import namedtuple
import requests, PyPDF2


In [12]:
# Fetch the CAO 2019 level 8 points PDF.
# Note: this rebinds `resp`, which earlier cells used for the 2021 HTML page.
# The timeout stops this cell from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/lvl8_19.pdf', timeout=30)
# Fail here on an HTTP error status instead of carrying on with an error page.
resp.raise_for_status()
# Have a quick peek
resp

<Response [200]>

In [50]:
# Download the 2019 level 8 points PDF.
url = 'http://www2.cao.ie/points/lvl8_19.pdf'
# The timeout stops this cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error status so an error page is never written to disk
# and handed to the PDF reader.
response.raise_for_status()
my_raw_data = response.content

# Write the raw bytes to disk.
# NOTE(review): the file holds PDF bytes despite its .csv extension; the name
# is kept as-is in case other cells refer to it - consider renaming to .pdf.
with open("cao2019.csv", 'wb') as my_data:
    my_data.write(my_raw_data)

# Re-open the saved file in binary mode and hand the handle to the PDF reader.
# NOTE(review): the handle is never closed; presumably the reader needs it to
# stay open while later cells call getPage - confirm before adding a close().
open_pdf_file = open("cao2019.csv", 'rb')
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)


In [64]:
# Print the text extracted from the page at index 1 of the PDF.
print(read_pdf.getPage(1).extractText())

2019 Level 8 EOS
CW258
Cybercrime and IT Security
300
328
CW268
Computing in Interactive Digital Art and Design
274
321
CW438
Construction (options)
271
308
CW468
Architectural Technology
252
290
CW478
Civil Engineering
348
383
CW548
Mechanical Engineering
310
351
CW558
Electronic Systems
279
338
CW568
Aerospace Engineering
366
422
CW578
TV and Media Production
327
361
CW708
Law - LLB
298
328
CW728
Product Design Innovation
286
355
CW748
Early Childhood Education and Care
265
338
CW758
Applied Social Studies - Professional Social Care
290
319
CW808
Media and Public Relations
248
307
CW848
Digital Marketing with Analytics
279
307
CW858
Sports Management and Coaching (options, portfolio)
#700
700
CW908
Business (options)
260
307
CW938
Business with Law
270
288
CW948
Accounting
301
408
 
Carlow College, St. Patrick`s
 
PC405
Social, Political and Community Studies
251
296
PC410
Arts and Humanities (fulfils Teaching Council requirements)
203
289
PC411
English and History (fulfils Teaching 

In [None]:
# Start the counter at zero.
i = 0

In [61]:
# Decrypt with an empty password first if the PDF reports itself as encrypted.
if read_pdf.isEncrypted:
    read_pdf.decrypt("")

# Print the text of the page at index 1 regardless of encryption. Previously
# the encrypted branch printed page 0 while the other branch printed page 1,
# so the page shown depended on whether the file was encrypted.
print(read_pdf.getPage(1).extractText())


2019 Level 8 EOS
CW258
Cybercrime and IT Security
300
328
CW268
Computing in Interactive Digital Art and Design
274
321
CW438
Construction (options)
271
308
CW468
Architectural Technology
252
290
CW478
Civil Engineering
348
383
CW548
Mechanical Engineering
310
351
CW558
Electronic Systems
279
338
CW568
Aerospace Engineering
366
422
CW578
TV and Media Production
327
361
CW708
Law - LLB
298
328
CW728
Product Design Innovation
286
355
CW748
Early Childhood Education and Care
265
338
CW758
Applied Social Studies - Professional Social Care
290
319
CW808
Media and Public Relations
248
307
CW848
Digital Marketing with Analytics
279
307
CW858
Sports Management and Coaching (options, portfolio)
#700
700
CW908
Business (options)
260
307
CW938
Business with Law
270
288
CW948
Accounting
301
408
 
Carlow College, St. Patrick`s
 
PC405
Social, Political and Community Studies
251
296
PC410
Arts and Humanities (fulfils Teaching Council requirements)
203
289
PC411
English and History (fulfils Teaching 

In [None]:
# Pattern for 2019 course lines, built from adjacent raw-string fragments.
# NOTE(review): the extracted PDF text printed above puts the code, title and
# points on separate lines, so this single-line pattern may need adjusting -
# confirm against how it is applied.
re_course2019 = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: two capital letters followed by three digits
    r'  '                  # two-space separator
    r'(.*)'                # group 2: everything up to the final three digits (greedy)
    r'([0-9]{3})'          # group 3: three digits
    r'(\*?)'               # group 4: optional asterisk
    r' *'                  # optional trailing spaces
)