# A Comparison Analysis of CAO points for 2019, 2020, 2021
http://www.cao.ie/index.php?page=points&p=2021
***

<br>

## Import Libraries

***

<br>


In [29]:
# Imports libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
# Regular expressions
import re

# HTTP requests
import requests as rq

# Dates and times
import datetime as dt

# Opening URLs 
import urllib.request as urlrq

# PDF conversion
import camelot

# Ensures plots are shown
%matplotlib inline

In [30]:
# Gets the current date and time
cur_time = dt.datetime.now()

# Format as a string
current_time = cur_time.strftime('%Y%m%d_%H%M%S')

<br>

# 2021 CAO data

http://www.cao.ie/index.php?page=points&p=2021&bb=points

***

<br>

<br>

## Retrieve data from webserver

In [31]:
# Retrieves CAO points from the webserver.
response = rq.get('http://www2.cao.ie/points/l8.php')

# Response 200 signifies a successful request/response.
response

<Response [200]>

<br>

## Save original data

In [32]:
# Creates a file path for the original data
path = 'data/cao2021' + current_time + '.html'# Note the importance of the filename and how it
                                              # will be easy to find in folders/sorted appropriately

In [33]:
# Saves the original html file.
with open(path, 'w') as f:
    f.write(response.text)

<br>

## Charset error on server

***

<br>

Technically, server states decoding as:

```
    Content-Type: text/html; charset=iso-8859-1.
``` 

However, one line uses \x96 which isn't defined in iso-8859-1. 

Therefore, we use the similar decoding standard cp1252, which is very similar but includes \x96. 

In [34]:
# Server uses incorrect encoding
orig_encoding = response.encoding

# Corrected encoding to cp1252
response.encoding = 'cp1252'

<br>

## Using regular expressions to extract desired data
***


In [35]:
# Compiles the regular expression for matching lines so it doesn't recompile repeatedly.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)')  # 'r' python treats string as raw string and doesnt evaluate back slashes
                                                                    # \ {character} means we want the literal character ie., *
                                                                    # ? means 0 or 1 of 
                                                                    # + means 1 or more of 

In [36]:
# Path for CSV file
path = 'data/cao2021' + current_time + '.csv'

# Keeping count of the courses we are processing.
course_count = 0

# Opens the CSV file for wrting.
with open(path, 'w') as f:
    # Loops through and prints data from response line by line.
    for line in response.iter_lines():
        # Decoding turns bytes into code points and those code points can be displayed on the screen
        d_line = line.decode('cp1252')
        # Match the string specified in re_course, returning only the courses from the response
        if re_course.fullmatch(d_line):
            # Adds one to the course count
            course_count += 1
            # Use regular expressions splits a string where there are at least two spaces
            line_split = re.split('  +', d_line)
            # Course code.
            course_code = d_line[:5]
            # Course title.
            course_title = d_line[7:57]
            # Points.
            course_points = d_line[60:]
            #{course_code}''{course_title}'
            print(f"'{course_points}'")
            # Rejoin the substrings with commas
            f.write(','.join(line_split) + '\n')
    print(f"Line count: {course_count}")

'300      '
'313      '
'350      '
'321      '
'328      '
'327      '
'451*     444'
'440*     431'
'356      '
'346      '
'357      '
'324      '
'325      '
'346      '
'477      476*'
'338      '
'306      '
'297      '
'309      '
'302      '
'336      '
'300      299'
'309      '
'304      '
'308      '
'301      '
'#575     '
'#747     '
'306      '
'484*     467*'
'307      '
'260      '
'#700     '
'292      '
'250      '
'270      '
'270      '
'266      '
'307      '
'430      423'
'388      '
'451      '
'272      '
'295      '
'293      '
'292      '
'291      '
'260      251'
'291      283'
'465      '
'330      328'
'280      '
'371      359'
'318      '
'292      '
'246      '
'290      '
'360      358'
'247      '
'269      '
'#700     '
'272      '
'270      '
'319      '
'263      235'
'262      230'
'243      224'
'443      '
'431      '
'434      '
'396      '
'336      '
'390      '
'365      '
'#904     #904'
'#1028    '
'#525     '
'350      '
'#450     '
'#40

<br>

## 2020 CAO data

http://www.cao.ie/index.php?page=points&p=2020
***


In [37]:
# Create a file path for the original data set
path = 'CAO2020' + current_time + '.xlxs'

# Opening URL
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)


('CAO202020211112_181626.xlxs', <http.client.HTTPMessage at 0x116497f40>)

<br>

## Load data with panda
https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
***

<br>

In [38]:
# Load and parse spread sheet
df_20 = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows=10)
df_20

SyntaxError: invalid syntax (__init__.py, line 1187)

In [None]:
# Delete irrelevant columns
# df_20.drop(columns=['COURSE TITLE', 
#                     'CATEGORY (i.e.ISCED description)', 
#                     'R1 Random *', 
#                     'R2 Random*', 
#                     'EOS', 
#                     'EOS Random *', 
#                     'EOS Mid-point', 
#                     'HEI', 
#                     'avp', 
#                     'v', 
#                     'Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6', 
#                     'Column7', 'Column8'])


Unnamed: 0,COURSE CODE2,R1 POINTS,R2 POINTS,LEVEL,Test/Interview #
0,AC120,209,,8,
1,AC137,252,,8,
2,AD101,#+matric,,8,#
3,AD102,#+matric,,8,#
4,AD103,#+matric,,8,#
...,...,...,...,...,...
1459,WD208,188,,7,
1460,WD210,279,,8,
1461,WD211,271,,8,
1462,WD212,270,,8,


In [None]:
# Checking random row to ensure data integrity.
df_20.iloc[650]

CATEGORY (i.e.ISCED description)                                             Arts
COURSE TITLE                        Arts (Drama, Theatre and Performance Studies)
COURSE CODE2                                                                GY118
R1 POINTS                                                                     451
R1 Random *                                                                   NaN
R2 POINTS                                                                     NaN
R2 Random*                                                                    NaN
EOS                                                                           451
EOS Random *                                                                  NaN
EOS Mid-point                                                                 492
LEVEL                                                                           8
HEI                                        National University of Ireland, Galway
Test/Interview #

In [None]:
# Another random check
df_20.iloc[-1]


CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [None]:
# Saving data to CSV file.
path = 'data/cao2020_' + current_time + '.csv'
df_20.to_csv(path)


<br>

## 2019 CAO data
 
http://www2.cao.ie/points/lvl8_19.pdf

<br>

***

<br>

## Convert pdf to csv

<br>

***

What did I do to prepare data before reading in csv file below?

Copy and pasted pdf in Preview, pasted into a Word document so it formats nicely.

Then copied the data from the Word document to a csv file, while deleting preamble and unnecessary data such as preamble, page numbers and full Higher Education INstitution's names while keeping the course code and points etc. 

In [None]:
df19 = pd.read_csv('data/cao2019_20211101_213010.csv', sep='\t')
df19

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
0,AL801,Software Design with Virtual Reality and Gaming,304,328
1,AL802,Software Design with Cloud Computing,301,306
2,AL803,Software Design with Mobile Apps and Connected...,309,337
3,AL805,Network Management and Cloud Infrastructure,329,442
4,AL810,Quantity Surveying,307,349
...,...,...,...,...
925,WD200,Arts (options),221,296
926,WD210,Software Systems Development,271,329
927,WD211,Creative Computing,275,322
928,WD212,Recreation and Sport Management,274,311


('data/201920211101_201444.pdf', <http.client.HTTPMessage at 0x11e1c6910>)

***
# End