# A Comparison Analysis of CAO points for 2019, 2020, 2021
http://www.cao.ie/index.php?page=points&p=2021
***

<br>

## Import Libraries

***

<br>


In [12]:
# Imports libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random

# Ensures plots are shown
%matplotlib inline

In [28]:
# Regular expressions
import re

# HTTP requests
import requests as rq

# Dates and times
import datetime as dt

# Opening URLs 
import urllib.request as urlrq

<br>

## 2021 CAO data

http://www.cao.ie/index.php?page=points&p=2021&bb=points

***

<br>

<br>

## Retrieve data from webserver
***
<br>

In [14]:
# Retrieves CAO points from the webserver.
response = rq.get('http://www2.cao.ie/points/l8.php')

# Response 200 signifies a successful request/response.
response

<Response [200]>

## Save original data
***
<br>


In [34]:
# Gets the current date and time
cur_time = dt.datetime.now()

# Format as a string
current_time = cur_time.strftime('%Y%m%d_%H%M%S')

In [16]:
# Creates a file path for the original data
path = 'data/cao2021' + current_str + '.html'# Note the importance of the filename and how it
                                             # will be easy to find in folders/sorted appropriately

In [17]:
# Saves the original html file.
with open(path, 'w') as f:
    f.write(response.text)

<br>

## Charset error on server

***

<br>

Technically, server states decoding as:

```
    Content-Type: text/html; charset=iso-8859-1.
``` 

However, one line uses \x96 which isn't defined in iso-8859-1. 

Therefore, we use the similar decoding standard cp1252, which is very similar but includes \x96. 

In [18]:
# Server uses incorrect encoding
orig_encoding = response.encoding

# Corrected encoding to cp1252
response.encoding = 'cp1252'

<br>

## Using regular expressions to extract desired data

<br>

***


In [19]:
# Compiles the regular expression for matching lines so it doesn't recompile repeatedly.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')  # 'r' python treats string as raw string and doesnt evaluate back slashes
                                                                    # \ {character} means we want the literal character ie., *
                                                                    # ? means 0 or 1 of 
                                                                    # + means 1 or more of 

In [20]:
# Path for CSV file
path = 'data/cao2021' + current_str + '.csv'

# Keeping count of the courses we are processing.
course_count = 0

# Opens the CSV file for wrting.
with open(path, 'w') as f:
    # Loops through and prints data from response line by line.
    for line in response.iter_lines():
        # Decoding turns bytes into code points and those code points can be displayed on the screen
        d_line = line.decode('cp1252')
        # Matches the string specified in re_course, returning only the courses from the response
        if re_course.fullmatch(d_line):
            # Adds one to the course count
            course_count += 1
            # Uses regular expressions splits a string where there are at least two spaces
            line_split = re.split('  +', d_line)
            # Rejoins the substrings with commas
            f.write(','.join(line_split) + '\n')
    print(f"Line count: {course_count}")

Line count: 922


<br>

## 2021 CAO data

http://www.cao.ie/index.php?page=points&p=2020&bb=points

<br>

***


In [35]:
# Create a file path for the original data set
path = 'CAO2020' + current_time + '.xlxs'

# Opening URL
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)


('CAO202020211101_201444.xlxs', <http.client.HTTPMessage at 0x11db75b80>)

<br>

## Load data with panda
https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
***

<br>

In [49]:
# Load and parse spread sheet
df_20 = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows=10)
df_20

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [57]:
# Delete irrelevant columns
# df_20.drop(columns=['COURSE TITLE', 
#                     'CATEGORY (i.e.ISCED description)', 
#                     'R1 Random *', 
#                     'R2 Random*', 
#                     'EOS', 
#                     'EOS Random *', 
#                     'EOS Mid-point', 
#                     'HEI', 
#                     'avp', 
#                     'v', 
#                     'Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6', 
#                     'Column7', 'Column8'])


Unnamed: 0,COURSE CODE2,R1 POINTS,R2 POINTS,LEVEL,Test/Interview #
0,AC120,209,,8,
1,AC137,252,,8,
2,AD101,#+matric,,8,#
3,AD102,#+matric,,8,#
4,AD103,#+matric,,8,#
...,...,...,...,...,...
1459,WD208,188,,7,
1460,WD210,279,,8,
1461,WD211,271,,8,
1462,WD212,270,,8,


In [59]:
# Checking random row to ensure data integrity.
df_20.iloc[650]

CATEGORY (i.e.ISCED description)                                             Arts
COURSE TITLE                        Arts (Drama, Theatre and Performance Studies)
COURSE CODE2                                                                GY118
R1 POINTS                                                                     451
R1 Random *                                                                   NaN
R2 POINTS                                                                     NaN
R2 Random*                                                                    NaN
EOS                                                                           451
EOS Random *                                                                  NaN
EOS Mid-point                                                                 492
LEVEL                                                                           8
HEI                                        National University of Ireland, Galway
Test/Interview #

In [61]:
# Another random check
df_20.iloc[-1]


CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [62]:
# Saving data to CSV file.
path = 'data/cao2020_' + current_time + '.csv'
df_20.to_csv(path)


<br>

## 2019 CAO data
 
http://www2.cao.ie/points/lvl8_19.pdf

<br>

***

***
# End