## CAO Points Analysis

***

In [1]:
# To access a URL from python
# import urllib

# Convenient HTTP requests
import requests as rq

# Regular expressions
import re

# Should always call this out on the requirements txt file as it doesn't come as standard package with python
# Only if you have anaconda does it come with the package

# Dates and times
import datetime as dt

# Data frames
import pandas as pd

# for downloading 
import urllib.request as urlrq

<br>

## 2021 Points 
https://www.cao.ie/index.php?page=points&p=2021&bb=points
***

In [2]:
# Fetch the CAO points URL.
resp = rq.get('http://www2.cao.ie/points/l8.php')
# Have a quick peek (http code for ok)
resp
# To see the whole text
# resp.text

<Response [200]>

### Save Original data set

***

In [3]:
# Get the current date and time
now = dt.datetime.now()

# Format as a string
nowstr = now.strftime('%Y%m%d_%H%M%S')

In [4]:
# Create a file path for the original data
path = 'data/cao2021_' + nowstr + 'html'

<br>

### Error on server

***

Technically, the server says we should decode as per:
```
Content-Type: text/html; charset=iso-8859-1
```
However, one line uses \x96 which isn't defined in iso-8859-1.
<br>Therefore we use the similar decoding standard cp1252, which is very similar but includes #x96.

In [5]:
# The server uses the wrong encoding, fix it
original_encoding = resp.encoding
# Change to cp1252
resp.encoding = 'cp1252'

In [6]:
# Save the original html file
with open (path, 'w') as f:
    f.write(resp.text)

<br>

### Use regular expressions to select lines we want

***

In [7]:
# Compile the regular expression for matching lines
re_course = re.compile('([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

<br>

### Loop through the lines of the response

***

In [8]:
# The file path for the csv file
path = 'data/cao2021_csv_' + nowstr + '.csv'

# Keep track of how many courses we process 
no_lines = 0

# Open the cav file for writing
with open (path, 'w') as f:
    # Loop through lines of the respone
    for line in resp.iter_lines():
        # Decode the line, changed from ISO-8859-1 to cp1252
        # as one line uses \x96 which isn't defined in iso-8859-1
        dline = line.decode('cp1252')
        # Match only the lines representing courses
        if re_course.fullmatch(dline):
            # Add one to the lines counter
            no_lines = no_lines + 1
            # Uncomment next lines to see the original
            # print(line)
            # Pick out the relevant parts of the matched line
            # csv_version = re_course.sub(r'\1,\2,\3,\4', dline)
            # Print the CSV-style line
            # print(csv_version)
            # Split the line on two or more spaces
            linesplit = re.split('  +', dline)
            # print(linesplit)
            # print(','.join(linesplit))
            # Rejoin the substrings with commas between them
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


<br>

## 2020 Points
https://www.cao.ie/index.php?page=points&p=2020&bb=points

***

<br>

#### Save Original File

***

In [9]:
# Create a file path for the original data
path = 'data/cao2020_' + nowstr + 'xslx'

In [10]:
# Save original file to disk
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)

('data/cao2020_20211109_211554xslx',
 <http.client.HTTPMessage at 0x17cf071a160>)

<br>

#### Load Spreadsheet using pandas

***

In [11]:
# Download and parse the excel spreadsheet
df = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows=10)

In [12]:
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [13]:
# Spot check random row
df.iloc[1463]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [14]:
# Spot check the 4th last row
df.iloc[-4]

CATEGORY (i.e.ISCED description)    Information and Communication Technologies (ICTs)
COURSE TITLE                                             Software Systems Development
COURSE CODE2                                                                    WD210
R1 POINTS                                                                         279
R1 Random *                                                                       NaN
R2 POINTS                                                                         NaN
R2 Random*                                                                        NaN
EOS                                                                               279
EOS Random *                                                                      NaN
EOS Mid-point                                                                     337
LEVEL                                                                               8
HEI                                                 Wa

In [15]:
# Create a file path for the pandas data
path = 'data/cao2020_' + nowstr + 'csv'

In [16]:
# Save pandas data frame to disk
df.to_csv(path) 

<br>

## 2019 Points 
https://www.cao.ie/index.php?page=points&p=2019&bb=points
***

***
### End