# CAO Points Analysis

[CAO](http://www.cao.ie/index.php?page=points&p=2021)

***

In [1]:
# Regular Expressions
import re

# Convenient HTTP Requests.
import requests as rq

# dates and times
import datetime as dt

# For PDF Scrape CAO 2019
import PyPDF2
import openpyxl

In [2]:
# Fetch CAO points URL
resp = rq.get('http://www2.cao.ie/points/l8.php')
# Have a quick peek
resp

<Response [200]>

# Save original data set

In [3]:
# Get Current date and time.
now = dt.datetime.now()

# Format as string
nowstr = now.strftime('%Y%m%d_%H%M%S')

In [4]:
#Create file path for original data
path = 'data/cao2021_'+ nowstr + '.html'

### Error on Server
- Server says we should decode as per:
    Content-Type: text/html; charset=iso-8859-1
- However one line uses \x96 which is not defined in iso-8859-1
- Therefore we used the similar decoding standard cp1252
- This is very similar but includes \x96

In [5]:
# Server uses the incorrect encoding
original_encoding = resp.encoding
# Change to cp1252
resp.encoding = 'cp1252'

In [6]:
# Save The orginal html file
with open(path, 'w') as f:
    f.write(resp.text)

In [7]:
resp.encoding

'cp1252'

# Use regular Expressions to select the lines we want

In [8]:
# Compile Reg-Ex for Matching lines

re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')

In [9]:
# File path for data 
path = 'data/cao2021_csv_'+ nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open CSV file for writing 
with open (path, 'w') as f:

    # loop through lines of the response
    for line in resp.iter_lines():
        #decode the line using 'cp1252' - wrong encoding
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # Split the line on two or more spaces
            linesplit = re.split('  +', dline)
            # Rejoin the substrings with commas
            f.write(','.join(linesplit) + '\n')

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


In [10]:


# Fetch CAO 2019 points URL - PDF format
resp2 = rq.get('http://www2.cao.ie/points/lvl8_19.pdf')
# Have a quick peek
resp2

<Response [200]>

In [11]:
#Create file path for original data
path2 = 'data/cao2019_'+ nowstr + '.html'

In [12]:
# Save The orginal html file
with open(path2, 'w') as f:
    f.write(resp.text)

In [13]:
# File path for data 
path2 = 'data/cao2019_csv_'+ nowstr + '.csv'


In [14]:
# Keep track of how many courses we process.
#no_lines = 0

# Open CSV file for writing 
with open (path2, 'w') as f:

    # loop through lines of the response
    for line in resp.iter_lines():
            no_lines = no_lines + 1


# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 3226.


In [30]:


pdfFileObj = open('http://www2.cao.ie/points/lvl8_19.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages



SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (<ipython-input-30-eb9ee9adfd36>, line 14)

In [None]:
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()


wb = openpyxl.load_workbook('C:\Users\lynch08\Desktop\GMITWinter2021\Fundementals_Of_Data_Analysis\CAO Points.CAO2019.csv')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext

wb.save('C:\Users\lynch08\Desktop\GMITWinter2021\Fundementals_Of_Data_Analysis\CAO Points.CAO2019.csv')
print('DONE!!')