### Read in a pdf file using tabula read pdf
***

Link to the documentation
https://tabula-py.readthedocs.io/en/latest/

In [1]:
#set up the url where the source pdf is held - this is what read_pdf is pointed at
url = "http://www2.cao.ie/points/lvl8_19.pdf"

In [2]:
import os

import pandas as pd
from tabula import read_pdf

#read_pdf gives back a list of dataframes rather than a single dataframe
#header=None stops pandas using the first extracted row as the column names
df = read_pdf(
    url,
    pages='all',
    pandas_options={"header": None},
)

In [3]:
#tabula-py had to be installed before the import cell above would run
#uncomment the line below if needed - and add tabula-py to the requirements file
#%pip install tabula-py

In [4]:
#see how many dataframes (one per page/table of the pdf) were returned

len(df)

18

In [5]:
# walk through the returned pages and show the first few rows of each one
for table in df:
    print(table.head())


             0                                                  1    2    3
0  Course Code                             INSTITUTION and COURSE  EOS  Mid
1          NaN                    Athlone Institute of Technology  NaN  NaN
2        AL801    Software Design with Virtual Reality and Gaming  304  328
3        AL802               Software Design with Cloud Computing  301  306
4        AL803  Software Design with Mobile Apps and Connected...  309  337
       0                                                1    2      3
0  CW258                       Cybercrime and IT Security  300  328.0
1  CW268  Computing in Interactive Digital Art and Design  274  321.0
2  CW438                           Construction (options)  271  308.0
3  CW468                         Architectural Technology  252  290.0
4  CW478                                Civil Engineering  348  383.0
       0                               1    2      3
0  CR500      Engineering (Common Entry)  381  424.0
1  CR510  Sustaina

`read_pdf` returns a list of dataframes - one per page/table of the document

In [6]:
#access an individual page - index 0 is the first page of the pdf
page=df[0]

In [7]:
#print the contents of the page selected above
print(page)

              0                                                  1     2    3
0   Course Code                             INSTITUTION and COURSE   EOS  Mid
1           NaN                    Athlone Institute of Technology   NaN  NaN
2         AL801    Software Design with Virtual Reality and Gaming   304  328
3         AL802               Software Design with Cloud Computing   301  306
4         AL803  Software Design with Mobile Apps and Connected...   309  337
5         AL805        Network Management and Cloud Infrastructure   329  442
6         AL810                                 Quantity Surveying   307  349
7         AL820                 Mechanical and Polymer Engineering   300  358
8         AL830                                    General Nursing   410  429
9         AL832                                Psychiatric Nursing   387  403
10        AL836                       Nutrition and Health Science   352  383
11        AL837            Sports Science with Exercise Physiolo

In [8]:
#TODO - save a copy of the original pdf file, write the dataframe to a csv file for future use, check out pd.concat

In [9]:
#go through every page and, in place, remove each row
#whose first column (column 0) is NaN
for table in df:
    table.dropna(subset=[0], inplace=True)


In [10]:
#have a look at the first page (index 0) again to see if this worked as expected
#rows 1 and 29 should be gone - these held the names of the institutions
print(df[0])

              0                                                  1     2    3
0   Course Code                             INSTITUTION and COURSE   EOS  Mid
2         AL801    Software Design with Virtual Reality and Gaming   304  328
3         AL802               Software Design with Cloud Computing   301  306
4         AL803  Software Design with Mobile Apps and Connected...   309  337
5         AL805        Network Management and Cloud Infrastructure   329  442
6         AL810                                 Quantity Surveying   307  349
7         AL820                 Mechanical and Polymer Engineering   300  358
8         AL830                                    General Nursing   410  429
9         AL832                                Psychiatric Nursing   387  403
10        AL836                       Nutrition and Health Science   352  383
11        AL837            Sports Science with Exercise Physiology   351  392
12        AL838                                      Biotechnolo

In [11]:
#have a look at the last page (page 18, index 17 in this run)
#index -1 is used so this still shows the last page if the number of pages changes
print(df[-1])

        0                                                  1     2      3
0   TL881                                        Social Care   308  356.0
1   TL889  Counselling with Addiction - Mature applicants...     #    NaN
2   TL890              General Nursing and Mature Applicants   407  431.0
3   TL891        Mental Health Nursing and Mature Applicants  367*  379.0
5   WD001                        Applied Computing (options)   279  358.0
6   WD002                                  Science (options)   302  377.0
7   WD005                          Health Sciences (options)   288  347.0
8   WD006                        Exercise Sciences (options)   282  337.0
9   WD007                              Engineering (options)   287  382.0
10  WD025            Construction Management and Engineering   269  327.0
11  WD027                                              Music  #290  317.0
12  WD048                                 Business (options)   278  357.0
13  WD084                             

In [12]:
#This is looking good - now need to figure out how to merge the pages into one dataframe

In [13]:
#use the pandas concat function to join every page into one new dataframe
#each page carries its own row index, so ignore_index=True drops those
#and gives the combined dataframe a single fresh row index
#read_pdf already returned a list of dataframes, so the list can be passed straight in
#NOTE(review): the pdf's own heading row ('Course Code', ...) from the first page is
#still a data row in the result (row 0 in the head() output) - confirm whether it
#should be promoted to the column names instead
df2019 = pd.concat(df, ignore_index=True)

In [14]:
#check if the above worked by getting the number of rows in the new df
len(df2019)


931

In [15]:
#Looks ok - check the head and tail and cross-check them against the original file
df2019.head()

Unnamed: 0,0,1,2,3
0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,AL801,Software Design with Virtual Reality and Gaming,304,328
2,AL802,Software Design with Cloud Computing,301,306
3,AL803,Software Design with Mobile Apps and Connected...,309,337
4,AL805,Network Management and Cloud Infrastructure,329,442


In [16]:
#check the last few rows as well
df2019.tail()

Unnamed: 0,0,1,2,3
926,WD200,Arts (options),221,296.0
927,WD210,Software Systems Development,271,329.0
928,WD211,Creative Computing,275,322.0
929,WD212,Recreation and Sport Management,274,311.0
930,WD230,Mechanical and Manufacturing Engineering,273,348.0


In [17]:
#have a look at the whole dataframe (the display is truncated to the first and last rows)
df2019

Unnamed: 0,0,1,2,3
0,Course Code,INSTITUTION and COURSE,EOS,Mid
1,AL801,Software Design with Virtual Reality and Gaming,304,328
2,AL802,Software Design with Cloud Computing,301,306
3,AL803,Software Design with Mobile Apps and Connected...,309,337
4,AL805,Network Management and Cloud Infrastructure,329,442
...,...,...,...,...
926,WD200,Arts (options),221,296.0
927,WD210,Software Systems Development,271,329.0
928,WD211,Creative Computing,275,322.0
929,WD212,Recreation and Sport Management,274,311.0


In [18]:
#ok - the row indexes of the initial dataframes need to be dropped when creating this new df, which is what ignore_index=True does
#(redo to demo? if so, rerun the concat above without ignore_index to show the difference)

In [19]:
#save the dataframe as a csv file to disk
#to_csv will not create a missing folder, so make sure the target folder exists first
os.makedirs('cao-points-main/data', exist_ok=True)
df2019.to_csv('cao-points-main/data/cao-2019.csv',sep=',',encoding='utf-8')

***
## End