In [1]:
import tabula
import os
import sys
import pandas as pd
from PyPDF2 import PdfFileReader
import time
import re
from decimal import Decimal
from IPython.core.display import clear_output

In [2]:
def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [3]:
pdfPath = os.path.join('..', '..', 'dat', 'raw', 'pdf', 'windspeedData.pdf')

In [4]:
#get number of pages
with open(pdfPath, "rb") as p:
    n = PdfFileReader(p).numPages

In [5]:
start = time.time()

#define interval for managing Java heap space
#keeping it small so we can watch a progress bar
interval = 10

dfs = []
k = 0
for i in range(1,n,interval):
    i = i + k
    j = i + interval
    if j > n:
        j = n
    k += 0
        
    try:
        #read tables
        tables = tabula.read_pdf(pdfPath,
                                 pages=(str(i)+'-'+str(j)))

        #drop first row (it's part of a column name) and concat
        tables = [d.iloc[1:] for d in tables]
        df = pd.concat(tables,
                       ignore_index=True)

        #clean up column name issue cited above
        df = df.rename(columns={'Outdoor': 'Outdoor Temperature (ºF)'})

        #append to large df
        dfs.append(df)
    
    except:
        print('Error around page ', i)
        sys.exit(0)
        
    update_progress(i / n)

dfs = pd.concat(dfs,
               ignore_index=True)
dfs = dfs.iloc[:,:-2]
end = time.time()

update_progress(1) 
print("Done! " + str(end-start) + " seconds elapsed.")

Progress: [####################] 100.0%
Done! 4133.923712968826 seconds elapsed.


In [6]:
dfs

Unnamed: 0,Date,Count,Sulfur Dioxide,Wind Speed – Scalar,Wind Direction – Resultant,Peak Wind Gust,Outdoor Temperature (ºF),Grouping
0,1/1/17 0:00,1.0,0.0,5.2,204.0,7.5,,K2HS‐K3HS‐K4CS‐K5CS
1,1/1/17 0:05,1.0,0.3,9.3,223.9,12.9,,K2HS‐K3HS‐K4CS‐K5CS
2,1/1/17 0:10,1.0,0.3,7.8,215.5,11.6,,K2HS‐K3HS‐K4CS‐K5CS
3,1/1/17 0:15,1.0,0.4,8.2,229.0,13.8,,K2HS‐K3HS‐K4CS‐K5CS
4,1/1/17 0:20,1.0,0.3,8.6,286.8,14.4,,K2HS‐K3HS‐K4CS‐K5CS
...,...,...,...,...,...,...,...,...
285916,7/8/19 7:35,1.0,0.4,4.7,249.5,7.5,82.5,
285917,7/8/19 7:40,1.0,0.5,4.7,255.6,6.8,82.7,
285918,7/8/19 7:45,1.0,0.4,4.6,256.2,6.2,82.9,
285919,7/8/19 7:50,1.0,0.4,6.2,262.2,9.0,83.0,


In [7]:
csvPath = os.path.join('..', '..', 'dat', 'clean', 'csv', 'windspeedData.csv')
dfs.to_csv(csvPath, index=False)

In [8]:
import session_info
session_info.show()