# NLP Project - Stage 1

## The ask

Write a function that takes a URL as input, reads the PDF document found at that URL and saves it to the local filesystem.

## Programme Function

This programme fetches a PDF file from a remote URL and saves it locally. Optionally (`print_text=True`), it also extracts the text from the saved document as a list - collected page by page - and prints it.

In [7]:
# Import packages
import urllib.request
from io import BytesIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage

### Function Definition

In [4]:
#PDF to text Function. 
def pdf_to_text(path):
    """Extract the text of a local PDF file, one list entry per page.

    Args:
        path: Filesystem path of the PDF document to read.

    Returns:
        A list with one item per page, in document order. Each item is
        the content the converter wrote into the in-memory ``BytesIO``
        buffer while that single page was processed.
    """
    manager = PDFResourceManager()
    layout = LAParams(all_texts=True)
    text_list = []

    # Context managers guarantee the file and the buffer are released even
    # if the PDF turns out to be malformed or not extractable.
    with open(path, 'rb') as pdf_file, BytesIO() as retstr:
        device = TextConverter(manager, retstr, laparams=layout)
        try:
            interpreter = PDFPageInterpreter(manager, device)
            for page in PDFPage.get_pages(pdf_file, check_extractable=True):
                interpreter.process_page(page)
                text_list.append(retstr.getvalue())
                # The converter keeps appending to the same buffer, so it
                # must be emptied here; otherwise every entry would also
                # contain the text of all preceding pages.
                retstr.seek(0)
                retstr.truncate(0)
        finally:
            device.close()

    return text_list

In [5]:
# Fetch PDF from remote URL function
def get_pdf_from_url(url,filename=None,print_text=False):
    """Download the document at ``url`` and save it to the local filesystem.

    Args:
        url: Address of the document to fetch, as a string accepted by
            ``urllib.request.urlopen``.
        filename: Local path to write the downloaded bytes to. When
            omitted, the last segment of the URL's path is used (any
            query string or fragment is ignored) and the file is written
            relative to the current working directory.
        print_text: When true, run ``pdf_to_text`` on the saved file and
            print the resulting list.

    Returns:
        None.

    Raises:
        ValueError: If ``filename`` is omitted and the URL path has no
            final segment to use as a file name (e.g. it ends in ``/``).
    """
    # Local imports keep this cell self-contained; all are standard library.
    import posixpath
    import shutil
    from urllib.parse import urlsplit

    if filename is None:
        # Use only the path component so "doc.pdf?download=1" is saved as
        # "doc.pdf" rather than under a name containing the query string.
        filename = posixpath.basename(urlsplit(url).path)
        if not filename:
            # Fail before touching the network instead of letting
            # open('') raise a confusing error after the request is made.
            raise ValueError(
                "Cannot derive a file name from URL %r; "
                "pass 'filename' explicitly." % (url,)
            )

    # Both handles are closed on every exit path, and the body is streamed
    # to disk in chunks rather than being held in memory all at once.
    with urllib.request.urlopen(url) as webFile:
        with open(filename,'wb') as localFile:
            shutil.copyfileobj(webFile, localFile)

    # get text from the pdf file
    if print_text:
        text = pdf_to_text(filename)
        print(text)

In [6]:
# Example run: download a sample PDF into the current working directory.
# Alternative test document (uncomment to use it instead):
# url = 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf'
url = 'http://www.africau.edu/images/default/sample.pdf'
# No filename is given, so the file name is taken from the end of the URL
# ('sample.pdf'); print_text keeps its default of False, so nothing is printed.
get_pdf_from_url(url)