## PDF Converter

In [1]:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
import csv
import wget

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text content of a single PDF and return it as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. It is turned
            into a set and handed to ``PDFPage.get_pages``. When omitted or
            empty, an empty set is passed instead (presumably this selects
            every page -- confirm against the pdfminer documentation).

    Returns:
        The text that pdfminer's ``TextConverter`` wrote for the processed
        pages.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``with`` guarantees the PDF handle is released even when
            # pdfminer raises on a malformed document.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, and before the
        # buffer itself is closed in the outer ``finally``.
        return output.getvalue()
    finally:
        output.close()

In [None]:
def convertMultiple(pdfDir, txtDir):
    """Convert every file in ``pdfDir`` to text and save it in ``txtDir``.

    An input named ``name`` is written to ``name.txt`` inside ``txtDir``.
    Inputs whose ``.txt`` output already exists in ``txtDir`` are skipped.
    Files that fail to convert are reported on stdout and listed, one per
    row, in ``convert_errors.csv`` in the current working directory; that
    file is rewritten on every call.

    Args:
        pdfDir: Directory holding the PDFs. An empty string means the
            current working directory.
        txtDir: Existing directory that receives the ``.txt`` files.
    """
    lst_errors = []
    # A set gives O(1) lookups for the "already converted" check below.
    existing_files = set(os.listdir(txtDir))
    if pdfDir == "":
        pdfDir = os.getcwd()
    for pdf in os.listdir(pdfDir):
        txt_name = pdf + '.txt'
        if txt_name in existing_files:
            continue
        try:
            # os.path.join works with or without a trailing separator on the
            # directory arguments, on every platform.
            text = convert(os.path.join(pdfDir, pdf))
            # UTF-8 so that non-ASCII text extracted from a PDF cannot fail
            # on platforms whose default encoding is narrower.
            with open(os.path.join(txtDir, txt_name), "w",
                      encoding="utf-8") as textFile:
                textFile.write(text)
        except Exception:
            # Best effort: one unreadable file must not abort the batch.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            print('error filename:', pdf)
            lst_errors.append(pdf)

    # Record every failed file name, one per row.
    with open('convert_errors.csv', 'w', newline='') as myfile:
        wr = csv.writer(myfile, lineterminator='\n')
        for error in lst_errors:
            wr.writerow([error])

In [None]:
def pdf_directory_convertor(pdfDir='pdf/', txtDir='txt/'):
    """Convert the PDFs in ``pdfDir`` to text files stored in ``txtDir``.

    Args:
        pdfDir: Directory containing the PDFs to convert (default ``'pdf/'``).
        txtDir: Directory that receives the text files (default ``'txt/'``);
            it is created when it does not exist yet.
    """
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(txtDir, exist_ok=True)
    convertMultiple(pdfDir, txtDir)

## JMLR Scraper

In [8]:
import csv
import pandas as pd
import bs4 as bs
import re
import urllib

In [114]:
def get_jmlr_links(primaryurl):
    """Collect the PDF links of individual papers from jmlr.org.

    The page at ``primaryurl`` is fetched, and every anchor inside its
    ``<div id="content">`` except the last one is followed. On each of those
    pages, anchors with ``target="_blank"`` whose ``href`` ends in ``pdf``
    are collected.

    Args:
        primaryurl: URL of the index page, e.g. ``'http://www.jmlr.org/papers/'``.

    Returns:
        A list of absolute PDF URLs, in the order they were found.

    Raises:
        IndexError: If the index page has no ``<div id="content">``.
        urllib.error.URLError: If a page cannot be fetched.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import what is needed explicitly.
    import urllib.request
    from urllib.parse import urljoin

    def fetch_soup(url):
        # Parse the page at *url*; the HTTP response is closed on exit.
        with urllib.request.urlopen(url) as response:
            return bs.BeautifulSoup(response.read(), "lxml")

    full_jmlr_links = []
    content = fetch_soup(primaryurl).find_all('div', {'id': 'content'})[0]
    # The last anchor of the content block is skipped, as before
    # (presumably it is not a volume link -- TODO confirm against the site).
    for anchor in content.find_all('a')[:-1]:
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # urljoin resolves relative, root-relative and absolute hrefs alike.
        page_url = urljoin(primaryurl, href)
        for paper in fetch_soup(page_url).find_all('a', {'target': '_blank'}):
            link = paper.get('href')
            if link and link.endswith('pdf'):
                full_jmlr_links.append(urljoin(page_url, link))
    return full_jmlr_links

In [111]:
def paper_downloader(pdf_list, output_directory):
    """Download every PDF in ``pdf_list`` into ``output_directory``.

    Args:
        pdf_list: Iterable of PDF URLs to download.
        output_directory: Destination directory. It is created once, before
            the first download, when it does not exist yet (also when
            ``pdf_list`` turns out to be empty).
    """
    # Loop-invariant: make sure the destination exists a single time.
    os.makedirs(output_directory, exist_ok=True)
    for pdf_link in pdf_list:
        wget.download(pdf_link, out=output_directory)

In [10]:
# Check whether the text output for '1609.06935.pdf' is already present in
# txt/computer_science/ (same "<pdf name>.txt" naming that convertMultiple uses).
'1609.06935.pdf'+'.txt' in os.listdir('txt/computer_science/')

True

In [6]:
# List the entries currently in the computer_science PDF folder.
os.listdir('pdf/computer_science/')

['.DS_Store', '1609.06935.pdf', '2.pdf', '3 copy.pdf']

In [107]:
# Index page handed to get_jmlr_links, and the folder that paper_downloader
# saves the downloaded PDFs into.
primaryurl = 'http://www.jmlr.org/papers/'
output_directory = 'pdf/'

In [108]:
# Step 1: collect the PDF links reachable from the index page.
pdf_list=get_jmlr_links(primaryurl)
print('Finished updating paper list')
# Step 2: download every collected PDF into output_directory.
paper_downloader(pdf_list, output_directory)
print('Finished downloading paper, will start converting')
# Step 3: convert the downloaded PDFs to text files.
pdf_directory_convertor()
print('Finished')