#### Imports

In [8]:
import PyPDF2
import csv
import re

#### Enter info about the pdf

In [9]:
pdfFileObj = open('directory_2017.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print("Number of pages:", pdfReader.numPages)

Number of pages: 1422


#### Scrape key to facility codes
- 2 problems to solve if accurate key info is needed (right now just using the list of abbr.): 
    - titles of sections?
    - lines for value are sometimes splitted thus not included

In [10]:
abbr_key = {}
def clean_key_page(raw_abbr_key):
    raw_abbr_key[:] = [l for l in raw_abbr_key if (l.strip()!= '')]
    i = 0
    while i < len(raw_abbr_key):
        #print(raw_abbr_key[i])
        if raw_abbr_key[i].isupper():
            abbr_key[raw_abbr_key[i].strip()] = raw_abbr_key[i+1].strip()
            i += 2
        else:
            spitted = raw_abbr_key[i].split()
            if spitted[0].isupper():
                abbr_key[spitted[0]] = ', '.join(spitted[1:])
            i += 1

In [11]:
raw_abbr_key1 = pdfReader.getPage(4).extractText()
raw_abbr_key1 = raw_abbr_key1.split('\n')
raw_abbr_key1 = raw_abbr_key1[1:len(raw_abbr_key1) - 4]
clean_key_page(raw_abbr_key1)

In [12]:
raw_abbr_key2 = pdfReader.getPage(5).extractText()
raw_abbr_key2 = raw_abbr_key2.split('\n')
raw_abbr_key2 = raw_abbr_key2[:len(raw_abbr_key2) - 4]
clean_key_page(raw_abbr_key2)

In [13]:
raw_abbr_key3 = pdfReader.getPage(6).extractText()
raw_abbr_key3 = raw_abbr_key3.split('\n')
raw_abbr_key3 = raw_abbr_key3[26:len(raw_abbr_key3)-1]
clean_key_page(raw_abbr_key3)

In [14]:
abbr_key

{'GHF': 'General Health Services',
 'MHF': 'Mental Health Treatment Services',
 'MHSAF': 'Mix of Mental Health and Substance Abuse',
 'SAF': 'Substance Abuse Treatment Services',
 'ACM': 'Acamprosate (Campral®)',
 'BM': 'Buprenorphine, maintenance',
 'BMW': 'Buprenorphine, maintenance, for, a, pre-',
 'BU': 'Buprenorphine, used, in, treatment',
 'DB': 'Buprenorphine, detoxi˝cation',
 'DM': 'Methadone, detoxi˝cation',
 'DSF': 'Disul˝ram (Antabuse®)',
 'DT': 'Detoxi˝cation',
 'HH': 'Transitional, housing, or, halfway, house',
 'METH': 'Methadone',
 'MM': 'Methadone maintenance',
 'MMW': 'Methadone maintenance for pre-determined',
 'MOA': 'Accepts clients on opioid medication',
 'MPD': 'Medications for psychiatric disorders',
 'NMOA': 'Does not use medication for opioid addiction',
 'NOOP': 'Does not treat opioid addiction',
 'NXN': 'Naltrexone, (oral)',
 'OTP': 'SAMHSA-certi˝ed Opioid Treatment Program',
 'OTPA': 'All clients in Opioid Treatment Program',
 'PAIN': 'Use methadone/buprenor

#### Scraper test on a page of data
- data starts on page 15 (getPage(14))

In [345]:
def clean_data(rawdata):
    '''
    Roughly split data, returns list of list.
    Inputs:
        rawdata_from_page(list): list of items on the page that were seperated by /n
    '''
    #split erroneously connected data
    data = []
    temp = []
    i = 0
    while i < len(rawdata) - 1:
        #if i is the last line of keys, i+1 is facility name, i-1 is the phone number or other line of keys
        if rawdata[i].isupper() and (rawdata[i].split()[0] in abbr_key) and \
        (not rawdata[i+1].isupper()) and (rawdata[i-1][0] == '(' or rawdata[i-1].isupper()): 
            temp.append(rawdata[i])
            data.append(temp)
            temp = []
        else:
            temp.append(rawdata[i])
        i += 1
    temp.append(rawdata[i])
    data.append(temp)
    return data


def clean_facility_name(fname):
    '''
    Split facility name into ["junk", actual name]
    fname(str): name of the facility
    '''
    front = ''
    i = 0
    while i < len(fname) - 1:
        if (fname[i].isupper() or fname[i] == ' ')\
        and (fname[i+1].isupper() or fname[i+1] == ' '):
            front += fname[i]
            i += 1
        else:
            break
            
    if front in abbr_key:
        return [front, fname[i:]]
    
    #if its just capitalized name...
    elif front == 'YMCA' or front == 'BAART' or fname[i:i+3] == 'Inc':
        return ['', fname]
    #if it is accidentally included city name
    else:
        return ['', fname[i:]]
    
    
def clean_all_facility_name(data):
    '''
    Move and link wrongly splitted data, changes in place
    '''
    i = 0
    while i < len(data):
        facility_name_raw = data[i][0]
        if facility_name_raw[:2].isupper():
            code, fac_name = clean_facility_name(facility_name_raw)
            data[i][0] = fac_name
            if i != 0:
                data[i - 1].append(code)
        i += 1

In [346]:
def split_center_street(cn_sa, page):
    '''
    Split a string containing center name and street address into two.
    Inputs:
        cn_sa(str)
    Returns: a list
    '''
    #Exception to split at number :
        #When it ends with "P.O. Box",
        #When the number being found is at the very front (wrongly included page number or 
        #center name starts with number)
        
    #If the number is at the end(suite xxx or building xxx) lets include the word in front of it)
    #print(cn_sa)
    #print(page)
    if cn_sa[0].isnumeric():
        temp = cn_sa.split()
        if temp[0] == str(page):
            cn_sa = ' '.join(temp[1:])
            
    m = re.finditer(r"\d+", cn_sa)

    for num in m:
        span = num.span()
        if span[0] != 0:    
            if cn_sa[span[0] - 9:span[0] - 1] == 'P.O. Box':
                return ([cn_sa[:span[0] - 10], cn_sa[span[0] - 9:]])
            if span[1] != len(cn_sa):
                return ([cn_sa[:span[0] - 1], cn_sa[span[0]:]])
            if span[1] == len(cn_sa):
                return ([' '.join(cn_sa.split()[:-2]), ' '.join(cn_sa.split()[-2:])])
    return [cn_sa, '']

In [376]:
def write_data(data, year, page, filename):
    '''
    write data into right columns
    '''
    with open(filename, 'a', newline = '') as file:
        writer = csv.writer(file)
        #loop over centers
        for dc in data:
            #print(dc)##
            i = 0
            data_holder = [year, page]
        #Center name, street address
            cn_sa = ''
            while (',' not in dc[i]) or not dc[i][-5:].isnumeric(): #(len(dc[i]) < 2) or 
                #print(i)##
                cn_sa += ' ' + dc[i]
                i += 1
            data_holder += split_center_street(cn_sa.strip(), page)
            #print(data_holder)
                
        #City, State, Postal_code
            city, state_postcode = dc[i].split(',')
            state, postcode = state_postcode.split()
            data_holder += [city, state, postcode]
            i += 1
        #phone
            phone = dc[i]
            i += 1
            while not dc[i][0:2].isupper():
                phone += ' ' + dc[i]
                i += 1
            data_holder.append(phone)
        #keys
            data_holder.append(' '.join(dc[i:]))
        #write data
            writer.writerow(data_holder)

## Test on a bunch of pages
- Implement a geograpy package to better clean data?
- page 93 (BAART Programs) BAART is deleted because of the capitalization
- Manually entered: 
    - 2017: page 885, 1206, 1334

In [392]:
def run(dir_filename, year, start_page):
    
    #read file
    pdfFileObj = open(dir_filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    
    #pages to scrape
    end_page = pdfReader.numPages - 1
    
    #open/create csv
    save_as_filename = str(year) + '.csv'
    if os.path.isfile(save_as_filename):
        with open(save_as_filename, 'a', newline='') as file:
            writer = csv.writer(file)
    else:
        with open(save_as_filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Year", "Page", "Center_name", "Street_address", "City", \
                             "State", "Postal_code", "Phone", "Keys"])

    for i in range(start_page, end_page + 1):
        page = pdfReader.getPage(i).extractText()
        rawdata = page.split('\n')
        #delete na lines
        rawdata[:] = [l.strip() for l in rawdata if not (l.strip() == '' or l == 'KEY')]
        #clean
        if rawdata and not (rawdata == [str(i) + 'KEY']):
            data = clean_data(rawdata)
            clean_all_facility_name(data)
            write_data(data, year, i, save_as_filename)

In [393]:
run('directory_2012.pdf', 2012, 15)

In [391]:
#debug purposes
year = 2012
pagen = 424
filename = str(year)+'.csv'

pdfFileObj = open('directory_{}.pdf'.format(str(year)), 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Year", "Page", "Center_name", "Street_address", "City", "State", "Postal_code", "Phone", "Keys"])

page = pdfReader.getPage(pagen).extractText()
rawdata = page.split('\n')
#delete na lines
rawdata[:] = [l.strip() for l in rawdata if not (l.strip() == '' or l == 'KEY')]
if rawdata and not (rawdata == [str(pagen) + 'KEY']):
    data = clean_data(rawdata)
    clean_all_facility_name(data)
    write_data(data, year, pagen, filename)

True

In [362]:
s = 'SAFSA DT ACM MPD NXN VTRL NMOACBT DBT SACA TRC REBT TWFA BIA CMI MOTI ANG RELP SMONRES OP RS RL RD OD ODT OITPVTSTAG STDH JCSF PI MI CO GL VET ADM MF CJ SE WN MN HV TRMA XA DV TAYCM PEER HS NRT NSC STU TCC SSA SMHD CSAA CMHA BABA DAUT SHB SHC HIVT STDT TBS DP ACC SSD SAE TA MHS ACU SHG ADTX BDTX CDTX MDTX ODTX ICO GCO FCO MCOYAD ADLTFEMMALE'
s.replace('', ' ')

'SAF SA DT ACM MPD NXN VTRL NMOA CBT DBT SACA TRC REBT TWFA BIA CMI MOTI ANG RELP  SMON RES OP RS RL RD OD ODT OIT PVT STAG STDH JC SF PI MI  CO GL VET ADM MF CJ SE WN MN HV TRMA XA DV TAY CM PEER HS NRT NSC STU TCC SSA SMHD CSAA CMHA BABA DAUT SHB SHC HIVT STDT TBS DP ACC SSD SAE TA MHS ACU SHG ADTX BDTX CDTX MDTX ODTX ICO GCO FCO MCO YAD ADLT FEMMALE'