#### Imports

In [1]:
import PyPDF2
import csv
import re
import json
import os
import us
import numpy as np

# OCR
- https://github.com/sirfz/tesserocr
- https://www.geeksforgeeks.org/python-reading-contents-of-pdf-using-ocr-optical-character-recognition/
- https://pypi.org/project/tesserocr/

- Testing tesserocr

In [2]:
from tesserocr import PyTessBaseAPI, RIL
from pdf2image import convert_from_path 
import pytesseract
from PIL import Image 

In [3]:
import matplotlib.pyplot as plt

In [4]:
PDF_file = "1998/directory_1998.pdf"

In [6]:
# Store all the pages of the PDF in a variable 
pages = convert_from_path(PDF_file, 500) 

In [7]:
box1 = {'x': 0, 'y': 0, 'w': 1450, 'h': 5100}
box2 = {'x': 1450, 'y': 0, 'w': 1200, 'h': 5100}
box3 = {'x': 2600, 'y': 0, 'w': 1400, 'h': 5100} 
boxes = [box1, box2, box3]

i = 1
data = []
for i in range(12, 562): # i is the page number in PDF
    pages[i - 1].save("1998/page.jpeg", 'JPEG') 
    with PyTessBaseAPI() as api:
        api.SetImageFile("1998/page.jpeg")
        txt = ''
        for i, box in enumerate(boxes):
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            ocrResult = api.GetUTF8Text()
            txt += ocrResult
        data.append(txt)

In [8]:
#save the data just because
with open('1998/rawdata.txt', 'w') as f:
    for item in data:
        f.write("%s\n\n\n" % item)

# Write data into csv

In [9]:
abbr_keys = ["TX", "ML", "PV", "OS", "DT",\
             "OD", "OR", "IO", "ID", "IR", "RD", "RR",\
             "AD", "DD", "HV", "PW",\
             "SS", "GH", "PH", "OH", "TC", "HH", "CM", "CH", \
             "CO", "SG", "SC", "CJ",\
             "MC", "MD", "FG", "PI"]

In [10]:
def clean_keys(string_of_keys):
    return string_of_keys.replace('/', '').replace('0', 'O').replace('!', '').replace('5', 'S').\
                                     replace('8', 'S').replace('[', 'I').replace(']', 'I').replace('l', 'I').replace('1', 'I').\
                                     split()

In [11]:
def clean_data(rawdata):
    '''
    Roughly split data, returns list of list.
    Inputs:
        rawdata_from_page(list): list of items on the page that were seperated by /n
    '''
    #split erroneously connected data
    data = []
    temp = []
    i = 0
    prev_is_keys = False
    rawdata = rawdata.replace('\n\n', '\n').split('\n')
    rawdata[:] = [l.strip() for l in rawdata if not (l.strip() == '' or l == 'KEY')]
    #clear first line if it is city name
    if rawdata[0].isupper():
        rawdata = rawdata[1:]
    #split data
    while i < len(rawdata): 
        #print(rawdata[i]) ###
        as_keys = clean_keys(rawdata[i])
        if as_keys[0].isupper() and as_keys[-1].isupper() and (as_keys[0][:2] in abbr_keys) and (rawdata[i-1][0] == '(' or prev_is_keys):
            if prev_is_keys:
                temp[-1] += ' ' + ' '.join(as_keys)
            else:
                temp.append(' '.join(as_keys))
            prev_is_keys = True
        else:
            if prev_is_keys:
                data.append(temp)
                temp = []
            if not (rawdata[i].isupper() and prev_is_keys):
                temp.append(rawdata[i])
            prev_is_keys = False
        i += 1
    data.append(temp)
    return data

In [50]:
address_indicators = ['Box', 'Road', 'Highway', 'Route', 'Building']

def write_data(data, year, pagen, writer, at_row):
    '''
    write data into right columns
    '''
    #loop over centers
    for fc in data:
        if fc: #if fc is not empty
            if len(fc) > 10: #if the data likely wrongly splitted
                with open('1998/warning.txt', 'a') as f:
                    f.write("\n{}".format(at_row))
            at_row += 1        
            print(fc, '\n')
            
            i = 1
            data_holder = [year, pagen, fc[0]]
            #Center name
            while True:
                #the next line is street address
                if fc[i][0].isnumeric() or [i for i in fc[i].split() if i in address_indicators]\
                or fc[i].split()[-1].replace(' ', '').replace('—', '-').replace('-', '').replace('[', '1').replace(']', '1').replace('I', '1')\
                .isnumeric(): #the next line is state city zip
                    break
                data_holder[-1] += ' ' + fc[i]
                i += 1
            #street_address
            data_holder.append('')
            while True:
                if (',' in fc[i]):
                    city, state_zip = fc[i].split(',', 1)
                    break
                if ('.' in fc[i]) and not ('Box' in fc[i]) and not ('U.S.' in fc[i]) and not ('US.' in fc[i]):
                    city, state_zip = fc[i].split('.', 1)
                    break
                as_street_address = fc[i].split()
                if as_street_address[-1].isnumeric() and as_street_address[-2].isupper():
                    city = ' '.join(as_street_address[0:-2])
                    state_zip = ' ' + ' '.join(as_street_address[-2:])
                    break
                data_holder[-1] += ' ' + fc[i]
                i += 1
            data_holder[-1] = data_holder[-1].strip()
            #city, state, zip
            data_holder.append(city) 
            data_holder.append(state_zip[1:3])
            data_holder.append(state_zip[3:].replace(' ', '').replace('—', '-').replace('[', '1').replace(']', '1').replace('I', '1'))
            i += 1
            #phone
            data_holder.append(' '.join(fc[i:-1]))
            #keys
            keys = []
            for key in fc[-1].split():
                if len(key) == 3:
                    if key[1:] in abbr_keys:
                        keys.append(key[1:])
                    else:
                        keys.append(key[:2])
                if len(key) == 4:
                    keys.append(key[:2])
                    keys.append(key[2:])
                else:
                    keys.append(key)
            data_holder.append(' '.join(keys))
            #print(data_holder)
            writer.writerow(data_holder)
    return at_row ###

In [51]:
year = 1998
filename = "{0}/{0}.csv".format(year)
with open(filename, 'w', newline = '') as file:
    writer = csv.writer(file)
    writer.writerow(["Year", "Page", "Center_name", "Street_address", "City", "State", "Postal_code", "Phone", "Keys"])
    pagen = 12
    at_row = 2
    for page in data:
        print('---------', pagen, '---------')
        page_cont = clean_data(page)
        at_row = write_data(page_cont, year, pagen, writer, at_row)
        pagen += 1

--------- 12 ---------
['Lighthouse of Tallapoosa County Inc', '36 Franklin Street', 'Alexander City, AL 35010', '(205)234-4894', 'TX RR HH'] 

['Anniston Fellowship House Inc', '106 East 22nd Street', 'Anniston, AL 36201', '(256)236-7229', 'TX RR HH CJ'] 

['Calhoun/Clebumc Mental Health Center', 'New Directions', '331 East 8th Street', 'Anniston, AL 36202', '(256)236-3403', 'Hotlines:', '(256)236-3403', '(256)236-8003', 'TX PV OS OR IO AD DD SS CM'] 

['Alcoholism Recovery Services Inc', '2701 Jefferson Avenue SW', 'Birmingham, AL 3521 1', '(205)923-6552', 'TX PV OS DT OR IO RD RR HV SS OH MD'] 

['Aletheia House', '201 Finley Avenue West', 'Birmingham, AL 35204', '(205)324-6502', 'TX OS OR IO SS MD'] 

['Birmingham Health Care for', 'The Homeless', '712 25th Street North', 'Birmingham. AL 35203', '(205 )439-72 1 6', 'TX PV OS OR IO SS FG'] 

['Bradford Health Services', 'Birmingham Regional Office Jefferson', '631 Beacon Parkway West', 'Suite 21]', 'Birmingham, AL 35209', '(800)293-

IndexError: list index out of range

In [29]:
errorpagen = 143 #33, 111, 143

In [30]:
#clean_data(data[errorpagen-12])

In [31]:
#data[errorpagen-12].replace('\n\n', '\n').split('\n')

In [32]:
data[errorpagen-12]

'TAMUNIN G\n\nDept of Mental Health and Subs: Abuse\nSubstance Abuse Drug and Alcohol Prev\n790 Gov Carlos G Camacho Road\nTamuning, GU 96911\n\n(671)647-5330 x 5445\n\nHotline:\n\n(671)647-8833\n\nTX PV 08/ OD OR 10/ AD DD HV/ SS P\nTC CM ~\n\nGUAM\n\n'

In [33]:
data[errorpagen-12] = 'TAMUNIN G\n\nDept of Mental Health and Subs: Abuse\nSubstance Abuse Drug and Alcohol Prev\n790 Gov Carlos G Camacho Road\nTamuning, GU 96911\n\n(671)647-5330 x 5445\n\nHotline:\n\n(671)647-8833\n\nTX PV 08/ OD OR 10/ AD DD HV/ SS P\nTC CM'

In [34]:
#save the data just because
with open('1998/rawdata.txt', 'w') as f:
    for item in data:
        f.write("%s\n\n\n\n\n\n" % item)