In [88]:
from collections import defaultdict
import json
import re

In [89]:
import numpy as np
import codecs
from matplotlib import pyplot as plt

In [90]:
pos = np.loadtxt("posinfo2.csv", delimiter=",",usecols=(2,6,7,8));
text = np.loadtxt("posinfo2.csv", dtype=np.unicode, delimiter=",",usecols=(9,));
text = np.asarray([t[4:-2] for t in text]) #Hack: np is not parsing text cols right

In [91]:

def def_column(row):
    h, tx, ty, p = row
    if tx<130:
        return "address"
    elif tx<250:
        return "names"
    elif tx<480:
        return "contact"
    else:
        return "service"


In [92]:
def isheader(row):
    h, tx, ty, p = row
    return h>15
    

In [93]:
header_pages = np.unique(pos[(np.apply_along_axis(isheader, 1, pos)), 3])

In [94]:
cols = np.apply_along_axis(def_column, 1, pos)
#list(map(print,cols))
cols

array(['address', 'contact', 'service', ..., 'service', 'service',
       'service'], 
      dtype='<U7')

In [95]:
def isValidRow(row, header_pages):
    h,tx,ty,p = row
    if ty<30: 
        return False
    elif p in header_pages and ty>500:
        return False
    else:
        return True

In [96]:
print(len(text[cols == 1] ))
print(len(text[cols == 2] ))
print(len(text[cols == 3] ))
print(len(text[cols == 4] ))

99
99
99
99


  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [97]:
def get_rows(pos, text):
    header_pages = np.unique(pos[(np.apply_along_axis(isheader, 1, pos)), 3])
    valid_rows =  np.asarray([isValidRow(row, header_pages) for row in pos])
    
    vpos = pos[valid_rows,:]
    vtext = text[valid_rows]
    entries = defaultdict(lambda : {'address': '', 'names': '', 'contact': '', 'service':''} )
    for page in np.unique(vpos[:,-1]):
        print(page)
        page_row_inds = vpos[:,-1]==page
        page_pos = vpos[page_row_inds, :]
        page_text = vtext[page_row_inds]

        cell_bottom_inds = np.logical_or(page_text == "Fax", page_text == "Fax:")
        cell_bottoms_ty = page_pos[cell_bottom_inds, -2]
        cell_bottoms_ty.sort()
        
        
        
        row_assignments = []
        for (ii,((h,tx,ty,p), tt)) in enumerate(zip(page_pos, page_text)):
            for (row_ii, bottom_ty) in enumerate(cell_bottoms_ty[::-1]):
                if ty>= bottom_ty:
                    row_assignments.append(row_ii)
                    #print("good ", tt, " at ", row_ii)
                    break 
            else:
                pass
                #print("warn: ", tt)
                #row_assignments.append(length(cell_bottoms_ty))
                
        col_assignments = np.apply_along_axis(def_column, 1, page_pos)
        
        
        for (tt, col_ass, row_ass) in zip(page_text, col_assignments, row_assignments):
            entries[(page, row_ass)][col_ass]+=tt
            
    return entries

In [98]:
entries = get_rows(pos,text)

0.0
1.0
2.0
3.0
4.0
5.0
6.0


In [99]:
def varients(alt_name):
    yield alt_name
    yield alt_name.lower()
    yield "".join(alt_name.split())
    yield "".join(alt_name.lower().split())
    yield re.sub(r'[^\w]', '', alt_name)
    yield re.sub(r'[^\w]', '', alt_name.lower())
    #Maybe we miss things with spaces and symbols?
    
    
    
def load_service_aliases():
    servise_names = json.load(open("./spec_types.json","r"))
    aliases = {}
    for canon_name, altnames in  servise_names.items():
        
        for altname in altnames:
            for varient in varients(altname):
                if len(varient) == 1:
                    continue
                    
                aliases[varient] = canon_name
        for varient in canon_name:
            if len(varient) == 1:
                continue

            aliases[varient] = canon_name
    return aliases
        

    
    SERVICE_ALIASES = load_service_aliases()

In [100]:

def NameCase(s):
    return s[0].upper() + s[1:].lower() 


def parse_address(tt):
    loc_parts = re.findall(r"[A-Z][A-Z]+", tt)
    if loc_parts==['WA']:
        #Only WA found
        loc_parts = re.findall(r"[A-Z|a-z]+(?=\sWA)", tt)
    
     
    loc_parts=filter(lambda p: p!="WA", loc_parts)
    loc_parts = map(NameCase, loc_parts)

    
    return "area", " ".join(loc_parts)


def parse_names(tt):
#    uc = [t==t.upper]

    
    names =[]
    
    namey_bits =  re.findall(r"[A-Z][A-Z]+[a-z]+", tt)
    if len(namey_bits)>0:
        for namey_bit in namey_bits:
            fistname_start = re.search("[a-z]", namey_bit).start() - 1
            last_name = NameCase(namey_bit[:fistname_start-1])
            first_name = NameCase(namey_bit[fistname_start:])
            
            names.append(first_name+" "+last_name)
    else:
        names = map(NameCase,re.findall(r"[A-Z][A-Z]+", tt))
        
    names = ["Dr " + name for name in names]
    return  "name"," ".join( names)



def parse_services(tt):
    mad_skills = set()
    for skill_varname, canon_name in SERVICE_ALIASES.items():
        if skill_varname in tt:
            mad_skills.add(canon_name)
    return  "expertise", list(mad_skills)


def parse_contact(tt):
    numbers = re.findall(r"[0-9][0-9][0-9][0-9] [0-9][0-9][0-9][0-9]", tt)
    if len(numbers) == 0: 
        return  "number",""        
    else:
        return "number", numbers[-1]


parsers = {'address': parse_address, 'names': parse_names, 'contact': parse_contact, 'service': parse_services}

def make_data_for_jvb(entry):
    jvb_format = {'specialistType': "Paediatrican"}
    
    for key, tt in entry.items():
        jvb_key, jvb_value = parsers[key](tt)
        jvb_format[jvb_key] = jvb_value
        
    return jvb_format
    
    
def make_all_data_for_jon(entries):
    data=[]
    for entry in entries.values():
        datum = make_data_for_jvb(entry)
        if len(datum['name'].strip()) > 0: 
            data.append(datum)
    return data

        

In [101]:
make_data_for_jvb(list(entries.values())[2])

{'area': 'Joondalup',
 'expertise': ['Autism Spectrum Disorder', 'Adolescents'],
 'name': 'Dr Jongelin',
 'number': '9400 9910',
 'specialistType': 'Paediatrican'}

In [102]:
with open("../../www/default-data/auto_ped.json","w") as fh:
    json.dump(make_all_data_for_jon(entries), fh, sort_keys=True, indent=4, separators=(',', ': '))


In [103]:
!ls ../../www/default-data/

auto_ped.json  auto_psych.json
