#### Author: Arash Farahani
#### Date: 8/1/2017

### Usage
This program goes over all the Manhattan GBATed DOL-QCEW files and checks the error sheet in Google Maps API to retreive Formatted Addresses and Lat-Lon. The process is repeated twice: first with the original address, and next with the Trade/Legal name added to the address. 

#### The next steps
1. Check the final addresses manually: do the Google Formatted Addresses make sense?
2. Run through GBAT again using a final address.

In [274]:
import logging
logging.basicConfig(filename='Y:/ython Procedures/QCEW_Address_Check.log',level=logging.DEBUG)

import googlemaps
from datetime import datetime
import pandas as pd
import re, os
import configparser

config = configparser.RawConfigParser()
config.read('API_Keys.cfg')
google_api_key = config.get('Google', 'QCEW_API_Key')

gmaps = googlemaps.Client(key=google_api_key)

#test
geocode_result=gmaps.geocode("BETH ISRAEL MEDICAL CENTER, 1ST AVE AT 16TH ST, New York, NY 10003")
print(geocode_result[0]['geometry']['location']['lat'],geocode_result[0]['geometry']['location']['lng'])
#print(geocode_result)


40.7333526 -73.9823138


In [257]:
directory = os.fsencode("Z:/EAD/DOL Data/QCEW to RPAD address merge/forgbat/Manhattan/")

# these files were already processed by google. I don't want to pay for processig them again, so I'll skip them.
processed_files=['forgbatmn01.xlsx','forgbatmn05.xlsx']

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".xlsx") and '_GoogOut_' not in filename and filename not in processed_files:
        print(os.path.join(directory,filename.encode()))
        adds=pd.ExcelFile(os.path.join(directory,filename.encode()).decode())
        print(adds.sheet_names)
        df = adds.parse('GBATErr')
        df = df.drop(['BadRecordId','GRC','ReasonCode'], 1)   #drop the GBAT Err Columns
        print(filename, ' was loaded')
        df.head()
        
        
        df.fillna('')
        # trade2 is trade name when available, and legal name when not.
        df['trade2'] = ""
        df.loc[df.trade.fillna('')!= '', 'trade2'] = df.trade 
        df.loc[df.trade.fillna('')== '', 'trade2'] = df.legal
        # trade(or legal) name + Original Address + City, State, Zip
        df['NameAddress']= df.trade2.fillna('')+', '+df.originaladdress.fillna('')+', New York, NY '+df.pzip.apply(str)
        df['Address']= df.originaladdress.fillna('')+', New York, NY '+df.pzip.apply(str)
        df['NameAddress'].head()
        
        #df.reset_index(inplace=True)
        df['Gformatted_address0']= ""
        df['Glat0']= 0
        df['Glon0']= 0
        df['GPartial0']= False
        df['Gtypes0']=""
        df['Gformatted_address1']= ""
        df['Glat1']= 0
        df['Glon1']= 0
        df['GPartial1']= False
        df['Gtypes1']=""
        i=-1
        for var in ['Address','NameAddress']:
            print('Started checking variable ',var)
            i+=1
            for index, row in df.iterrows():
                if index<=len(df.index):
                    geocode_result=gmaps.geocode(row[var])
                    if len(geocode_result)>0:
                        if 'partial_match' in geocode_result:
                            df.loc[df.index == index,['GPartial'+ str(i),'Gformatted_address'+ str(i), 'Glat'+ str(i), 'Glon'+ str(i), 'Gtypes'+ str(i)]]=(
                                                 geocode_result[0]['partial_match'], geocode_result[0]['formatted_address']
                                                 ,geocode_result[0]['geometry']['location']['lat'],geocode_result[0]['geometry']['location']['lng'],str(geocode_result[0]['types']))
                        else:
                            df.loc[df.index == index,['Gformatted_address'+ str(i), 'Glat'+ str(i), 'Glon'+ str(i), 'Gtypes'+ str(i)]]=(
                                geocode_result[0]['formatted_address'],geocode_result[0]['geometry']['location']['lat']
                                ,geocode_result[0]['geometry']['location']['lng'],str(geocode_result[0]['types']))
                    else:
                        df.loc[df.index == index,['Gformatted_address'+ str(i), 'Glat'+ str(i), 'Glon'+ str(i), 'Gtypes'+ str(i)]]=(
                                                                                                    'Not Found',0,0,str([0]))
            # Prepare for Second Run of GBAT
            df['Gzip'+ str(i)]= 0
            pat2= r".*([0-9]{5}).*"
            repl0 = lambda m: m.group(1)
            df['Gzip1']=df['Gformatted_address'+ str(0)].str.replace(pat, repl0)
            
            pat= r"([0-9\-]+)(.*?)(,.*)"
            repl1 = lambda m: m.group(1)
            repl2 = lambda m: m.group(2)
            df['Gnumber'+ str(i)]=df['Gformatted_address'+ str(i)].str.replace(pat, repl1)
            df['Gstreet'+ str(i)]=df['Gformatted_address'+ str(i)].str.replace(pat, repl2)
            df['boro']=1
            df[['Gnumber'+ str(i),'Gformatted_address'+ str(i)]].head(10)
            
        # Save The Results
        df['same_GF_Address']=(df['Gformatted_address1']==df['Gformatted_address0'])
        pd.ExcelFile(os.path.join(directory,filename.encode()).decode())
        writer = pd.ExcelWriter(os.path.join(directory,('_GoogOut_'+filename).encode()).decode())
        df.to_excel(writer,'Sheet1')
        print('Processed data and saved: ', '_GoogOut_'+filename )
        #df2.to_excel(writer,'Sheet2')
        writer.save()
    else:
        continue


## SOME OTHER STUFF
# Look up an address with reverse geocoding
#reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
#now = datetime.now()
#directions_result = gmaps.directions("Sydney Town Hall",
#                                     "Parramatta, NSW",
#                                     mode="transit",
#                                     departure_time=now)

## Some Other Tests
#geocode_result=gmaps.geocode("BETH ISRAEL MEDICAL CENTER, 1ST AVE AT 16TH ST, New York, NY 10003")
#print(geocode_result[0]['geometry']['location']['lat'],geocode_result[0]['geometry']['location']['lng'])
#print(geocode_result[0]['partial_match'])
#print(geocode_result[0])



b'Z:/EAD/DOL Data/QCEW to RPAD address merge/forgbat/Manhattan/forgbatmn01.xlsx'
['forgbatmn01', 'GBATOut', 'GBATErr']
forgbatmn01.xlsx  was loaded
Started checking variable  Address
Started checking variable  NameAddress
Processed data and saved:  _GoogOut_forgbatmn01.xlsx
b'Z:/EAD/DOL Data/QCEW to RPAD address merge/forgbat/Manhattan/forgbatmn05.xlsx'
['forgbatmn05', 'GBATOut', 'GBATErr']
forgbatmn05.xlsx  was loaded
Started checking variable  Address
Started checking variable  NameAddress
Processed data and saved:  _GoogOut_forgbatmn05.xlsx
b'Z:/EAD/DOL Data/QCEW to RPAD address merge/forgbat/Manhattan/forgbatmn07.xlsx'
['forgbatmn07', 'GBATOut', 'GBATErr', 'GBATExt']
forgbatmn07.xlsx  was loaded
Started checking variable  Address


TransportError: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')",))

In [256]:
print(len(df.index))

2588


In [273]:
df['zip1'].head()

0    11225
1    10035
2    10009
3    10025
4    10019
Name: zip1, dtype: object