## Location Modeling

### Address Extraction, Address Parsing and Geocoding

## Imports

In [2]:
import pandas as pd
import numpy as np

import codecs, json
import requests
from bs4 import BeautifulSoup

from address_parser import Parser
import usaddress
from postal.parser import parse_address
from postal.expand import expand_address

import censusdata
from census import Census
from us import states

from geocodio import GeocodioClient
import googlemaps

In [3]:
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns',100)

### Import Combined NTU & Kaggle Tweet Dataset

In [4]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids mixed-type columns.
df_1 = pd.read_csv('./data/df_combined.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df_1.shape

(267682, 3)

In [6]:
df_1.head()

Unnamed: 0,date,id,tweet
0,2017-08-13,8.96827e+17,Gert could become a quite intense post-tropica...
1,2017-08-14,8.97013e+17,"Weather Street: Tropical Storm Harvey, Hurrica..."
2,2017-08-14,8.97087e+17,Tropical Storm #Gert intensifying. Tropical St...
3,2017-08-14,8.97088e+17,Tropical Storm #Gert intensifying. Tropical St...
4,2017-08-14,8.97088e+17,"RT YourNews15 ""Tropical Storm #Gert intensifyi..."


### Import Labeled Data CSV

In [7]:
# 'nb_labels.csv' is the CSV of labels from the Naive Bayes classification
# model in the previous notebook; its first column is used as the index.

df_nb = pd.read_csv('./data/nb_labels.csv', index_col = 0)

In [10]:
# Concatenate Tweet dataframe and labels dataframe

df = pd.concat([df_1, df_nb], axis=1)

In [11]:
df.head()

Unnamed: 0,date,id,tweet,nb_label
0,2017-08-13,8.96827e+17,Gert could become a quite intense post-tropica...,0
1,2017-08-14,8.97013e+17,"Weather Street: Tropical Storm Harvey, Hurrica...",0
2,2017-08-14,8.97087e+17,Tropical Storm #Gert intensifying. Tropical St...,0
3,2017-08-14,8.97088e+17,Tropical Storm #Gert intensifying. Tropical St...,0
4,2017-08-14,8.97088e+17,"RT YourNews15 ""Tropical Storm #Gert intensifyi...",0


In [13]:
# Check Emergency labeled value counts

df['nb_label'].value_counts()

0    266677
1      1005
Name: nb_label, dtype: int64

#### Create Filtered Dataframe for only Emergency Tweets

In [14]:
# .copy() gives df_help its own data, so columns added to it later are not
# written into a slice of df (which triggers SettingWithCopyWarning).
df_help = df[df['nb_label']==1].copy()

In [15]:
df_help.shape

(1005, 4)

### Import Texas Cities Census Data

In [18]:
# Texas cities from US CensusData Population Estimates

tx_cities = pd.read_csv('./data/geodb/texas_cities.csv', skiprows=1)

In [19]:
tx_cities.head()

Unnamed: 0,Id,Id2,Geography,"April 1, 2010 - Census","April 1, 2010 - Estimates Base",Population Estimate (as of July 1) - 2010,Population Estimate (as of July 1) - 2011,Population Estimate (as of July 1) - 2012,Population Estimate (as of July 1) - 2013,Population Estimate (as of July 1) - 2014,Population Estimate (as of July 1) - 2015,Population Estimate (as of July 1) - 2016,Population Estimate (as of July 1) - 2017,Population Estimate (as of July 1) - 2018
0,1620000US4800100,4800100,"Abbott city, Texas",356,361,362,362,361,358,354,354,357,363,367
1,1620000US4800160,4800160,"Abernathy city, Texas",2805,2812,2818,2833,2822,2796,2743,2725,2747,2745,2724
2,1620000US4801000,4801000,"Abilene city, Texas",117063,117512,117806,118749,119852,119792,120647,121694,121856,122210,122999
3,1620000US4801108,4801108,"Ackerly city, Texas",220,220,220,219,219,225,228,230,231,226,227
4,1620000US4801240,4801240,"Addison town, Texas",13056,13062,13091,13798,15199,15437,15501,15587,15516,15497,15945


In [20]:
# Extract Geography-Cities Column to a list

cities = tx_cities['Geography']

In [21]:
cities.head()

0       Abbott city, Texas
1    Abernathy city, Texas
2      Abilene city, Texas
3      Ackerly city, Texas
4      Addison town, Texas
Name: Geography, dtype: object

In [22]:
# Clean City name to remove city/town suffix and State

# Drop the last two words of each name (e.g. "city, Texas") and lower-case the rest
tx_city_list = [' '.join(name.split()[:-2]).lower() for name in cities]

### Scrape Houston Streets from Geographic.org 

In [32]:
# Define function to scrape street names from Geographic.org
# with state and city parameters


def scrape_streets(state, city):
    """Scrape the street names listed for a city on geographic.org.

    Parameters
    ----------
    state : str
        State path segment of the page URL (e.g. 'tx').
    city : str
        City path segment of the page URL (e.g. 'houston').

    Returns
    -------
    list
        The ``.string`` of every anchor inside the page's
        ``<span class="listspan">`` element, except the first anchor.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``<span class="listspan">`` element.
    """
    base = 'https://geographic.org/streetview/usa/'
    suffix = '.html'
    url = base + state + '/' + city + suffix

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status() surfaces 4xx/5xx responses instead of parsing them.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    listing = soup.find('span', {'class': 'listspan'})
    if listing is None:
        raise ValueError(f'No street listing found at {url}')

    # The first anchor is skipped, exactly as the original loop did.
    # NOTE(review): presumably it is not a street link -- confirm against the page.
    return [tag.string for tag in listing.find_all('a')[1:]]

In [33]:
# Scrape streets for Houston, Texas

streets = scrape_streets('tx', 'houston')

In [34]:
# 12.5k streets from Google street view

len(streets)

12561

In [35]:
# Lower case street names

streets_low = [st.lower() for st in streets]

In [36]:
streets_low[100:105]

['adelle st', 'adina springs ln', 'adirondack dr', 'adler dr', 'adler lake dr']

In [37]:
# Drop Duplicate Street Names

# De-duplicate the lower-cased names and keep them in sorted order
streets_low = sorted(set(streets_low))

In [39]:
# Streets to exclude from the lookup list.
# This street inside the airport causes some issues during lookup.

st_to_drop = ['a ave - william p. hobby airport (hou)']
streets_low = [x for x in streets_low if x not in st_to_drop]

In [40]:
# Find Streets that have single name duplicate with 
# streets that have street suffix

# A one-word name is dropped when the next entry of the sorted list starts
# with that same word (the same street listed again with a suffix).
# Pairing each entry with its successor stops before the last entry, so the
# lookup of the "next" street can no longer run past the end of the list.
st_to_drop = []
for current, following in zip(streets_low, streets_low[1:]):
    words = current.split()
    if len(words) == 1 and words == following.split()[:1]:
        st_to_drop.append(current)

In [41]:
# Drop Street Duplicates from above

streets_low = [x for x in streets_low if x not in st_to_drop]

In [42]:
# Check how many street names remaining

len(streets_low)

12298

In [44]:
# Function to Process Street Name List for Lookup 
# Reducing Street Name down to 1 or 2 words,without suffixes except for short words
# Based on research, users only include suffix for short words

def street_processing(street_list):
    """Reduce full street names to the short forms used for tweet lookup.

    The form kept depends on the number of whitespace-separated words:

    * one word: kept as is, unless it starts with a digit, in which case
      ' st' is appended (e.g. '1st' -> '1st st');
    * two words: both words are kept when the first word is shorter than six
      characters or starts with a digit; otherwise only the first word is kept;
    * three or more words: the first two words are kept.

    Blank or whitespace-only entries are skipped instead of raising IndexError.

    Parameters
    ----------
    street_list : iterable of str
        Street names to shorten.

    Returns
    -------
    list of str
        One shortened name per non-blank input, in input order. Duplicates
        are not removed.
    """
    main_search = []                     # street names used for lookup

    for street in street_list:
        words = street.split()
        if not words:                    # nothing to look up
            continue

        first_word = words[0]
        starts_with_digit = first_word[0].isdigit()

        if len(words) == 1:
            # numbered one-word streets get an explicit ' st' suffix
            main_search.append(street + ' st' if starts_with_digit else street)
        elif len(words) == 2 and len(first_word) >= 6 and not starts_with_digit:
            # a long, non-numeric first word is kept on its own
            main_search.append(first_word)
        else:
            main_search.append(first_word + ' ' + words[1])

    return main_search

In [45]:
# Execute Street Processing Function

street_lookup = street_processing(streets_low)

In [47]:
## Remove Duplicates from newly cleaned street list

street_lookup = list(set(street_lookup))
street_lookup.sort()

### Emergency Tweets for Address Extraction Testing

Test tweets were manually pulled directly from Twitter, for known Harvey emergency related tweets on 
City of Houston Twitter account during the hurricane.  <br>
<br>
These Tweets are not in our dataset.

In [51]:
twt_1 = ('My 83 y.o. Parents in imminent danger at 4922 Loch Lomond #Meyerland. Water knee deep inside home.'
         ' Mom=heart condition. Dad=Alzheimer\'s')

twt_2 = ('We are not at inches, we are at 4-5 feet in this neighborhood. Wood Shadows II 11607 Lafferty Oaks')

twt_3 = ('Plz help!! brother &family stuck in Dickson. 4901 38th street'
         ' dickinson, tx 77539 his name is Rey (409) 999-0010')

twt_4 = ('911 and coast guard ring busy. 4923 Braesvalley 77096.  4 adults, one disabled teen. on 2nd floor'
         ' Elderly couple across street trapped.')

twt_5 = ('We\'re at the Redford Square Apartments 9406 Redford Street off 45 and Edgebrook'
         ' we have a new born baby please help us the first floor Apart')

twt_6 = ('412 Texas St. South Houston TX 77587')

twt_7 = ('Apt 2105 at Meyer Forest Apts, she is handicapped and she cant get out, this is in Meyerland')

#twt_8 = 'How \'bout 12"?  Is 12" in a home a danger to an elderly person with limited mobility? '\
#         '2608 Martin Street, @PasadenaTX @PasadenaPD'
    
twt_8 = ('How \'bout 12"?  Is 12" in a home a danger to an elderly person with limited mobility?'
         ' 2608 Martin Street, PasadenaTX PasadenaPD')

In [52]:
# combine tweets into list

test_twts = [twt_1, twt_2,twt_3, twt_4, twt_5, twt_6, twt_7, twt_8]

In [53]:
# lower case tweets

twts_low = [twt.lower() for twt in test_twts ]

In [None]:
# Common words after numbers that are not addresses, will be used to filter tweets with
# digits that are not address components

num_excluders = ['feet', 'inches', 'people', 'adults']



### Testing: Lookup Street Name and Extract Address

In [55]:
# Test Tweet 2

# Plain substring test: a street name also matches inside a longer word
# (e.g. 'shadow' inside 'shadows').
matches = []
twt_2_low = twt_2.lower()        # lower-case once instead of once per street
for street in street_lookup:
    if street in twt_2_low:
        matches.append(street)
        print(street)

lafferty
lafferty oaks
shadow


In [60]:
# Extracting Street Number

twt_2_tok = twt_2.lower().split()

# Position of the first matched street name among the tokens
match_idx = twt_2_tok.index(matches[0])

# The street number is taken from the all-digit token right before the street
# name. Requiring match_idx > 0 keeps index -1 from wrapping around to the last
# token, and the None default keeps the display below from hitting an
# undefined name when no number is found.
street_num = None
if match_idx > 0 and twt_2_tok[match_idx - 1].isdigit():
    street_num = twt_2_tok[match_idx - 1]

street_num

'11607'

In [66]:
# Combine above extractions

min_address = []
for match in matches:
    words = match.split()
    # Only multi-word street names are used, and only when their first word is
    # a whole token of the tweet (substring-only hits would make .index() raise).
    if len(words) > 1 and words[0] in twt_2_tok:
        match_idx = twt_2_tok.index(words[0])
        # match_idx > 0 keeps index -1 from wrapping around to the last token
        if match_idx > 0 and twt_2_tok[match_idx - 1].isdigit():
            street_num = twt_2_tok[match_idx - 1]
            min_address.append(street_num +' '+match)


In [67]:
min_address

['11607 lafferty oaks']

Address Extraction method above requires additional development to extract the full address.  Because it was not fully complete, ultimately it was not used.  <br>
<br>
For this method to be effective, we will need to include street names for other cities and townships outside of Houston, which are not on the current street lookup list.  Our dataset contains many tweets from Port Arthur and Dickinson, both outside of Houston. Without street lists for those places, we would overlook their addresses during the lookup and filter step.  <br>
<br> 

### Filter Emergency (df_help) dataframe 

In [159]:
df_help.head(10)

Unnamed: 0,date,id,tweet,nb_label
6624,2017-08-24,9.0074e+17,Please don't forget about your pets during Tro...,1
13617,2017-08-27,9.01916e+17,Here's how you can help with #Tropical_Storm_H...,1
14131,2017-08-27,9.01931e+17,Come on #tropicalstorm #Harvey u need to go!!!...,1
27074,2017-08-30,9.0289e+17,Did you forget Harvey is a Tropical Storm and ...,1
33222,2017-08-25,9.01e+17,Stay Safe #Texas Evacuate if you must & don't ...,1
33564,2017-08-25,9.01e+17,Better hurry,1
33648,2017-08-25,9.01e+17,"In your runs to get food and water, make sure ...",1
33848,2017-08-25,9.01e+17,"I?ve had stronger hurricanes at Pat O? Briens,...",1
33984,2017-08-25,9.01e+17,#HurricaneHarvey pls come thru. I'm tryna slee...,1
34346,2017-08-25,9.01e+17,#TRUMP will be there #HurricaneHarvey #TEXAS e...,1


In [160]:
df_help.shape

(1005, 4)

### Check Help Tweets for Potential Address


In [69]:
# Define 'has number' function to test if string has a digit

def has_num(input_str):
    """Return True if at least one character of ``input_str`` is a digit."""
    for char in input_str:
        if char.isdigit():
            return True
    return False

In [70]:
# Check for Street Name match and presence of a digit in tweet
# Create new columns with Binary Identifier 1 for match & list of street matches

matches = []
bin_match = []
for twt in df_help['tweet']:
    twt_low = twt.lower()
    # Lower-casing and the digit check do not depend on the street, so they are
    # done once per tweet; a tweet without any digit cannot match at all.
    if has_num(twt_low):
        st_match = [street for street in street_lookup if street in twt_low]
    else:
        st_match = []
    matches.append(st_match)
    bin_match.append(1 if st_match else 0)

# assign() returns a new DataFrame, so the new columns are not written into a
# slice of another frame (which raises SettingWithCopyWarning).
df_help = df_help.assign(
    st_matches=pd.Series(matches, index=df_help.index),
    bin_match=pd.Series(bin_match, index=df_help.index),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [71]:
df_help.head()

Unnamed: 0,date,id,tweet,nb_label,st_matches,bin_match
6624,2017-08-24,9.0074e+17,Please don't forget about your pets during Tro...,1,[],0
13617,2017-08-27,9.01916e+17,Here's how you can help with #Tropical_Storm_H...,1,"[harvey, tropical]",1
14131,2017-08-27,9.01931e+17,Come on #tropicalstorm #Harvey u need to go!!!...,1,[],0
27074,2017-08-30,9.0289e+17,Did you forget Harvey is a Tropical Storm and ...,1,[],0
33222,2017-08-25,9.01e+17,Stay Safe #Texas Evacuate if you must & don't ...,1,[],0


In [72]:
# Check Street name + num matching counts

df_help['bin_match'].value_counts()

0    809
1    196
Name: bin_match, dtype: int64

In [142]:
# Create Dataframe filtered for Address partial match

# .copy() so that columns added to df_add later are written to an independent
# DataFrame rather than to a slice of df_help (avoids SettingWithCopyWarning).
df_add = df_help[df_help['bin_match']==1].copy()

In [143]:
df_add.shape

(196, 6)

## Address Parsing Pipeline

### Addresses Manually Inspected and Extracted
The addresses below were pulled for testing and mapping in the following notebook prior to completion of the parser function. These addresses were used for the demonstration of mapping functionality in the presentation.  Some, but not all, of these addresses overlap with the addresses identified in the filtered dataframe above.

In [99]:
# Parallel lists: position i of each list describes the same tweet.
idx = [265179, 267654, 267451, 267422, 265192, 265161, 265146, 265145]
address = ['9407 Cranleigh Ct. Houston 77096', '4724 Amalie St Houston','3226 Ave G',
           '8015 Serenity Court Houston, TX', '5400 Bayou Dr. Dickson, TX',
           'Big Bend Avenue, 39th St., Port Arthur','340 West 17th St. Port Arthur, TX 77640',
          '3605 Jimmy Johnson Blvd Apt. 1002 Port Arthur TX 77642']

# Fixed: a missing comma after '29.8551516' silently concatenated it with the
# next latitude into one string.
# NOTE(review): the coordinate pair in the seventh position ('29.8748434',
# '-93.95233739...') appeared twice in a row, leaving lat/lng one entry longer
# than idx/address; the repeat was removed so all four lists line up. Confirm
# against the geocoder output.
lat = ['29.6784058', '29.8551516', '29.465968099', '29.6948069', '29.4494273', '29.9147006',
       '29.8748434', '29.945638']
lng = ['-95.4633444', '-95.323505099', '-95.059360599', '-95.422081499', '-95.0609667', '-93.9485301',
       '-93.9523373999999', '-93.9755919999']

# Guard against the lists drifting out of step again
assert len(idx) == len(address) == len(lat) == len(lng)


In [101]:
tweet_1 = df.loc[265179, 'tweet']
tweet_2 = df.loc[265145, 'tweet']
tweet_3 = df.loc[267654, 'tweet']
tweet_4 = df.loc[267451, 'tweet']
tweet_5 = df.loc[267422, 'tweet']
tweet_6 = df.loc[265161, 'tweet']
tweet_7 = df.loc[265146, 'tweet']
# Fixed: tweet_8 repeated index 265145, which is already loaded as tweet_2.
# NOTE(review): 265192 is the only index in `idx` that had no tweet variable,
# so it is presumably the intended one -- confirm.
tweet_8 = df.loc[265192, 'tweet']

#### Testing Libpostal parser on raw Extracted Tweets

In [102]:
parse_address(tweet_1)

[('@houstonpolice #harveyrescue #harveysos', 'house'),
 ('9407', 'house_number'),
 ('cranleigh ct.', 'road'),
 ('houston', 'city'),
 ('77096', 'postcode'),
 ("please family of 5 need help. they're trapped on the roof #", 'house'),
 ('houston', 'city')]

In [103]:
parse_address(tweet_2)

[('@harveyrescue 3 adults and 4 children need help.', 'house'),
 ('3605', 'house_number'),
 ('jimmy johnson blvd', 'road'),
 ('apt. 1002', 'unit'),
 ('port arthur', 'city'),
 ('tx', 'state'),
 ('77642', 'postcode'),
 ('#harveysos', 'house')]

In [104]:
parse_address(tweet_3)

[("i've got a scared friend at 4724 amalie st. please send help @khou @houstonpolice @abc13houston @houstontx",
  'house')]

In [105]:
parse_address(tweet_4)

[('my fil frank emmitte is trapped', 'house'),
 ('3226', 'house_number'),
 ('ave g', 'road'),
 ("with 4-5' water. he is elderly cannot walk well. pls help retweet send info #hurricaneharvery",
  'house')]

### Address Parser Functions to Manipulate output

In [106]:
# Function uses libpostal address parser and reformats output into a dictionary

def libpost_parser(tweet):
    """Run libpostal's ``parse_address`` on a tweet and return the result as a dict.

    ``parse_address`` yields (value, label) pairs; the returned dict maps each
    label to its value. When a label occurs more than once, the last value wins.
    """
    return {label: component for component, label in parse_address(tweet)}

In [107]:
# Test function on several tweets

parsed_address = libpost_parser(twt_2)

In [108]:
parsed_address

{'house': 'we are not at inches we are at 4-5 feet in this neighborhood. wood shadows ii',
 'house_number': '11607',
 'road': 'lafferty oaks'}

In [109]:
libpost_parser(twt_6)

{'house_number': '412',
 'road': 'texas st.',
 'city': 'south houston',
 'state': 'tx',
 'postcode': '77587'}

### Expand libpostal parser function
Process tweets in dataframe based on multiple conditionals about the validity of output from the libpostal parser


In [82]:
# Full address parser function that assesses validity of the output from libpostal parser
# based on city and street lookups, and returns a combined formatted address.

def parser_full(tweet):
    """Extract a formatted street address from a tweet, or return 'NA'.

    The tweet is parsed with libpostal's ``parse_address`` after '#', '@' and
    '/' have been replaced by spaces. The labelled components are then checked:

    * house number: used only if it is all digits and shorter than 6 characters;
    * road: used as returned by the parser;
      TODO: validate the road against a street list (requires further testing);
    * city: used only if it appears in the module-level ``tx_city_list``;
    * state: used if it is exactly 2 characters long, otherwise 'tx';
    * postcode: used only if it is exactly 5 digits.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        '[<number> ]<road>[, <city>], <state>[ <zip>]' when a road was found
        together with at least one of house number, city or postcode;
        otherwise the string 'NA'.
    """
    tweet = tweet.replace('#', ' ').replace('@', ' ').replace('/', ' ')

    # (value, label) pairs -> dict; a repeated label keeps its last value
    parsed_address = {label: value for value, label in parse_address(tweet)}

    # Without a road there is nothing to format
    street = parsed_address.get('road')
    if street is None:
        return 'NA'

    house_number = parsed_address.get('house_number', '')
    street_num = house_number if house_number.isdigit() and len(house_number) < 6 else None

    city = parsed_address.get('city')
    if city not in tx_city_list:
        city = None

    # Fixed: a parsed state that was not 2 characters long used to leave `state`
    # at its integer initial value 0, producing addresses ending in ", 0".
    # Fall back to 'tx', the same default used when no state is parsed at all.
    state = parsed_address.get('state', '')
    if len(state) != 2:
        state = 'tx'

    postcode = parsed_address.get('postcode', '')
    zipcode = postcode if postcode.isdigit() and len(postcode) == 5 else None

    # A road with no usable number, city or postcode is not treated as an address
    if street_num is None and city is None and zipcode is None:
        return 'NA'

    street_line = f'{street_num} {street}' if street_num is not None else street
    region = f'{state} {zipcode}' if zipcode is not None else state

    parts = [street_line]
    if city is not None:
        parts.append(city)
    parts.append(region)

    return ', '.join(parts)

In [85]:
# Test parser function

parser_full(twt_6)

'412 texas st., south houston, tx 77587'

In [83]:
parsed_address

{'house_number': '412',
 'road': 'texas st.',
 'city': 'south houston',
 'state': 'tx',
 'postcode': '77587'}

In [449]:
parse_address(df.loc[265164, 'tweet'])

[('forward 1 adult 8 year old w autism on top of car in garage', 'house'),
 ('2935', 'house_number'),
 ('40th st.', 'road'),
 ('port arthur', 'city'),
 ('tx', 'state'),
 ('77642', 'postcode'),
 ('sos help harvey harveysos', 'house')]

In [88]:
parser_full(df.loc[265164, 'tweet'])

'2935 40th st., port arthur, tx 77642'

In [144]:
# Apply Address Parser to entire tweet column in dataframe creating new column

df_add['formatted_addr'] = df_add['tweet'].apply(parser_full)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [155]:
# Check Dataframe for non-'NA' values

df_add[df_add['formatted_addr'] != 'NA']

Unnamed: 0,date,id,tweet,nb_label,st_matches,bin_match,formatted_addr
39830,2017-08-25,9.01e+17,#Fema claims to be on the ground ready for #Hu...,1,"[e o, harvey]",1,"1 translation, tx"
64158,2017-08-25,9.01e+17,AMERICA our fellow countrymen in Texas will ne...,1,"[country, harvey]",1,"4 once, tx"
100630,2017-08-26,9.01e+17,Volunteers: If U want 2 help with the aftermat...,1,[harvey],1,"2 u want, tx"
108894,2017-08-26,9.01e+17,Can't find the Bayport chapter on here. I went...,1,"[harvey, ina]",1,"4 hurricaneharvey, tx"
123008,2017-08-26,9.01e+17,RT this Thank you 4 making ppl aware of these ...,1,"[e o, harvey]",1,"4 rt this thank you, tx"
265145,30 Aug 2017,/KaraBranch12/status/902802041478803456,@HarveyRescue 3 adults and 4 children need hel...,1,"[arthur, harvey, johnson]",1,"3605 jimmy johnson blvd, port arthur, tx 77642"
265146,30 Aug 2017,/tegan626/status/903020721378648068,My great grandfather needs a rescue at 340 Wes...,1,"[arthur, h st, harvey, tierra]",1,"west 17th st., tx 77640"
265161,30 Aug 2017,/nyli/status/902834821449359360,#HarveySOS In Port Arthur on Big Bend Avenu...,1,"[arthur, avenue, big bend, e o, h st, harvey, ...",1,"18 bend avenue off of 39th st. approx., tx"
265164,30 Aug 2017,/MomSkelton/status/902788660038295552,"Forward\n ( 1 adult, 8 year old w/autism)\nOn...",1,"[arthur, h st, harvey]",1,"2935 40th st., port arthur, tx 77642"
265171,28 Aug 2017,/OneofTwin/status/902196100962148354,We need help in CE King Parkway Forest Subdivi...,1,"[forest, harvey, parkway, sherry, vision]",1,"8615 sherrywood drive, tx 77044"


First 6 records are not addresses but escaped through the parser. These should be labeled as 'NA'.  Of the remaining 10 addresses, 8 appear to be usable, while 2 still include a large amount of extraneous text.  The parser needs further refinement.  We can also manually modify these addresses before sending them to the Geocoder.


## Geocoding

#### Geocodio

In [146]:
# Insert your personal API Key


client = GeocodioClient('your_api_key')

In [148]:
# Test Geocodio

geo_loc2 = client.geocode('4901 38th Street,dickinson, tx')
geo_loc2['results'][0]

{'address_components': {'number': '4901',
  'predirectional': 'E',
  'street': '38th',
  'suffix': 'St',
  'formatted_street': 'E 38th St',
  'city': 'Dickinson',
  'county': 'Galveston County',
  'state': 'TX',
  'zip': '77539',
  'country': 'US'},
 'formatted_address': '4901 E 38th St, Dickinson, TX 77539',
 'location': {'lat': 29.466603, 'lng': -95.038147},
 'accuracy': 0.9,
 'accuracy_type': 'rooftop',
 'source': 'Galveston'}

Geocodio is nice in that it gives us an accuracy score, which indicates the level of confidence in the accuracy of the coordinates.

In [114]:
# Parse Geocodio dictionary for Accuracy

geo_loc2['results'][0]['accuracy']

0.9

In [115]:
# Parse Geocodio Dictionary for Lat/Long

geo_loc2['results'][0]['location']

{'lat': 29.466603, 'lng': -95.038147}

In [116]:
# Parse Geocodio Dictionary for Formatted Address

geo_loc2['results'][0]['formatted_address']

'4901 E 38th St, Dickinson, TX 77539'

In [117]:
# Testing Geocodio without City or zip

client.geocode('4922 Loch Lomond, tx')


{'input': {'address_components': {'number': '4922',
   'city': 'Loch Lomond',
   'state': 'TX',
   'country': 'US'},
  'formatted_address': '4922, Loch Lomond, TX'},
 'results': [],

Geocodio does not perform well without a city or zip code. So we likely will end up using Google.

In [None]:
# Test how Geocodio will handle the 'NA' values

# client.geocode('NA')

Geocodio's Error indicates that the city or zip is needed to geocode the address. 

#### GoogleMaps

In [120]:
# Insert your personal API key

gmaps = googlemaps.Client(key='your_api_key')

In [121]:
# Testing Google Geocoder without city or zip
# Google does a much better job at identifying address without city or zip.

goog_geo = gmaps.geocode('4922 loch lomond, tx')


In [122]:
# Format Lat and Long

goog_geo[0]['geometry']['location']

{'lat': 29.6820753, 'lng': -95.4646344}

In [123]:
# Google Formatted Address

goog_geo[0]['formatted_address']

'4922 Loch Lomond Dr, Houston, TX 77096, USA'

Google performs better on the above address without a city or zip. It can process the address with only street number, name and state.

In [124]:
# Test how Google will handle 'NA' values

gmaps.geocode('NA')

[{'address_components': [{'long_name': 'United States',
    'short_name': 'US',
    'types': ['country', 'political']}],
  'formatted_address': 'United States',
  'geometry': {'bounds': {'northeast': {'lat': 71.5388001, 'lng': -66.885417},
    'southwest': {'lat': 18.7763, 'lng': 170.5957}},
   'location': {'lat': 37.09024, 'lng': -95.712891},
   'location_type': 'APPROXIMATE',
   'viewport': {'northeast': {'lat': 49.38, 'lng': -66.94},
    'southwest': {'lat': 25.82, 'lng': -124.39}}},
  'place_id': 'ChIJCzYy5IS16lQRQrfeQ5K5Oxw',
  'types': ['country', 'political']}]

#### Run Address through Google Geocoder

In [149]:
# Clean corrupt formatted address that Google Geocoder cannot process

df_add.loc[239885, 'formatted_addr'] = 'NA'

In [170]:
# Run Non Null Address through Google Geocoder and Create new columns

def goog_geocoder(df, col, client=None):
    """Geocode the addresses in one DataFrame column with the Google geocoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the addresses.
    col : str
        Name of the column with the address strings. The string 'NA' marks a
        row without an address; such rows are not sent to the geocoder.
    client : optional
        Object with a ``geocode(address)`` method returning a list of results.
        When omitted, a ``googlemaps.Client`` is created with the placeholder
        key 'your_api_key' (replace it with a personal key).

    Returns
    -------
    tuple of three lists (lat, long, goog_addr)
        One entry per row of ``df``, in row order: latitude, longitude and
        formatted address of the first geocoder result. Rows whose address is
        'NA', or for which the geocoder returns no result, get the string 'NA'
        in all three lists.
    """
    if client is None:
        client = googlemaps.Client(key='your_api_key')

    lat = []
    long = []
    goog_addr = []

    for address in df[col]:
        results = client.geocode(address) if address != 'NA' else []

        # An empty result list used to raise IndexError on results[0];
        # it is now reported as 'NA', like a missing address.
        if results:
            best = results[0]
            lat.append(best['geometry']['location']['lat'])
            long.append(best['geometry']['location']['lng'])
            goog_addr.append(best['formatted_address'])
        else:
            lat.append('NA')
            long.append('NA')
            goog_addr.append('NA')

    return lat, long, goog_addr

In [154]:
df_add.shape

(196, 7)

In [171]:
# Run Google Geocoder to create latitude, longitude, and google address variables

lat, lon, goog_addr = goog_geocoder(df_add, 'formatted_addr')

In [174]:
# Create new columns for the three variables

# assign() returns a new DataFrame with the three columns added, so nothing is
# written into a slice of another frame (which raises SettingWithCopyWarning).
df_add = df_add.assign(
    goog_lat=pd.Series(lat, index=df_add.index),
    goog_lon=pd.Series(lon, index=df_add.index),
    goog_addr=pd.Series(goog_addr, index=df_add.index),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [178]:
# View Dataframe for non-null Google generated latitudes

df_add[df_add['goog_lat'] != 'NA']

Unnamed: 0,date,id,tweet,nb_label,st_matches,bin_match,formatted_addr,goog_lat,goog_lon,goog_addr
39830,2017-08-25,9.01e+17,#Fema claims to be on the ground ready for #Hu...,1,"[e o, harvey]",1,"1 translation, tx",32.7579,-97.3246,"Fort Worth, TX 76102, USA"
64158,2017-08-25,9.01e+17,AMERICA our fellow countrymen in Texas will ne...,1,"[country, harvey]",1,"4 once, tx",29.9557,-95.5864,"12525 Jones Rd, Houston, TX 77070, USA"
100630,2017-08-26,9.01e+17,Volunteers: If U want 2 help with the aftermat...,1,[harvey],1,"2 u want, tx",29.986,-95.1624,"18455 W Lake Houston Pkwy #140, Humble, TX 773..."
108894,2017-08-26,9.01e+17,Can't find the Bayport chapter on here. I went...,1,"[harvey, ina]",1,"4 hurricaneharvey, tx",31.9686,-99.9018,"Texas, USA"
123008,2017-08-26,9.01e+17,RT this Thank you 4 making ppl aware of these ...,1,"[e o, harvey]",1,"4 rt this thank you, tx",31.9686,-99.9018,"Texas, USA"
265145,30 Aug 2017,/KaraBranch12/status/902802041478803456,@HarveyRescue 3 adults and 4 children need hel...,1,"[arthur, harvey, johnson]",1,"3605 jimmy johnson blvd, port arthur, tx 77642",29.9456,-93.9756,"3605 Jimmy Johnson Blvd, Port Arthur, TX 77642..."
265146,30 Aug 2017,/tegan626/status/903020721378648068,My great grandfather needs a rescue at 340 Wes...,1,"[arthur, h st, harvey, tierra]",1,"west 17th st., tx 77640",29.8732,-93.9542,"W 17th St, Port Arthur, TX 77640, USA"
265161,30 Aug 2017,/nyli/status/902834821449359360,#HarveySOS In Port Arthur on Big Bend Avenu...,1,"[arthur, avenue, big bend, e o, h st, harvey, ...",1,"18 bend avenue off of 39th st. approx., tx",31.9686,-99.9018,"Texas, USA"
265164,30 Aug 2017,/MomSkelton/status/902788660038295552,"Forward\n ( 1 adult, 8 year old w/autism)\nOn...",1,"[arthur, h st, harvey]",1,"2935 40th st., port arthur, tx 77642",29.9157,-93.9492,"2935 40th St, Port Arthur, TX 77642, USA"
265171,28 Aug 2017,/OneofTwin/status/902196100962148354,We need help in CE King Parkway Forest Subdivi...,1,"[forest, harvey, parkway, sherry, vision]",1,"8615 sherrywood drive, tx 77044",29.8507,-95.2063,"8615 Sherrywood Dr, Houston, TX 77044, USA"


Google generated an address with coordinates for everything we throw at it, even though we know that at least four of the addresses are not actual addresses.

### Reduce Dataframe and Export to CSV

In [179]:
# Keep only geocoded rows; .copy() makes the result an independent DataFrame so
# later modifications do not raise SettingWithCopyWarning.
df_add = df_add[df_add['goog_lat'] != 'NA'].copy()

In [181]:
idx_to_drop = [39830, 64158, 100630, 108894, 123008, 265161, 265210]

In [182]:
# Rebind instead of dropping in place: an in-place drop on a filtered frame
# raises SettingWithCopyWarning.
df_add = df_add.drop(idx_to_drop, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [183]:
cols_to_drop = ['date', 'id', 'st_matches', 'formatted_addr']

In [184]:
# Rebind rather than mutate in place, so the frame is not modified behind the
# back of earlier cells' outputs.
df_add = df_add.drop(columns=cols_to_drop)

In [187]:
# Display Dataframe to export

df_add

Unnamed: 0,tweet,nb_label,bin_match,goog_lat,goog_lon,goog_addr
265145,@HarveyRescue 3 adults and 4 children need hel...,1,1,29.9456,-93.9756,"3605 Jimmy Johnson Blvd, Port Arthur, TX 77642..."
265146,My great grandfather needs a rescue at 340 Wes...,1,1,29.8732,-93.9542,"W 17th St, Port Arthur, TX 77640, USA"
265164,"Forward\n ( 1 adult, 8 year old w/autism)\nOn...",1,1,29.9157,-93.9492,"2935 40th St, Port Arthur, TX 77642, USA"
265171,We need help in CE King Parkway Forest Subdivi...,1,1,29.8507,-95.2063,"8615 Sherrywood Dr, Houston, TX 77044, USA"
265179,@houstonpolice #HarveyRescue #HarveySOS 9407 C...,1,1,29.6784,-95.4633,"9407 Cranleigh Ct, Houston, TX 77096, USA"
265192,A mom trapped in #HoustonFloods plz help:5400 ...,1,1,29.6843,-95.2639,"5400 N Bayou Dr, Houston, TX 77017, USA"
266374,Donating some clothes to #HurricaneHarvey vict...,1,1,29.8029,-95.4572,"2020 Mangum Rd, Houston, TX 77092, USA"
267422,"PLEASE SEND HELP AND RETWEET, Kristi Hammerly...",1,1,29.6953,-95.4221,"Serenity Ct, Houston, TX 77025, USA"
267451,My FIL Frank Emmitte is trapped 3226 Ave G wit...,1,1,29.466,-95.0594,"3226 Avenue G, Dickinson, TX 77539, USA"


In [185]:
# Export to CSV

df_add.to_csv('./data/geo_coords.csv')