# Predicting and Structuring Unstructured Addresses

## Libraries

In [6]:
import googlemaps
import os

# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so the
# credential is never stored in (or shared with) the notebook itself.
# A missing variable raises KeyError here instead of failing later on the
# first geocoding request.
# NOTE(review): a key that was previously committed in plain text should be
# treated as compromised and rotated.
gmaps = googlemaps.Client(key=os.environ["GOOGLE_MAPS_API_KEY"])

In [7]:
import pandas as pd
import numpy as np

## Reading the Data

In [3]:
# Load the raw, unstructured addresses from the local CSV file into a DataFrame.
csv_path = "C:/Users/Gourab/Desktop/test_address.csv"
address = pd.read_csv(csv_path)

In [4]:
# Preview the first rows of the raw address table.
address.head()

Unnamed: 0,Address
0,pitampura
1,"#3, Sr no 33/6, saifitness building, near icc..."
2,1027 sector 28 ground floor faridabad haryana
3,106b U&V block shalimar bagh delhi
4,"1077,sector-4/a,bokaro steel city, jharkhand,8..."


## Text Cleaning



    For each address do:

        1:    Convert to lower case
        2:    Spot comma and check if there is a space after that, if not then include a space

        3:    Remove full stop
        4:    Remove hyphen

        5:    If the address starts with a one- or two-letter word,
        6:    then check whether the next word contains a digit and, if so, join the two words together

        7:    If starting with either of (h. no, h. no., h.no, h.no., h/no., hno, hno-, hno., house#, house #, #, house no, 
              house no., house number) then strip these strings off

        8:    Remove extra white spaces


In [None]:
# Column of raw address strings that the cleaning step iterates over.
addrs = address["Address"]

In [None]:
#Cleaned Data
import re

# Labels that merely introduce a house number. They are tried longest first so
# that e.g. 'house number' is matched before its own prefix 'house no'.
_HOUSE_NO_PREFIXES = sorted(
    ['h/no', 'hno', 'hno-', 'hno.', 'h no', 'house#', 'house #', '#',
     'house no', 'house no.', 'house number'],
    key=len,
    reverse=True,
)

# Words that introduce landmark information; the text from the word up to and
# including the next comma is dropped.
_LANDMARK_WORDS = ('near', 'behind', 'opposite')


def _squeeze_spaces(addr):
    """Collapse every run of whitespace into one space and trim both ends."""
    return ' '.join(addr.split())


def _join_leading_letters(addr):
    """Merge a leading one- or two-letter word with the next word if that word has a digit.

    Example: 'b 12 main road' -> 'b12 main road'. Addresses that do not match
    the pattern (no space, first word longer than two characters or not purely
    alphabetic, second word without a digit) are returned unchanged.
    """
    first, sep, rest = addr.partition(' ')
    if not sep or len(first) > 2 or not first.isalpha():
        return addr
    second, sep, tail = rest.partition(' ')
    if not any(ch.isdigit() for ch in second):
        return addr
    return first + second + sep + tail


def _strip_house_prefix(addr):
    """Remove one leading house-number label (e.g. 'hno', 'house no', '#')."""
    for prefix in _HOUSE_NO_PREFIXES:
        if addr.startswith(prefix):
            # Slice the exact prefix off; str.lstrip() would instead strip any
            # leading characters that occur in the prefix.
            return addr[len(prefix):].strip()
    return addr


def _tighten_first_slash(addr):
    """Remove the spaces directly before and after the first '/' (if there is one)."""
    pos = addr.find('/')
    if pos == -1:
        # No slash: nothing to do (indexing with -1 would touch unrelated characters).
        return addr
    return addr[:pos].rstrip(' ') + '/' + addr[pos + 1:].lstrip(' ')


def _drop_landmarks(addr):
    """Drop 'near ...,', 'behind ...,' and 'opposite ...,' segments.

    Each keyword is located as a plain substring; the segment is removed only
    when a comma follows it, otherwise the address is left as it is.
    """
    for word in _LANDMARK_WORDS:
        start = addr.find(word)
        if start == -1:
            continue
        comma = addr.find(',', start)
        if comma != -1:
            addr = addr[:start] + addr[comma + 1:]
    return addr


def clean_address(addr):
    """Normalise one raw address string before it is sent to the geocoder.

    Steps, in order:
      1. Convert to a lower-case string (non-string values are passed through str()).
      2. Make sure every comma is followed by a space.
      3. Replace full stops with spaces and remove hyphens.
      4. Collapse repeated whitespace (so 'h. no' becomes 'h no').
      5. Join a leading one/two-letter word with a following word containing a digit.
      6. Strip a leading house-number label.
      7. Remove the spaces around the first '/'.
      8. Repeat step 5, since stripping the label can expose a new leading word.
      9. Drop 'near' / 'behind' / 'opposite' landmark segments.
     10. Remove bracketed text together with its brackets.
     11. Collapse whitespace left behind by the removals.

    Returns the cleaned address string.
    """
    addr = str(addr).lower()
    # A comma not followed by a space gets one; also safe for a trailing comma.
    addr = re.sub(r',(?! )', ', ', addr)
    addr = addr.replace('.', ' ').replace('-', '')
    addr = _squeeze_spaces(addr)
    addr = _join_leading_letters(addr)
    addr = _strip_house_prefix(addr)
    addr = _tighten_first_slash(addr)
    addr = _join_leading_letters(addr)
    addr = _drop_landmarks(addr)
    addr = re.sub(r'\([^)]*\)', '', addr)
    return _squeeze_spaces(addr)


#Storing the cleaned addresses
address_cln = [clean_address(addr) for addr in addrs]
    


In [None]:
# Show the first five cleaned addresses as a quick sanity check.
address_cln[:5]

## Retrieving the Formatted Address from the Google Maps Geocoding API for the Cleaned Data

In [None]:
# Geocode the first ten cleaned addresses and keep the formatted address of
# the first match; "NA" marks addresses for which no match came back.
frmt_addr = []
for cleaned in address_cln[:10]:
    results = gmaps.geocode(cleaned)
    print(results)
    if len(results) > 0:
        frmt_addr.append(results[0]['formatted_address'])
    else:
        frmt_addr.append("NA")


In [None]:
# frmt_addr may cover only the first rows of the table (only a slice of the
# cleaned addresses is geocoded), and assigning a shorter plain list to a
# column raises a length-mismatch ValueError. Aligning the results on the
# leading index labels fills the geocoded rows and leaves the rest as NaN.
address['Formatted Address'] = pd.Series(
    frmt_addr, index=address.index[:len(frmt_addr)], dtype=object
)

In [None]:
# Preview the table again, now with the 'Formatted Address' column added.
address.head()