## 

# Correct delivery Address

Library import

In [1]:
import random
import string
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# TensorFlow session; the matching loop at the end of the notebook evaluates the graph through session.run.
session = tf.Session()
# IPython magic: sets the kernel's Completer.use_jedi option to False (Jedi-based tab completion off).
%config Completer.use_jedi = False

### Dataset creation

In [2]:
n = 10  # number of street records to generate
# Vocabulary used to build the synthetic addresses.
street_names = ['diagon', 'elm', 'abbey', 'grand', 'python']
street_type = ['callejon', 'calle', 'carrera', 'via', 'avenida']
# Pool of five random postal codes in [20000, 29999].
street_zips = [random.randint(20000, 29999) for _ in range(5)]
# One random house number in [1, 999] per record.
numbers = [random.randint(1, 999) for _ in range(n)]

Feature engineering. Build the street addresses by randomly choosing, for each record, a street name, a street type and a zip code from the lists defined above.

In [3]:
# Draw a street name, a street-type prefix and a zip code for each of the n records.
streets = [random.choice(street_names) for _ in range(n)]
street_pref = [random.choice(street_type) for _ in range(n)]
zips = [random.choice(street_zips) for _ in range(n)]
# Full street string: "<prefix> <name> <number>".
fulls_streets = [
    ' '.join([prefix, name, str(number)])
    for prefix, name, number in zip(street_pref, streets, numbers)
]
# Each reference record is [full street string, zip code].
reference_data = [
    [full_street, zip_code]
    for full_street, zip_code in zip(fulls_streets, zips)
]

Function to create typos (errors) in the data, induced by a probability.

In [4]:
def create_typo(s, prob = 0.75):
    """Return `s` with at most one character replaced by a random lowercase letter.

    Args:
        s: String to corrupt.
        prob: Probability of attempting a replacement (default 0.75).

    Returns:
        The possibly modified string. An empty `s` is returned unchanged.
        The replacement letter is drawn from a-z independently of the
        original character, so it can coincide with it and leave `s` as is.
    """
    # Guard against empty input: there is no position to place a typo in,
    # and picking an index from an empty range would raise IndexError.
    if s and random.uniform(0, 1) < prob:
        rand_idx = random.randrange(len(s))
        s_list = list(s)
        s_list[rand_idx] = random.choice(string.ascii_lowercase)
        s = ''.join(s_list)
    return s

Create typos in the street name

In [5]:
# Corrupt only the street name; prefix, number and zip code stay as in the reference data.
typo_streets = list(map(create_typo, streets))
typo_full_streets = [
    ' '.join([prefix, name, str(number)])
    for prefix, name, number in zip(street_pref, typo_streets, numbers)
]
# Each test record is [street string with possible typo, zip code].
test_data = [
    [full_street, zip_code]
    for full_street, zip_code in zip(typo_full_streets, zips)
]

In [6]:
# Display the test records: [street string with possible typo, zip code].
test_data

[['calle diagcn 424', 25712],
 ['carrera kbbey 383', 25712],
 ['callejon diagon 695', 25712],
 ['calle dython 350', 22710],
 ['callejon elm 18', 21460],
 ['avenida ezm 137', 29682],
 ['calle pythpn 522', 21460],
 ['calle diagor 541', 21460],
 ['avenida epm 411', 22710],
 ['via exm 902', 25712]]

## TF variable creation

Data variables:

In [7]:
# Test side: one sparse string tensor for the address and a single zip column.
test_address = tf.sparse_placeholder(tf.string)
test_zip = tf.placeholder(tf.float32, shape=[None, 1])

# Reference side: sparse string tensor for the addresses and n zip columns.
ref_address = tf.sparse_placeholder(tf.string)
ref_zip = tf.placeholder(tf.float32, shape=[None, n])

Distances between zip codes and between addresses

In [8]:
# Squared difference between the test zip (one column) and every reference zip (n columns).
zip_dist = tf.square(ref_zip - test_zip)
# Edit distance between test and reference addresses, computed with normalize=True.
adress_dist = tf.edit_distance(hypothesis=test_address, truth=ref_address, normalize=True)

Instead of using distances to measure the difference between two objects, we introduce the concept of similarity.

- $S(x,y) = 0$ if $x$ & $y$ are totally different
- $S(x,x) = 1$ because every object is similar to itself
- $S(x,y) = \frac{D - d(x,y)}{D-d}$, $D$ = max distance, $d$ = min distance

In [16]:
# Largest and smallest zip distance, picked out of the flattened distance row.
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, axis=1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, axis=1))

# Min-max similarity: (D - d(x, y)) / (D - d).
# NOTE(review): the denominator is zero when zip_max == zip_min (all zip
# distances equal) - confirm how the resulting values should be handled.
zip_sim = tf.divide(zip_max - zip_dist, zip_max - zip_min)

# The address distance is normalized (D_max = 1, D_min = 0), so similarity is 1 - distance.
address_sim = 1.0 - adress_dist

Now we can combine the similarities using a weighted parameter.

$$S(x,y) = \sum_{i=1}^k w_i S_i(x,y), \quad \sum_{i=1}^k w_i = 1$$

In this case the weighted parameters are:

In [17]:
# The two weights sum to 1: half for the address, the remainder for the zip code.
address_wi = 0.5
zip_wi = 1.0 - address_wi

# The weighted address similarity is transposed before being added to the weighted zip similarity.
weighted_sim = tf.add(
    tf.transpose(address_wi * address_sim),
    zip_wi * zip_sim,
)

The best match index is given by:

In [18]:
# Position of the highest combined similarity along axis 1.
top_match_idx = tf.argmax(weighted_sim, axis=1)

A function that builds a sparse tensor value from a list of words, with one entry per character of each word

In [19]:
def sparse_from_word_vector(word_vector):
    """Build a `tf.SparseTensorValue` holding the characters of each word.

    Args:
        word_vector: Sequence of strings.

    Returns:
        A `tf.SparseTensorValue` whose indices are `[word_idx, 0, char_idx]`
        for every character, whose values are the characters in order, and
        whose dense shape is `[len(word_vector), 1, 1]`.
    """
    indices = []
    for word_idx, word in enumerate(word_vector):
        for char_idx in range(len(word)):
            indices.append([word_idx, 0, char_idx])
    # All characters of all words, flattened in the same order as the indices.
    values = list(''.join(word_vector))
    dense_shape = [len(word_vector), 1, 1]
    return tf.SparseTensorValue(indices, values, dense_shape)

Separate the data into address and zip, then feed the sparse matrix creator function

In [20]:
# Split each [street string, zip code] reference record into its two fields.
reference_address = [record[0] for record in reference_data]
# Zip codes as a single-row 2-D array (note the extra pair of brackets).
reference_zips = np.array([[record[1] for record in reference_data]])

# Character-level sparse encoding of the reference addresses.
sparse_ref_set = sparse_from_word_vector(reference_address)

In [22]:
# Match every test record against the whole reference set and print the best candidate.
for i in range(n):
    test_address_entry, test_zip_aux = test_data[i]
    test_zip_entry = [[test_zip_aux]]

    # Repeat the test address n times so there is one copy per reference address.
    test_address_rep = [test_address_entry] * n
    sparse_test_set = sparse_from_word_vector(test_address_rep)

    best_match = session.run(
        top_match_idx,
        feed_dict={
            test_address: sparse_test_set,
            test_zip: test_zip_entry,
            ref_address: sparse_ref_set,
            ref_zip: reference_zips,
        },
    )
    # Only one test row is fed, so the first element is the matching reference index.
    best_idx = best_match[0]
    best_address = reference_address[best_idx]
    best_zip = reference_zips[0][best_idx]

    print('Original Address = ' + str(test_address_entry) + ', ' + str(test_zip_aux))
    print('Corrected Address = ' + str(best_address) + ', ' + str(best_zip) + '\n')

Original Address = calle diagcn 424, 25712
Corrected Address = calle diagon 424, 25712

Original Address = carrera kbbey 383, 25712
Corrected Address = carrera abbey 383, 25712

Original Address = callejon diagon 695, 25712
Corrected Address = callejon diagon 695, 25712

Original Address = calle dython 350, 22710
Corrected Address = calle python 350, 22710

Original Address = callejon elm 18, 21460
Corrected Address = callejon elm 18, 21460

Original Address = avenida ezm 137, 29682
Corrected Address = avenida elm 137, 29682

Original Address = calle pythpn 522, 21460
Corrected Address = calle python 522, 21460

Original Address = calle diagor 541, 21460
Corrected Address = calle diagon 541, 21460

Original Address = avenida epm 411, 22710
Corrected Address = avenida elm 411, 22710

Original Address = via exm 902, 25712
Corrected Address = via elm 902, 25712

