# Goal of Analysis:

### Write an algorithm that return 10 most efficient stops covering the home address for all the employees and minimize the overall walking distance between employee homes and the bus stop

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import googlemaps

import warnings
warnings.filterwarnings('ignore')

In [2]:
address = pd.read_csv('Employee_Addresses.csv')
address.head()

Unnamed: 0,address,employee_id
0,"98 Edinburgh St, San Francisco, CA 94112, USA",206
1,"237 Accacia St, Daly City, CA 94014, USA",2081
2,"1835 Folsom St, San Francisco, CA 94103, USA",178
3,"170 Cambridge St, San Francisco, CA 94134, USA",50
4,"16 Roanoke St, San Francisco, CA 94131, USA",1863


In [3]:
stops = pd.read_csv('Potentail_Bust_Stops.csv')
stops.head()

Unnamed: 0,Street_One,Street_Two
0,MISSION ST,ITALY AVE
1,MISSION ST,NEW MONTGOMERY ST
2,MISSION ST,01ST ST
3,MISSION ST,20TH ST
4,MISSION ST,FREMONT ST


In [4]:
# get unique address and stop information

unique_address = list(set(address['address']))

def merge_stop(x):
    return x[0] + ' & ' + x[1] + ' CA'

unique_stop = list(set(stops.apply(merge_stop, axis=1)))

In [5]:
unique_address[:10]

['1142 Munich St, San Francisco, CA 94112, USA',
 '216 Whittier St, Daly City, CA 94014, USA',
 '742 Chenery St, San Francisco, CA 94131, USA',
 '69 Madison St, San Francisco, CA 94134, USA',
 '2044 Alemany Blvd, San Francisco, CA 94112, USA',
 '128 Elmira St, San Francisco, CA 94124, USA',
 '1390 Market St, San Francisco, CA 94102, USA',
 '410 Van Ness Ave, San Francisco, CA 94102, USA',
 '1669 Geneva Ave, San Francisco, CA 94134, USA',
 '1990 Quint St, San Francisco, CA 94124, USA']

In [6]:
unique_stop[:10]

['MISSION ST & ITALY AVE CA',
 'MISSION ST & THERESA ST CA',
 'MISSION ST & 17TH ST CA',
 'MISSION ST & YERBA BUENA LN CA',
 'MISSION ST & JESSIE EAST ST CA',
 'MISSION ST & 20TH ST CA',
 'MISSION ST & NAGLEE AVE CA',
 'MISSION ST & SAN JUAN AVE CA',
 'MISSION ST & NIAGARA AVE CA',
 'MISSION ST & OTTAWA AVE CA']

Another way to get the unique stops we can have:

In [8]:
import re
import csv
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

pattern = re.compile(r"(\d+) (.+), (.+), CA (\d+), USA")

class Employee(object):
    def __init__(self,segments):
        # address
        matched = pattern.match(segments[0])
        if matched is None:
            raise Exception('format not supported')

        self.building_no = int( matched.group(1) )
        self.street = matched.group(2).lower()
        self.city = matched.group(3)
        self.zipcode = int( matched.group(4) )

        # employee-id
        self.id = int(segments[1])

employees = []
invalid_employees = []

address_file = "Employee_Addresses.csv"
with open(address_file,"rt") as inf:
    reader = csv.reader(inf)
    for segments in reader:
        try:
            employees.append(Employee(segments))
        except:
            invalid_employees.append(segments[0])


            
streets_counter = Counter((e.street for e in employees))
streets_counts = pd.Series(streets_counter)
streets_counts.sort_values(ascending=False)

16th st                              61
geneva ave                           50
alemany blvd                         46
folsom st                            46
mission st                           42
                                     ..
peabody st                            1
adair st                              1
van ness avenue, 100 van ness ave     1
gennessee st                          1
york st                               1
Length: 266, dtype: int64

Next step would be to use googlemap to measure the distance to see if they are the closest stops we can have, but the service is a charged service, therefore I cannot directly show it. The basic idea is to query the distance between the address and the closest stop and then add them up to find the minimum measure.