# Segmenting and Clustering Neighborhoods in Florida, Ohio and Texas

### Organizing the dataset

#### Downloading the Neighborhood data from online source

In [3]:
# Importing the libraries
import pandas as pd
import numpy as np
import geocoder
import folium

In [None]:
# Downloading the table of Neighborhoods and Boroughs for the states

# Every state table is reduced to the same two columns before grouping.
table_columns = ['Neighborhood', 'Borough']

# Florida: second table on the page, keeping the columns at positions 1 and 2.
dataset = pd.read_html('https://en.wikipedia.org/wiki/List_of_municipalities_in_Florida')[1].iloc[:, [1, 2]]
dataset.columns = table_columns

# One row per Borough, with all of its Neighborhoods joined into a single string.
florida = (
    dataset.groupby('Borough')['Neighborhood']
    .apply(', '.join)
    .reset_index(name='Neighborhood')
)

# Texas: second table on the page, used as-is (it must already have exactly two columns).
dataset2 = pd.read_html('https://en.wikipedia.org/wiki/List_of_cities_in_Texas')[1]
dataset2.columns = table_columns

texas = (
    dataset2.groupby('Borough')['Neighborhood']
    .apply(', '.join)
    .reset_index(name='Neighborhood')
)

# Ohio: first table on the page, keeping the columns at positions 0 and 2.
dataset3 = pd.read_html('https://en.wikipedia.org/wiki/List_of_cities_in_Ohio')[0].iloc[:, [0, 2]]
dataset3.columns = table_columns

ohio = (
    dataset3.groupby('Borough')['Neighborhood']
    .apply(', '.join)
    .reset_index(name='Neighborhood')
)

#### Adding Longitude and Latitude to the Dataframes

In [None]:
# Using Geocoder ArcGis API to get the coordinates of the different Boroughs in Florida

# Geocode every Florida borough once; the lists stay aligned with the row order of `florida`.
florida_lat, florida_long = [], []
for borough in florida['Borough']:
    loc = geocoder.arcgis('{}, Florida, USA'.format(borough))
    florida_lat.append(loc.lat)
    florida_long.append(loc.lng)

florida['Latitude'] = florida_lat
florida['Longitude'] = florida_long

In [None]:
# Using Geocoder ArcGis API to get the coordinates of the different Boroughs in Texas

# Geocode every Texas borough once; the lists stay aligned with the row order of `texas`.
texas_lat, texas_long = [], []
for borough in texas['Borough']:
    loc = geocoder.arcgis('{}, Texas, USA'.format(borough))
    texas_lat.append(loc.lat)
    texas_long.append(loc.lng)

texas['Latitude'] = texas_lat
texas['Longitude'] = texas_long

In [None]:
# Using Geocoder ArcGis API to get the coordinates of the different Boroughs in Ohio

# Geocode every Ohio borough once; the lists stay aligned with the row order of `ohio`.
# NOTE(review): unlike the Florida and Texas cells, this query has no ', USA' suffix — confirm
# whether that is intentional before changing it, since it affects the geocoder result.
ohio_lat, ohio_long = [], []
for borough in ohio['Borough']:
    loc = geocoder.arcgis('{}, Ohio'.format(borough))
    ohio_lat.append(loc.lat)
    ohio_long.append(loc.lng)

ohio['Latitude'] = ohio_lat
ohio['Longitude'] = ohio_long

#### Removing Outliers and saving the prepared dataframes

In [23]:
# Removing Outliers from the datasets

# Rows are removed by hard-coded index label and the index is then renumbered.
# NOTE(review): this cell is not idempotent — running it a second time drops different rows.
florida = florida.drop(index=51).reset_index(drop=True)

texas = texas.drop(index=[122, 124]).reset_index(drop=True)

In [24]:
# Saving the prepared dataframes in csv file

# Persist each prepared frame so the geocoding cells above need not be re-run.
for state_frame, csv_name in (
    (florida, 'Florida_geodata.csv'),
    (texas, 'Texas_geodata.csv'),
    (ohio, 'Ohio_geodata.csv'),
):
    state_frame.to_csv(csv_name)

### Visualizing the Neighborhoods of Florida, Ohio and Texas

In [25]:
# Importing the dataset from saved csv files

# The first CSV column is the index written by to_csv, so it is restored as the index.
florida, texas, ohio = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('Florida_geodata.csv', 'Texas_geodata.csv', 'Ohio_geodata.csv')
)

In [26]:
florida.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428
1,Baker,"Glen St. Mary, Macclenny",30.79729,-86.68238
2,Bay,"Callaway, Lynn Haven, Mexico Beach, Panama Cit...",30.278741,-85.61548
3,Bradford,"Brooker, Hampton, Lawtey Nour Town, Starke",28.0685,-82.5269
4,BradfordClay,Keystone Heights,30.131051,-81.759014


In [27]:
florida.shape

(71, 4)

In [28]:
ohio.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Allen,"Delphos, Lima",40.77151,-84.105802
1,Ashland,Ashland,40.86865,-82.3155
2,Ashtabula,"Ashtabula, Conneaut, Geneva",41.88926,-80.78673
3,Athens,"Athens, Nelsonville",39.3285,-82.10444
4,Auglaize,"Saint Marys, Wapakoneta",40.560902,-84.22174


In [29]:
ohio.shape

(72, 4)

In [30]:
texas.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Anderson,Palestine,30.48798,-95.98652
1,Andrews,Andrews,32.31898,-102.54667
2,Angelina,"Burke, Diboll, Hudson, Huntington, Lufkin, Zav...",31.257302,-94.632618
3,Aransas,"Aransas Pass*, Corpus Christi*, Rockport",28.245978,-97.041817
4,Archer,"Archer City, Holliday, Scotland, Windthorst",33.615354,-98.687714


In [31]:
texas.shape

(234, 4)

In [21]:
# Function for creating map

def create_map(data:list, loc:list = None, clusters=0):
    """Draw the rows of one or more DataFrames as circle markers on a folium map.

    Parameters
    ----------
    data : list of DataFrame
        Frames with 'Latitude', 'Longitude' and 'Borough' columns. When
        ``clusters == 0`` each frame also needs a 'Neighborhood' column;
        otherwise each needs 'Cluster Label' and 'Neighborhoods' columns.
    loc : list of [lat, lng] pairs, optional
        The map is centred on the mean of these points. When omitted or empty,
        the map is centred on a randomly chosen row of ``data[0]``.
    clusters : int, default 0
        0 colours markers per frame (one colour per entry of ``data``).
        A positive value colours markers by 'Cluster Label'; labels outside
        ``range(clusters)`` are drawn in black.

    Returns
    -------
    folium.Map

    Raises
    ------
    ValueError
        If no centre is given and ``data`` has no rows to pick one from.
    """
    import matplotlib.cm as cm
    import matplotlib.colors as colors
    from random import randrange

    if loc is not None and len(loc) > 0:
        lat_init, long_init = np.array(loc).mean(axis=0)[:2]
    else:
        if len(data) == 0 or len(data[0]) == 0:
            raise ValueError('create_map needs either `loc` or a non-empty first DataFrame')
        # randrange excludes the upper bound (randint includes it and could
        # select a non-existent row); .iloc keeps the lookup positional.
        i = randrange(len(data[0]))
        lat_init = data[0]['Latitude'].iloc[i]
        long_init = data[0]['Longitude'].iloc[i]

    data_map = folium.Map(location=[lat_init, long_init], zoom_start=5)

    def _palette(n):
        # n evenly spaced colours from the rainbow colormap, as hex strings.
        return [colors.rgb2hex(c) for c in cm.rainbow(np.linspace(0, 1, n))]

    def _add_marker(lat, lng, popup_text, colour):
        # One filled circle marker with an HTML-escaped popup.
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=colour,
            fill=True,
            fill_color=colour,
            fill_opacity=0.7).add_to(data_map)

    if clusters == 0:
        rainbow = _palette(len(data))

        for ind, state in enumerate(data):
            for lat, lng, borough, neighborhood in zip(state['Latitude'], state['Longitude'], state['Borough'], state['Neighborhood']):
                _add_marker(lat, lng, '{}:\n\n {}'.format(borough, neighborhood), rainbow[ind])
    else:
        rainbow = _palette(clusters)

        for state in data:
            for lat, lon, borough, cluster, neighborhood in zip(state['Latitude'], state['Longitude'], state['Borough'], state['Cluster Label'], state['Neighborhoods']):
                if cluster in range(0, clusters):
                    text = str(borough) + '\n' + str(neighborhood) + '\n' + 'Cluster: ' + str(int(cluster))
                    # The -1 offset is kept from the original colouring: cluster 0
                    # wraps round to the last palette entry, so colours stay distinct.
                    _add_marker(lat, lon, text, rainbow[int(cluster) - 1])
                else:
                    text = str(borough) + '\n' + str(neighborhood) + '\n' + '\nCluster: ' + str(cluster)
                    _add_marker(lat, lon, text, 'black')

    return data_map

In [19]:
# Getting the Latitude and Longitude of the states

# Geocode each state as a whole; the [lat, lng] pairs are used later to centre the maps.
Florida = geocoder.arcgis('Florida, USA')
Texas = geocoder.arcgis('Texas, USA')
Ohio = geocoder.arcgis('Ohio, USA')

florida_loc, texas_loc, ohio_loc = Florida.latlng, Texas.latlng, Ohio.latlng

summary = 'Florida {}\nTexas {}\nOhio {}\n'.format(florida_loc, texas_loc, ohio_loc)
print(summary)

Florida [28.56604489500006, -81.68864878999995]
Texas [31.46273304600004, -99.33305008999997]
Ohio [40.41305679900006, -82.71121518599995]



In [32]:
# Creating the map using the create_map function

# One colour per state; the map is centred on the mean of the three state coordinates.
group_map = create_map(
    data=[florida, texas, ohio],
    loc=[florida_loc, texas_loc, ohio_loc],
)
group_map.save('Map of Florida, Texas and Ohio Neighborhoods.html')

group_map

### Exploring the venues in Florida, Ohio and Texas using Foursquare API

In [33]:
# function for exploring venues in the different Boroughs

def exploreVenues(boroughs,neighborhoods, latitudes, longitudes, radius=2000):
    """Query the Foursquare v2 ``venues/explore`` endpoint around each location.

    Credentials are read from the ``FOURSQUARE_CLIENT_ID`` and
    ``FOURSQUARE_CLIENT_SECRET`` environment variables so that no secret is
    stored in the notebook.

    Parameters
    ----------
    boroughs, neighborhoods, latitudes, longitudes : iterables of equal length
        One entry per location; they are iterated in parallel.
    radius : int, default 2000
        Search radius sent to the API.

    Returns
    -------
    DataFrame
        One row per venue with the columns 'Borough', 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. The frame is
        empty (but has these columns) when no venue is returned.

    Raises
    ------
    RuntimeError
        If a credential environment variable is missing.
    requests.HTTPError
        If the API answers with an error status.
    """
    import os
    import requests

    try:
        client_id = os.environ['FOURSQUARE_CLIENT_ID']
        client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']
    except KeyError as missing:
        raise RuntimeError(
            'Set the {} environment variable before calling exploreVenues'.format(missing.args[0])
        )

    VERSION = '20190219' # Foursquare API version
    # NOTE(review): the service presumably caps the number of results per call — confirm.
    LIMIT = 5000
    ENDPOINT = 'https://api.foursquare.com/v2/venues/explore'
    COLUMNS = ['Borough',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for borough, neighborhood, lat, lng in zip(boroughs, neighborhoods, latitudes, longitudes):
        print(borough)

        # Let requests build and escape the query string.
        response = requests.get(
            ENDPOINT,
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                borough,
                neighborhood,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues carry no category; record a missing value instead of failing.
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the schema for an empty result.
    nearby_venues = pd.DataFrame(venue_rows, columns=COLUMNS)

    return(nearby_venues)

In [None]:
# Exploring venues in Florida

# Arguments are positional: boroughs, neighborhoods, latitudes, longitudes.
florida_venues = exploreVenues(
    florida['Borough'],
    florida['Neighborhood'],
    florida['Latitude'],
    florida['Longitude'],
)
florida_venues.to_csv('florida_venues.csv')

In [None]:
# Exploring venues in Texas
# Arguments are positional: boroughs, neighborhoods, latitudes, longitudes.
texas_venues = exploreVenues(
    texas['Borough'],
    texas['Neighborhood'],
    texas['Latitude'],
    texas['Longitude'],
)
texas_venues.to_csv('texas_venues.csv')

In [None]:
# Exploring venues in Ohio

# Arguments are positional: boroughs, neighborhoods, latitudes, longitudes.
ohio_venues = exploreVenues(
    ohio['Borough'],
    ohio['Neighborhood'],
    ohio['Latitude'],
    ohio['Longitude'],
)
ohio_venues.to_csv('ohio_venues.csv')

### One Hot Encoding the dataframe of venues 

In [34]:
# Importing the venues dataset from the saved csv files

# Reload the venue tables saved by the exploration cells; column 0 is the saved index.
florida_venues, texas_venues, ohio_venues = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('florida_venues.csv', 'texas_venues.csv', 'ohio_venues.csv')
)

In [35]:
florida_venues.head()

Unnamed: 0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,Conestogas Restaurant,29.792109,-82.495888,American Restaurant
1,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,New York Pizza Plus,29.795401,-82.502413,Pizza Place
2,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,Mi Apá Latin Café,29.798116,-82.502088,Latin American Restaurant
3,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,Walgreens,29.794937,-82.495102,Pharmacy
4,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,Bev's Better Burgers,29.79284,-82.495964,American Restaurant


In [36]:
florida_venues.shape

(1455, 8)

In [37]:
texas_venues.head()

Unnamed: 0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Anderson,Palestine,30.48798,-95.98652,Shell,30.487355,-95.988212,Gas Station
1,Anderson,Palestine,30.48798,-95.98652,Redbox,30.486944,-95.986667,Video Store
2,Anderson,Palestine,30.48798,-95.98652,Kotts Cafe,30.489403,-95.988269,Diner
3,Anderson,Palestine,30.48798,-95.98652,Fanthorp Inn State Historic Site,30.483121,-95.984195,History Museum
4,Andrews,Andrews,32.31898,-102.54667,Buddy's Drive In,32.318701,-102.545626,American Restaurant


In [38]:
texas_venues.shape

(3687, 8)

In [39]:
ohio_venues.head()

Unnamed: 0,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allen,"Delphos, Lima",40.77151,-84.105802,Casa Lu Al,40.772352,-84.108993,Italian Restaurant
1,Allen,"Delphos, Lima",40.77151,-84.105802,Happy Daz,40.768829,-84.109068,Burger Joint
2,Allen,"Delphos, Lima",40.77151,-84.105802,Fat Jack's Pizza,40.76799,-84.1083,Pizza Place
3,Allen,"Delphos, Lima",40.77151,-84.105802,Ollie's Bargain Outlet,40.765639,-84.103967,Outlet Store
4,Allen,"Delphos, Lima",40.77151,-84.105802,Dollar General,40.76564,-84.10346,Discount Store


In [40]:
ohio_venues.shape

(1453, 8)

In [42]:
# function for encoding venues in the different states

def encoder(venues):
    """One-hot encode venue categories and average them per borough.

    Parameters
    ----------
    venues : DataFrame
        Needs 'Borough' and 'Venue Category' columns. The columns at positions
        1 to 3 are carried through as the borough's 'Neighborhood', 'Latitude'
        and 'Longitude', taken from the borough's first row.

    Returns
    -------
    DataFrame
        One row per borough (sorted by borough), with the columns 'Borough',
        'Neighborhood', 'Latitude', 'Longitude', followed by one column per
        venue category holding that category's share of the borough's venues.
    """
    onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

    # add the Borough column back so the indicator columns can be grouped by it
    venues_onehot = pd.concat([venues['Borough'], onehot], axis=1)
    venues_grp = venues_onehot.groupby('Borough').mean().reset_index()

    # First row of every borough, selected positionally so the result does not
    # depend on the frame having a default 0..n-1 index.
    first_rows = venues.drop_duplicates(subset='Borough')
    features = first_rows.iloc[:, 1:4].copy()
    features.columns = ['Neighborhood', 'Latitude', 'Longitude']
    features.index = first_rows['Borough'].to_numpy()
    # Re-order to match the (sorted) borough order produced by groupby.
    features = features.loc[venues_grp['Borough'].tolist()].reset_index(drop=True)

    encoded_venues = pd.concat([venues_grp.iloc[:, 0], features, venues_grp.iloc[:, 1:]], axis=1)

    return encoded_venues

In [43]:
# Using the encoder function to encode the state venues

# Encode each state's venue table separately; the three results are combined further down.
florida_onehot, texas_onehot, ohio_onehot = (
    encoder(state_venues)
    for state_venues in (florida_venues, texas_venues, ohio_venues)
)

In [44]:
florida_onehot.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Accessories Store,Advertising Agency,Airport,Airport Service,Airport Terminal,American Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,0.0,0.03125,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Baker,"Glen St. Mary, Macclenny",30.79729,-86.68238,0.0,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bay,"Callaway, Lynn Haven, Mexico Beach, Panama Cit...",30.278741,-85.61548,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bradford,"Brooker, Hampton, Lawtey Nour Town, Starke",28.0685,-82.5269,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.018868,0.0,0.0,0.018868,0.0,0.0,0.0,0.0,0.0
4,BradfordClay,Keystone Heights,30.131051,-81.759014,0.021739,0.0,0.0,0.0,0.0,0.0,...,0.021739,0.021739,0.0,0.0,0.0,0.021739,0.0,0.0,0.0,0.0


In [45]:
florida_onehot.shape

(54, 242)

In [46]:
texas_onehot.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Accessories Store,Airport,American Restaurant,Antique Shop,Arcade,Art Gallery,...,Water Park,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Anderson,Palestine,30.48798,-95.98652,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Andrews,Andrews,32.31898,-102.54667,0.0,0.0,0.08,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Angelina,"Burke, Diboll, Hudson, Huntington, Lufkin, Zav...",31.257302,-94.632618,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Armstrong,Claude,26.92001,-97.79151,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Atascosa,"Charlotte, Jourdanton, Lytle*, Pleasanton, Poteet",29.26356,-98.73713,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
texas_onehot.shape

(167, 306)

In [48]:
ohio_onehot.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,ATM,Airport,Airport Service,American Restaurant,Art Gallery,Art Museum,...,Travel Agency,Truck Stop,Video Game Store,Video Store,Water Park,Waterfront,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Allen,"Delphos, Lima",40.77151,-84.105802,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ashland,Ashland,40.86865,-82.3155,0.0,0.0,0.0,0.020408,0.0,0.0,...,0.0,0.0,0.0,0.040816,0.0,0.0,0.0,0.0,0.0,0.0
2,Ashtabula,"Ashtabula, Conneaut, Geneva",41.88926,-80.78673,0.0,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Athens,"Athens, Nelsonville",39.3285,-82.10444,0.0,0.0,0.0,0.012658,0.025316,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,0.0
4,Auglaize,"Saint Marys, Wapakoneta",40.560902,-84.22174,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


In [49]:
ohio_onehot.shape

(66, 219)

### Getting the most common venues for Boroughs in Florida, Ohio and Texas

In [50]:
# Function for getting the 10 most common venues in the different Boroughs

def mostCommonVenues(venues_onehot, num_top_venues=10):
    """List the highest-frequency venue categories of every borough.

    Parameters
    ----------
    venues_onehot : DataFrame
        Needs a 'Borough' column; every column from position 4 onwards is
        treated as a venue-category frequency.
    num_top_venues : int, default 10
        How many categories to report per borough. It is capped at the number
        of category columns available.

    Returns
    -------
    DataFrame
        Indexed like ``venues_onehot``, with a 'Borough' column followed by
        '1st Most Common Venue', '2nd Most Common Venue', ... columns holding
        category names in descending order of frequency.
    """
    def _ordinal(rank):
        # 11th, 12th and 13th are the exceptions to the 1st/2nd/3rd pattern.
        if 10 <= rank % 100 <= 20:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')
        return '{}{}'.format(rank, suffix)

    n_categories = max(venues_onehot.shape[1] - 4, 0)
    top_n = max(0, min(int(num_top_venues), n_categories))
    rank_columns = ['{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, top_n + 1)]

    # For each row, sort the category frequencies and keep the leading names.
    ranked = [
        venues_onehot.iloc[pos, 4:].sort_values(ascending = False).index.values[:top_n]
        for pos in range(venues_onehot.shape[0])
    ]
    # The explicit reshape keeps a 2-D shape even with zero rows or zero columns.
    ranked = np.array(ranked, dtype=object).reshape(len(ranked), top_n)

    venues_sorted = pd.DataFrame(ranked, columns=rank_columns, index=venues_onehot.index)
    venues_sorted.insert(0, 'Borough', venues_onehot['Borough'])

    return venues_sorted

In [51]:
# Using the mostCommonVenues function to get most common venues

# Rank the venue categories of each state's boroughs separately.
florida_most_common, texas_most_common, ohio_most_common = (
    mostCommonVenues(state_onehot)
    for state_onehot in (florida_onehot, texas_onehot, ohio_onehot)
)

In [52]:
florida_most_common.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alachua,Pizza Place,Mexican Restaurant,Discount Store,American Restaurant,Pharmacy,Fast Food Restaurant,BBQ Joint,Park,Clothing Store,Sandwich Place
1,Baker,Taco Place,Farmers Market,Garden Center,Park,Intersection,American Restaurant,Grocery Store,Farm,Food & Drink Shop,Food Court
2,Bay,Gift Shop,Lake,Zoo Exhibit,Fondue Restaurant,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Fountain,Forest,Football Stadium
3,Bradford,Pharmacy,Pizza Place,Intersection,Restaurant,Golf Course,Liquor Store,Bakery,Discount Store,Bagel Shop,Ice Cream Shop
4,BradfordClay,Fast Food Restaurant,Breakfast Spot,Pizza Place,Gym,Grocery Store,Sandwich Place,Accessories Store,Pharmacy,Big Box Store,Shipping Store


In [53]:
texas_most_common.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderson,Video Store,Gas Station,History Museum,Diner,Zoo,Farmers Market,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market
1,Andrews,Convenience Store,Fast Food Restaurant,American Restaurant,Food,Pizza Place,Café,Coffee Shop,Sandwich Place,Deli / Bodega,Ice Cream Shop
2,Angelina,Business Service,Pool Hall,Zoo,Flower Shop,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market,Fishing Spot,Flea Market
3,Armstrong,Farm,Stables,Flower Shop,Farmers Market,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market,Fishing Spot,Flea Market
4,Atascosa,Locksmith,Convenience Store,RV Park,Campground,Mexican Restaurant,Flea Market,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market


In [54]:
ohio_most_common.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allen,Pizza Place,Pharmacy,Grocery Store,Fast Food Restaurant,Gas Station,Sandwich Place,Bar,Rental Car Location,Discount Store,Italian Restaurant
1,Ashland,Pizza Place,Convenience Store,Ice Cream Shop,Fast Food Restaurant,Discount Store,Bank,Bar,Coffee Shop,Pharmacy,Video Store
2,Ashtabula,American Restaurant,Convenience Store,Discount Store,Pharmacy,Seafood Restaurant,Ice Cream Shop,Grocery Store,Gas Station,Fast Food Restaurant,Coffee Shop
3,Athens,Pizza Place,Coffee Shop,Dive Bar,Fast Food Restaurant,Bar,Bakery,Hotel,Sports Bar,Gas Station,Bank
4,Auglaize,Dance Studio,Water Park,Nightlife Spot,Ice Cream Shop,Fair,Dessert Shop,Farmers Market,Food Service,Food Court,Food & Drink Shop


### Clustering the Neighborhoods of the 3 States

#### Preparing the dataset for the KMeans Clustering Algorithm

In [55]:
# Combining the encoded venues data for the 3 states into one large dataframe

state_frames = [florida_onehot, texas_onehot, ohio_onehot]

# Category columns (position 4 onwards) are stacked separately from the four
# descriptive columns, so the states' differing category sets are unioned.
venues_concat = pd.concat([frame.iloc[:, 4:] for frame in state_frames], ignore_index=True, sort=False)
features = pd.concat([frame.iloc[:, :4] for frame in state_frames], ignore_index=True, sort=False)
features = features.rename(columns={"Neighborhood": "Neighborhoods"})

# A category missing from a state comes out as NaN after stacking; treat it as frequency 0.
venues_concat = pd.concat([features, venues_concat], axis=1).fillna(0)

In [56]:
venues_concat.head()

Unnamed: 0,Borough,Neighborhoods,Latitude,Longitude,Accessories Store,Advertising Agency,Airport,Airport Service,Airport Terminal,American Restaurant,...,Library,Lighthouse,Mountain,Music School,Nature Preserve,Night Market,Nightlife Spot,Ski Area,Social Club,Waterfront
0,Alachua,"Alachua, Archer, Gainesville, Hawthorne, High ...",29.79309,-82.49428,0.0,0.03125,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Baker,"Glen St. Mary, Macclenny",30.79729,-86.68238,0.0,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bay,"Callaway, Lynn Haven, Mexico Beach, Panama Cit...",30.278741,-85.61548,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bradford,"Brooker, Hampton, Lawtey Nour Town, Starke",28.0685,-82.5269,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BradfordClay,Keystone Heights,30.131051,-81.759014,0.021739,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
venues_concat.shape

(287, 364)

#### KMeans Clustering

In [61]:
# Using KMeans to cluster the Neighborhoods of the 3 states

from sklearn.cluster import KMeans
clusters = 10

# Cluster on the category-frequency columns only. Any 'Cluster Label' column
# left by a previous run of this cell is dropped first, so re-running the cell
# does not feed the old labels back in as a feature.
cluster_features = venues_concat.iloc[:, 4:].drop(columns='Cluster Label', errors='ignore')

# `n_jobs` is no longer accepted by KMeans in current scikit-learn releases.
# A fixed random_state makes the labels reproducible between runs.
kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
kmeans.fit(cluster_features)

venues_concat['Cluster Label'] = kmeans.labels_

In [62]:
kmeans.labels_

array([1, 0, 2, 0, 1, 1, 0, 0, 9, 3, 0, 0, 5, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 3, 1, 5, 0, 0, 1, 0, 0, 1, 0, 7, 0, 4, 1, 0, 0, 1, 1, 3, 1,
       0, 0, 0, 0, 0, 0, 2, 0, 7, 7, 0, 1, 0, 3, 0, 0, 9, 1, 1, 0, 1, 9,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 6, 1, 9, 9, 5, 1, 4, 0, 1, 1, 1,
       1, 0, 0, 9, 1, 0, 0, 0, 7, 0, 2, 0, 6, 4, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 3, 1, 1, 1, 0, 2, 9, 1, 7, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       2, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 3, 1, 2, 0, 1, 0, 1, 0, 2,
       1, 1, 1, 0, 0, 0, 1, 2, 1, 4, 1, 2, 1, 4, 1, 3, 1, 0, 0, 1, 1, 2,
       0, 0, 3, 0, 1, 1, 1, 0, 1, 1, 7, 1, 0, 0, 1, 1, 8, 1, 1, 1, 5, 0,
       2, 1, 1, 1, 5, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 3, 0,
       3, 1, 1, 1, 0, 0, 0, 0, 0, 2, 7, 7, 1, 1, 0, 7, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 4, 1, 1, 0, 1, 1, 7, 5, 7, 1, 1, 0, 1, 1, 1, 1, 9, 1,
       1, 1, 1, 7, 4, 1, 1, 1, 0, 0, 1, 0, 1, 9, 2, 0, 1, 1, 0, 8, 7, 1,
       0])

#### Visualizing the Clusters of Neighborhoods in Florida, Ohio and Texas

In [64]:
# Creating the map showing the clusters using the create_map function

# A positive third argument switches create_map to colouring markers by 'Cluster Label'.
cluster_map = create_map(
    [venues_concat],
    [florida_loc, texas_loc, ohio_loc],
    10,
)
cluster_map.save('Map of Clusters.html')
cluster_map