In [1]:
# import libraries

import pandas as pd 
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
import numpy as np 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize, scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

import requests 
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json # If reading in data from Json file
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import folium # maps
from geopy.geocoders import Nominatim # uses address to get lat/long

# Installations
#!conda install -c conda-forge geopy 
#!conda install -c conda-forge folium=0.5.0 

#  Obtain Data

#### Scrape Wikipedia for Location of Cities in the California Bay Area

In [2]:
# Obtain location data from Wikipedia
# https://stackoverflow.com/questions/54890708/scraping-data-from-wikipedia-table

url = 'https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_the_San_Francisco_Bay_Area'  
df = pd.read_html(url, header=1)[0]

df.head()

Unnamed: 0,Name,Type,County,Population (2010)[8][9],sq mi,km2,Incorporated[7]
0,Alameda,City,Alameda,73812,10.61,27.5,"April 19, 1854"
1,Albany,City,Alameda,18539,1.79,4.6,"September 22, 1908"
2,American Canyon,City,Napa,19454,4.84,12.5,"January 1, 1992"
3,Antioch,City,Contra Costa,102372,28.35,73.4,"February 6, 1872"
4,Atherton,Town,San Mateo,6914,5.02,13.0,"September 12, 1923"


#### Explore Data

In [3]:
# View datatypes
df.dtypes

Name                        object
Type                        object
County                      object
Population (2010)[8][9]      int64
sq mi                      float64
km2                        float64
Incorporated[7]             object
dtype: object

In [4]:
# Check for nulls
df.isnull().sum()

Name                       0
Type                       0
County                     0
Population (2010)[8][9]    0
sq mi                      0
km2                        0
Incorporated[7]            0
dtype: int64

In [5]:
# View shape
df.shape

(101, 7)

In [6]:
# Rename columns

df.rename(columns={
    'Population (2010)[8][9]': 'population',
    'Incorporated[7]': 'incorporated',
    'Name': 'city/town'
}, inplace=True)

In [7]:
# Confirm name changes
df.head(1)

Unnamed: 0,city/town,Type,County,population,sq mi,km2,incorporated
0,Alameda,City,Alameda,73812,10.61,27.5,"April 19, 1854"


In [8]:
# Format column names
df.columns = [c.replace(' ', '_').lower() for c in df.columns]
df.head()

Unnamed: 0,city/town,type,county,population,sq_mi,km2,incorporated
0,Alameda,City,Alameda,73812,10.61,27.5,"April 19, 1854"
1,Albany,City,Alameda,18539,1.79,4.6,"September 22, 1908"
2,American Canyon,City,Napa,19454,4.84,12.5,"January 1, 1992"
3,Antioch,City,Contra Costa,102372,28.35,73.4,"February 6, 1872"
4,Atherton,Town,San Mateo,6914,5.02,13.0,"September 12, 1923"


In [9]:
df.head()

Unnamed: 0,city/town,type,county,population,sq_mi,km2,incorporated
0,Alameda,City,Alameda,73812,10.61,27.5,"April 19, 1854"
1,Albany,City,Alameda,18539,1.79,4.6,"September 22, 1908"
2,American Canyon,City,Napa,19454,4.84,12.5,"January 1, 1992"
3,Antioch,City,Contra Costa,102372,28.35,73.4,"February 6, 1872"
4,Atherton,Town,San Mateo,6914,5.02,13.0,"September 12, 1923"


In [10]:
# Add Stanford and Alviso to match the sales data
new_row_1 = {'city/town': 'Stanford', 'county': 'Santa Clara'}
new_row_2 = {'city/town': 'Alviso', 'county': 'Santa Clara'}

#append row to the dataframe
df = df.append(new_row_1, ignore_index=True)
df = df.append(new_row_2, ignore_index=True)

# Obtain Lat/Long Data

In [11]:
# address = 'Santa Clara, California'

# geolocator = Nominatim(user_agent = "foursquare_agent")
# location = geolocator.geocode(address)
# latitude = location.latitude
# longitude = location.longitude
# print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

In [12]:
# for name in df['name']:
#     address = name + ', California'
#     print(address)

In [13]:
df['Name']

KeyError: 'Name'

In [None]:
df['Name'].tolist()

In [None]:
# Look up lat/long with geopy library

lats = []
longs = []
    
for name in df['Name']:
    address = name + ', California'
    geolocator = Nominatim(user_agent = "foursquare_agent")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    lats.append(latitude)
    longs.append(longitude)

In [None]:
# Populate df with lat/long

df['latitude'] = lats
df['longitude'] = longs
df.tail()

In [None]:
# Locate county with enough cities to compare (Looking for ~10-15)

df['county'].value_counts()

In [None]:
# View cities in Santa Clara County

df[df['county']=='Santa Clara']

In [None]:
# Create new df for Santa Clara County and reset index

sclara = df[df['county'] == 'Santa Clara'].reset_index(drop=True)
sclara.head(13)

In [None]:
# Fix lat/long for Santa Clara

sclara.loc[12,'latitude'] = 37.352483
sclara.loc[12,'longitude'] = -121.956564

# Confirm change
sclara.loc[sclara['city/town'] == 'Santa Clara']

#### Sales Data

In [None]:
# Read in home sales data
sales = pd.read_csv('../data/santa_clara_co_sales_2019.csv')
sales.head()

In [None]:
# Drop nulls
sales.dropna(subset=["city/town"], axis=0, inplace=True)
sales

In [None]:
# View dtypes

sales.dtypes

In [None]:
# Format data

sales['average_price'] = [item.replace('$','').replace(',','') for item in sales['average_price']]
sales.head()

In [None]:
# Format data
sales['median_price'] = [item.replace('$','').replace(',','') for item in sales['median_price']]
sales.head()

In [None]:
# Convert dtype to float

sales['average_price'] = sales['average_price'].astype(float)

In [None]:
# Convert dtype to float

sales['median_price'] = sales['median_price'].astype(float)

In [None]:
# Confirm dtype conversion

sales.dtypes

In [None]:
# Reset index

sales = sales.reset_index()
sales.head()

In [None]:
# Drop old index

sales = sales.drop(['index'], axis=1)
sales.head()

In [None]:
# Strip spaces from names in sales df

sales['city/town']=sales['city/town'].str.strip()
sales['city/town'].tolist()

In [None]:
# Merge location and sales dataframes

df = pd.concat([sclara.set_index('city/town'), sales.set_index('city/town')], axis=1, join='inner')
df.reset_index(inplace=True)
df.head()

In [None]:
df.columns.tolist()

# Visualize

In [None]:
# Get index for City of Santa Clara for map

df[df['city/town']=='Santa Clara']

In [None]:
# create map of the cities in Santa Clara County with the city of Santa Clara as the focal point
map_df = folium.Map(location=[df.iloc[0,2], df.iloc[0,3]], zoom_start=10)

# add markers to map
for lat, lng, name in zip(df['latitude'], df['longitude'], df['Name']):
    label = '{}'.format(name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.6,
        parse_html=False).add_to(map_df)  
    
map_df

In [None]:
# create map of the cities in Santa Clara County with the city of Santa Clara as the focal point
map_df = folium.Map(location=[df.iloc[12,7], df.iloc[12,8]], zoom_start=10)

# add markers to map
for lat, lng, name, county in zip(df['latitude'], df['longitude'], df['city/town'], df['county']):
    label = '{}, {} County'.format(name, county)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.6,
        parse_html=False).add_to(map_df)  
    
map_df

## Foursquare Data

In [None]:
# Establish Foursquare credentials for reference

CLIENT_ID = 'S5JB2KFD1NTX3EQLZEUCB5OLMPQTFOITAQG1ZMHVNB12UFH0' 
CLIENT_SECRET = '2SV2BAJSKBTEXPH41FHT2X3P1M1VQKYLZ54213I0VAZU2JPT' 
VERSION = '20200609' # Foursquare API version


In [None]:
# df.loc[0, 'city/town']

In [None]:
# city_latitude = df.loc[0, 'latitude']
# city_longitude = df.loc[0, 'longitude'] 

# city_name = df.loc[0, 'city/town'] 

# print('Latitude and longitude values of {} are {}, {}.'.format(city_name, 
#                                                                city_latitude, 
#                                                                city_longitude))

In [None]:
# #Foursquare rate limits: https://developer.foursquare.com/docs/places-api/rate-limits/
    
# LIMIT = 100 # limit of number of venues returned by Foursquare API

# radius = 500 # define radius

# # create URL
# url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
#     CLIENT_ID, 
#     CLIENT_SECRET, 
#     VERSION, 
#     city_latitude, 
#     city_longitude, 
#     radius, 
#     LIMIT)

In [None]:
# results = requests.get(url).json()

In [None]:
# # function that extracts the category of the venue
# def get_category_type(row):
#     try:
#         categories_list = row['categories']
#     except:
#         categories_list = row['venue.categories']
        
#     if len(categories_list) == 0:
#         return None
#     else:
#         return categories_list[0]['name']

In [None]:
# venues = results['response']['groups'][0]['items']
    
# nearby_venues = json_normalize(venues) # flatten JSON

# # filter columns
# filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
# nearby_venues =nearby_venues.loc[:, filtered_columns]

# # filter the category for each row
# nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# # clean columns
# nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

# nearby_venues.head()

In [None]:
# # Campbell only
# print('{} venues returned from Foursquare'.format(nearby_venues.shape[0]))

In [None]:
# df['city/town'].unique()

In [None]:
# Collect venue results from Foursquare
# Code adapted from IBM data science cert lab

radius = 1000 # 
limit = 200

venues = []

for lat, long, city in zip(df['latitude'], df['longitude'], df['city/town']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        limit)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            city,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [None]:
#  venues into a dataframe
venues_df = pd.DataFrame(venues)
venues_df.columns = ['city/town', 'latitude', 'longitude', 'venue_name', 'venue_latitude', 'venue_longitude', 'venue_type']

# check the dataframe
print('Shape: {}'.format(venues_df.shape))
print('Unique venue types: {}'.format(len(venues_df['venue_type'].unique())))
venues_df.head()

In [None]:
# # Collect venue results from Foursquare
# # Code adapted from IBM data science cert lab

# radius = 1000 
# limit = 200

# for lat, long, city in zip(df['latitude'], df['longitude'], df['city/town']):
#     url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
#         CLIENT_ID,
#         CLIENT_SECRET,
#         VERSION,
#         lat,
#         long,
#         radius, 
#         limit)
    
#     results = requests.get(url).json()['response']['groups'][0]['items']
    
#     venues = []

#     for venue in results:
#         venues.append((
#             city,
#             lat, 
#             long, 
#             venue['venue']['name'], 
#             venue['venue']['location']['lat'], 
#             venue['venue']['location']['lng'],  
#             venue['venue']['categories'][0]['name']))

In [None]:
venues_df.to_csv('../data/venues_df.csv', index=False)

In [None]:
venues_df = pd.read_csv('../data/venues_df.csv')
venues_df.head()

In [None]:
venues_df['city/town'].unique()

In [None]:
# Create function to generate venues for all cities in Santa Clara County

# def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
#     venues_list=[]
#     for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
#         # create the API request URL
#         url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
#             CLIENT_ID, 
#             CLIENT_SECRET, 
#             VERSION, 
#             lat, 
#             lng, 
#             radius, 
#             LIMIT)
            
#         # make the GET request
#         results = requests.get(url).json()["response"]['groups'][0]['items']
        
#         # return only relevant information for each nearby venue
#         venues_list.append([(
#             name, 
#             lat, 
#             lng, 
#             v['venue']['name'], 
#             v['venue']['location']['lat'], 
#             v['venue']['location']['lng'],  
#             v['venue']['categories'][0]['name']) for v in results])

#     nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
#     nearby_venues.columns = ['city/town', 
#                   'City Latitude', 
#                   'City Longitude', 
#                   'Venue', 
#                   'Venue Latitude', 
#                   'Venue Longitude', 
#                   'Venue Category']
    
#     return(nearby_venues)

In [None]:
# scc is for santa clara county
# scc_venues = getNearbyVenues(names=df['city/town'],
#                                    latitudes=df['latitude'],
#                                    longitudes=df['longitude']
#                                   )


In [None]:
# Count of venues per city
venues_df.groupby('city/town').count()

In [None]:
# one hot encoding
df_onehot = pd.get_dummies(venues_df[['venue_type']], prefix="", prefix_sep="")

# Add city/town to venue df
df_onehot['city/town'] = venues_df['city/town'] 

# move city/town column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

In [None]:
# new shape
df_onehot.shape

In [None]:
# Group venue type count by city

df_grouped = df_onehot.groupby('city/town').sum().reset_index() 
df_grouped

In [None]:
# View size
df_grouped.shape

In [None]:
# # Boxplots

# plt.figure(figsize=(20, 10))
# plt.xticks(rotation='vertical')
# sns.boxplot

# ax = sns.boxplot(data = df_grouped)
# ax.set_ylabel('Venue Count', fontsize=25)
# ax.set_xlabel('Venue Type', fontsize=25)
# ax.tick_params(labelsize=20)
# plt.xticks(rotation=45, ha='right')

# plt.show()

In [None]:
# num_top_venues = 5

# for city in scc_grouped['city/town']:
#     print("----"+city+"----")
#     temp = scc_grouped[scc_grouped['city/town'] == city].T.reset_index()
#     temp.columns = ['venue','freq']
#     temp = temp.iloc[1:]
#     temp['freq'] = temp['freq'].astype(float)
#     temp = temp.round({'freq': 2})
#     print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
#     print('\n')

In [None]:
# def return_most_common_venues(row, num_top_venues):
#     row_categories = row.iloc[1:]
#     row_categories_sorted = row_categories.sort_values(ascending=False)
    
#     return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# num_top_venues = 10

# indicators = ['st', 'nd', 'rd']

# # create columns according to number of top venues
# columns = ['city/town']
# for ind in np.arange(num_top_venues):
#     try:
#         columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
#     except:
#         columns.append('{}th Most Common Venue'.format(ind+1))

# # create a new dataframe
# city_venues_sorted = pd.DataFrame(columns=columns)
# city_venues_sorted['city/town'] = scc_grouped['city/town']

# for ind in np.arange(scc_grouped.shape[0]):
#     city_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scc_grouped.iloc[ind, :], num_top_venues)

# city_venues_sorted.head()

# Modeling

In [None]:
# Scale the price

sc = StandardScaler()
price_scaled = sc.fit_transform(df[['average_price']])

In [None]:
#  Add scaled price to the dataframe
price_df = pd.DataFrame(df_grouped)
price_df['Scaled_Price'] = price_scaled

print(price_df.shape)
price_df.head()

## Linear Regression

In [None]:
# Define X and y
X = price_df.drop(columns=['city/town', 'Scaled_Price'])
y = price_df['Scaled_Price']

In [None]:
# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Instantiate and fit model

lr = LinearRegression(normalize=True)

model = lr.fit(X_train, y_train)

In [None]:
# Score on training set. (We'll use R^2 for the score today.)
print(f'Training R2 Score: {round(model.score(X_train, y_train),2)}')

# Score on testing set.
print(f'Testing R2 Score: {round(model.score(X_test, y_test),2)}')



In [None]:
# Mean Squared Error

y_pred = lr.predict(X_test)
print(f'MSE: {round(mean_squared_error(y_test, y_pred),2)}')

In [None]:
# let's see how well Linear Regression fit the problem
y_pred = lr.predict(X_test)

print('R2-Score:', r2_score(y_test, y_pred)) # r2 score
print('Mean Squared Error:', mean_squared_error(y_test, y_pred)) # mse

print('Max positive coefs:', lr.coef_[np.argsort(-lr.coef_)[:10]])

print('Venue types with most postive effect:', X.columns[np.argsort(-lr.coef_)[:10]].values)

print('Max negative coefs:', lr.coef_[np.argsort(lr.coef_)[:10]])

print('Venue types with most negative effect:', X.columns[np.argsort(lr.coef_)[:10]].values)

coef_abs = abs(lr.coef_)
print('Min coefs:', lr.coef_[np.argsort(coef_abs)[:10]])
print('Venue types with least effect:', X.columns[np.argsort(coef_abs)[:10]].values)

# PCA Modeling

In [None]:
# Define X and y
X = price_df.drop(columns=['city/town', 'Scaled_Price'])
y = price_df['Scaled_Price']

In [None]:
# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Instantiate StandardScaler.
sc = StandardScaler()

# Standardize X_train.
X_train = sc.fit_transform(X_train)

# Standardize X_test.
X_test = sc.transform(X_test)

In [None]:
# Instantiate and fit PCA
pca = PCA(svd_solver='auto', random_state = 0)

# Fit PCA on the training data.
pca.fit(X_train) 

In [None]:
# Transform PCA
Z_train = pca.transform(X_train)

Z_test = pca.transform(X_test)

In [None]:
# Let's check out the resulting data.
pd.DataFrame(Z_train).describe()

In [None]:

# Pull the explained variance attribute.
var_exp = pca.explained_variance_ratio_
print(f'Explained variance (first 20 components): {np.round(var_exp[:20],3)}')

print('')

# Generate the cumulative explained variance.
cum_var_exp = np.cumsum(var_exp)
print(f'Cumulative explained variance (first 20 components): {np.round(cum_var_exp[:20],3)}')

In [None]:
# Plot the variance explained (and cumulative variance explained).

# Set figure size.
plt.figure(figsize=(12,8))

# Plot the explained variance.
plt.plot(range(len(var_exp)), var_exp, lw=3, label = 'Variance Explained')

# Plot the cumulative explained variance.
plt.plot(range(len(var_exp)), cum_var_exp, lw=3, color = 'orange', label = 'Cumulative Variance Explained')


# Add horizontal lines at y=0 and y=1.
plt.axhline(y=0, linewidth=1, color='grey', ls='dashed')
plt.axhline(y=1, linewidth=1, color='grey', ls='dashed')

# Set the limits of the axes.
plt.xlim([-1,21])
plt.ylim([-0.01,1.01])

# Label the axes.
plt.ylabel('Variance Explained', fontsize=20)
plt.xlabel('Principal Component', fontsize=20)

# Make the tick labels bigger
plt.xticks(range(0, 21, 5), range(1, 22, 5), fontsize=12)
plt.yticks(fontsize=12)
    
# Add title and legend.
plt.title('Component vs. Variance Explained', fontsize=24)
plt.legend(fontsize=11);

In [None]:
# Instantiate PCA with 10 components.
pca = PCA(n_components = 10, random_state = 42)

# Fit PCA to training data.
pca.fit(X_train)

In [None]:
# Instantiate linear regression model.
lm = LinearRegression()

# Transform Z_train and Z_test.
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)

# Fit on Z_train.
lm.fit(Z_train, y_train)

# Score on training and testing sets.
print(f'Training Score: {round(lm.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(lm.score(Z_test, y_test),4)}')

In [None]:
X = price_df.drop(columns=['city/town', 'Scaled_Price'])
y = price_df['Scaled_Price']

# First, apply PCA
pca = PCA(svd_solver='auto', random_state=0)
X_pca = pca.fit_transform(scale(X))

In [None]:
n_component_list = range(1, 51)
r2_list = []
mse_list = []

# Second, Linear Regression
for i in n_component_list:
    lreg = LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X_pca[:,:i], y, test_size=0.2, random_state=0)
    model = lreg.fit(X_train, y_train)
    # check the result
    y_pred = lreg.predict(X_test)
    r2 = r2_score(y_test, y_pred) # r2 score
    mse = mean_squared_error(y_test, y_pred) # mse
    r2_list.append(r2)
    mse_list.append(mse)
    
scores_df = pd.DataFrame.from_dict(dict([('NComponents', n_component_list),
                                        ('R2', r2_list),
                                        ('MSE', mse_list)]))
scores_df.set_index('NComponents', inplace=True)

In [None]:
# plot the scores to see the best n_components
plt.subplot(1, 3, 1)
scores_df['R2'].plot(kind='line')
plt.title('R2 score / n components')
plt.ylabel('R2 score')
plt.xlabel('n components')

plt.subplot(1, 3, 3)
scores_df['MSE'].plot(kind='line')
plt.title('MSE score / n components')
plt.ylabel('MSE score')
plt.xlabel('n components')

plt.show()

In [None]:

r2_max = scores_df['R2'].idxmax()
print("Best n:", r2_max, "R2 score:", scores_df['R2'][r2_max])

mse_min = scores_df['MSE'].idxmin()
print("Best n:", mse_min, "MSE:", scores_df['MSE'][mse_min])

In [None]:
# Use the best n_components parameter
lreg = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X_pca[:,:r2_max], y, test_size=0.2, random_state=0)
model = lreg.fit(X_train, y_train)

# check the result
y_pred = lreg.predict(X_test)
r2 = r2_score(y_test, y_pred) # r2 score
mse = mean_squared_error(y_test, y_pred) # mse
print("R2 score:", r2)
print("MSE:", mse)

In [None]:
# Let's try to project the coefs back to the original number of features
eigenvectors = pca.components_
pcr_coefs = eigenvectors[:r2_max, :].T @ lreg.coef_

pcr_coefs.shape

In [None]:
# Let's check which venue types effect the most and least
print('\nMax positive coefs:', pcr_coefs[np.argsort(-pcr_coefs)[:10]])
print('\nVenue types with most positive effect:', X.columns[np.argsort(-pcr_coefs)[:10]].values)
print('\nMax negative coefs:', pcr_coefs[np.argsort(pcr_coefs)[:10]])
print('\nVenue types with most negative effect:', X.columns[np.argsort(pcr_coefs)[:10]].values)
coef_abs = abs(pcr_coefs)
print('\nMin coefs:', pcr_coefs[np.argsort(coef_abs)[:10]])
print('\nVenue types with least effect:', X.columns[np.argsort(coef_abs)[:10]].values)

## Clustering

In [None]:
# set number of clusters
kclusters = 5

scc_grouped_clustering = df_grouped.drop('city/town', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scc_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
city_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scc_merged = df

scc_merged = scc_merged.join(city_venues_sorted.set_index('city/town'), on='city/town')

scc_merged.head() 

In [None]:
# Obtain coordinates for Santa Clara

address = 'Santa Clara, California'

geolocator = Nominatim(user_agent="http://nominatim.openstreetmap.org/reverse?email=jpander@gmail.com&format=xml&lat=-23.56320001&lon=-46.66140002&zoom=27&addressdetails=1")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scc_merged['latitude'], scc_merged['longitude'], scc_merged['city/town'], scc_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow, #rainbow[cluster-1],
        fill=True,
        fill_color=rainbow, #rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Cluster 1

In [None]:
scc_merged.loc[scc_merged['Cluster Labels'] == 0, scc_merged.columns[[1] + list(range(5, scc_merged.shape[1]))]]

#### Cluster 2

In [None]:
scc_merged.loc[scc_merged['Cluster Labels'] == 1, scc_merged.columns[[1] + list(range(5, scc_merged.shape[1]))]]

#### Cluster 3

In [None]:
scc_merged.loc[scc_merged['Cluster Labels'] == 2, scc_merged.columns[[1] + list(range(5, scc_merged.shape[1]))]]

#### Cluster 4

In [None]:
scc_merged.loc[scc_merged['Cluster Labels'] == 3, scc_merged.columns[[1] + list(range(5, scc_merged.shape[1]))]]

#### Cluster 5

In [None]:
scc_merged.loc[scc_merged['Cluster Labels'] == 4, scc_merged.columns[[1] + list(range(5, scc_merged.shape[1]))]]