In [1]:
# import libraries

import pandas as pd 
import numpy as np 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Load the venues table from CSV and preview its first five rows

venues_df = pd.read_csv(filepath_or_buffer='../data/venues_df.csv')
venues_df.head(n=5)

Unnamed: 0,neighborhood,latitude,longitude,venue_name,venue_latitude,venue_longitude,venue_type
0,Anderson Mill,30.455834,-97.807096,Millrun Park,30.451548,-97.802975,Park
1,Anderson Mill,30.455834,-97.807096,Harper Park (in Anderson Mill),30.457869,-97.811024,Park
2,Anderson Mill,30.455834,-97.807096,Freda's Seafood Grille,30.464196,-97.803776,Seafood Restaurant
3,Anderson Mill,30.455834,-97.807096,Interstellar Bbq,30.461178,-97.81493,BBQ Joint
4,Anderson Mill,30.455834,-97.807096,Punch Austin Kettlebell Gym,30.447413,-97.810296,Gym / Fitness Center


In [3]:
# Load the neighborhood price table from CSV and preview its first five rows

data_df = pd.read_csv(filepath_or_buffer='../data/data_df.csv')
data_df.head(n=5)

Unnamed: 0,neighborhood,location,latitude,longitude,url_format,url,median_prices,avg_price_per_sqft
0,Anderson Mill,"Anderson Mill, Austin, Texas",30.455834,-97.807096,anderson-mill,https://www.neighborhoods.com/anderson-mill-au...,321495.0,194.0
1,Barton Hills,"Barton Hills, Austin, Texas",30.251571,-97.784106,barton-hills,https://www.neighborhoods.com/barton-hills-aus...,667750.0,487.0
2,Bouldin Creek,"Bouldin Creek, Austin, Texas",30.255667,-97.755481,bouldin-creek,https://www.neighborhoods.com/bouldin-creek-au...,725750.0,546.0
3,Brentwood,"Brentwood, Austin, Texas",30.331264,-97.736465,brentwood,https://www.neighborhoods.com/brentwood-austin-tx,495250.0,370.0
4,Bryker Woods,"Bryker Woods, Austin, Texas",30.305246,-97.754585,bryker-woods,https://www.neighborhoods.com/bryker-woods-aus...,795000.0,558.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data_df.describe()

Unnamed: 0,latitude,longitude,median_prices,avg_price_per_sqft
count,37.0,37.0,37.0,37.0
mean,30.280944,-97.761416,520183.9,381.972973
std,0.07147,0.054907,202841.2,149.404835
min,30.140056,-97.899507,170000.0,164.0
25%,30.236389,-97.781803,382250.0,232.0
50%,30.268054,-97.75046,496400.0,386.0
75%,30.31339,-97.731707,675000.0,499.0
max,30.455834,-97.659055,1002500.0,745.0


In [5]:
# Number of rows in the price table
len(data_df)

37

In [6]:
# Bucket each row into a binary price class from its average price per square
# foot: group 0 when 0 <= price < PRICE_PER_SQFT_THRESHOLD, group 1 otherwise.
# The comparison is vectorized instead of looping over the column in Python.
PRICE_PER_SQFT_THRESHOLD = 400  # cutoff separating group 0 from group 1

price_per_sqft = data_df['avg_price_per_sqft']
in_lower_group = (price_per_sqft >= 0) & (price_per_sqft < PRICE_PER_SQFT_THRESHOLD)

# Anything failing the range check (including NaN or a negative price) falls
# through to group 1, exactly as the else-branch of a per-row loop would.
# Kept as a plain list of ints so it can be assigned to a column directly.
price_groups = (~in_lower_group).astype(int).tolist()

In [7]:
# Attach the price-group labels as a new 'price_group' column
# (assign returns a fresh frame, which is rebound to the same name)
data_df = data_df.assign(price_group=price_groups)

In [8]:
# Preview to confirm the price_group column is present
data_df.head()

Unnamed: 0,neighborhood,location,latitude,longitude,url_format,url,median_prices,avg_price_per_sqft,price_group
0,Anderson Mill,"Anderson Mill, Austin, Texas",30.455834,-97.807096,anderson-mill,https://www.neighborhoods.com/anderson-mill-au...,321495.0,194.0,0
1,Barton Hills,"Barton Hills, Austin, Texas",30.251571,-97.784106,barton-hills,https://www.neighborhoods.com/barton-hills-aus...,667750.0,487.0,1
2,Bouldin Creek,"Bouldin Creek, Austin, Texas",30.255667,-97.755481,bouldin-creek,https://www.neighborhoods.com/bouldin-creek-au...,725750.0,546.0,1
3,Brentwood,"Brentwood, Austin, Texas",30.331264,-97.736465,brentwood,https://www.neighborhoods.com/brentwood-austin-tx,495250.0,370.0,0
4,Bryker Woods,"Bryker Woods, Austin, Texas",30.305246,-97.754585,bryker-woods,https://www.neighborhoods.com/bryker-woods-aus...,795000.0,558.0,1


In [9]:
# View value counts distribution of the two price groups (class balance check)

data_df['price_group'].value_counts()

0    19
1    18
Name: price_group, dtype: int64

In [10]:
# Number of distinct neighborhoods present in the venues table
# (dropna=False counts a missing value once, the same way unique() would)

venues_df['neighborhood'].nunique(dropna=False)

36

In [11]:
# Preview the first rows of the venues table
venues_df.head()

Unnamed: 0,neighborhood,latitude,longitude,venue_name,venue_latitude,venue_longitude,venue_type
0,Anderson Mill,30.455834,-97.807096,Millrun Park,30.451548,-97.802975,Park
1,Anderson Mill,30.455834,-97.807096,Harper Park (in Anderson Mill),30.457869,-97.811024,Park
2,Anderson Mill,30.455834,-97.807096,Freda's Seafood Grille,30.464196,-97.803776,Seafood Restaurant
3,Anderson Mill,30.455834,-97.807096,Interstellar Bbq,30.461178,-97.81493,BBQ Joint
4,Anderson Mill,30.455834,-97.807096,Punch Austin Kettlebell Gym,30.447413,-97.810296,Gym / Fitness Center


In [12]:
# Number of distinct venue types (categories), not of individual venues
# (dropna=False counts a missing value once, the same way unique() would)

venues_df['venue_type'].nunique(dropna=False)

270

In [13]:
# Drop unnecessary columns (coordinates and venue names).
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.

venues_df = venues_df.drop(
    columns=['latitude', 'longitude', 'venue_name', 'venue_latitude', 'venue_longitude'],
    errors='ignore',
)
venues_df.head()

Unnamed: 0,neighborhood,venue_type
0,Anderson Mill,Park
1,Anderson Mill,Park
2,Anderson Mill,Seafood Restaurant
3,Anderson Mill,BBQ Joint
4,Anderson Mill,Gym / Fitness Center


In [14]:
# Non-null entries per remaining column for each neighborhood (first five groups)

(
    venues_df
    .groupby('neighborhood')
    .count()
    .head()
)

Unnamed: 0_level_0,venue_type
neighborhood,Unnamed: 1_level_1
Anderson Mill,18
Barton Hills,30
Bouldin Creek,100
Brentwood,59
Bryker Woods,47


In [15]:
# Dummify venue types: one indicator column per venue type, one row per venue.
# dtype=int pins 0/1 integers; the default dtype differs between pandas versions
# (uint8 before 2.0, bool from 2.0 on), which would change what previews show.

df_onehot = pd.get_dummies(venues_df[['venue_type']], prefix='', prefix_sep='', dtype=int)

# Put the neighborhood label in the first column. insert() places it directly
# (no append-then-reorder step) and raises ValueError if a venue type is itself
# named 'neighborhood', rather than silently overwriting that indicator column.
df_onehot.insert(0, 'neighborhood', venues_df['neighborhood'])

In [16]:
# Preview the one-hot encoded venues table

df_onehot.head()

Unnamed: 0,neighborhood,ATM,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Check new shape as (rows, columns); the columns are the venue-type
# indicators plus 'neighborhood'

df_onehot.shape

(1901, 271)

In [18]:
# Sum the indicator columns within each neighborhood to get per-type venue
# counts; as_index=False keeps 'neighborhood' as a regular column

df_grouped = df_onehot.groupby('neighborhood', as_index=False).sum()

In [19]:
# Preview the per-neighborhood venue-type counts
df_grouped.head()

Unnamed: 0,neighborhood,ATM,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Barton Hills,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,Bouldin Creek,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
3,Brentwood,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,Bryker Woods,0,0,0,0,4,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [20]:
# View size as (rows, columns) of the grouped table

df_grouped.shape

(36, 271)

In [21]:
# Combine venue-type counts with the price data, joined on 'neighborhood'.
# how='inner' (the default, now explicit) keeps only neighborhoods present in
# both frames, so a neighborhood missing from either side is dropped here.
# validate='one_to_one' makes pandas raise MergeError if either frame repeats a
# neighborhood, instead of silently duplicating rows in the combined table.

df_comb = pd.merge(
    df_grouped,
    data_df,
    how='inner',
    on='neighborhood',
    validate='one_to_one',
)
df_comb.head()

Unnamed: 0,neighborhood,ATM,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,...,Women's Store,Yoga Studio,location,latitude,longitude,url_format,url,median_prices,avg_price_per_sqft,price_group
0,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,"Anderson Mill, Austin, Texas",30.455834,-97.807096,anderson-mill,https://www.neighborhoods.com/anderson-mill-au...,321495.0,194.0,0
1,Barton Hills,0,0,0,0,0,0,0,0,0,...,0,1,"Barton Hills, Austin, Texas",30.251571,-97.784106,barton-hills,https://www.neighborhoods.com/barton-hills-aus...,667750.0,487.0,1
2,Bouldin Creek,0,0,0,0,0,0,0,0,0,...,0,1,"Bouldin Creek, Austin, Texas",30.255667,-97.755481,bouldin-creek,https://www.neighborhoods.com/bouldin-creek-au...,725750.0,546.0,1
3,Brentwood,0,0,0,0,1,1,0,0,0,...,0,1,"Brentwood, Austin, Texas",30.331264,-97.736465,brentwood,https://www.neighborhoods.com/brentwood-austin-tx,495250.0,370.0,0
4,Bryker Woods,0,0,0,0,4,0,0,0,0,...,0,0,"Bryker Woods, Austin, Texas",30.305246,-97.754585,bryker-woods,https://www.neighborhoods.com/bryker-woods-aus...,795000.0,558.0,1


In [22]:
# Drop unnecessary columns (location text, coordinates, URLs and raw prices).
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.

df_comb = df_comb.drop(
    columns=['location', 'latitude', 'longitude', 'url_format', 'url', 'median_prices', 'avg_price_per_sqft'],
    errors='ignore',
)
df_comb.head()

Unnamed: 0,neighborhood,ATM,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,...,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio,price_group
0,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Barton Hills,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
2,Bouldin Creek,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,1
3,Brentwood,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,Bryker Woods,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Persist the combined table for the modeling step; index=False leaves the
# row index out of the CSV

df_comb.to_csv(path_or_buf='../data/df_comb.csv', index=False)