### Modeling: Predicting Median Rent from Yelp 

##### Imports

In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from geopy import distance as d

import sys
import os
sys.path.insert(0, os.path.abspath("../"))

import CRA as c

In [28]:
yelp = pd.read_csv("../datasets/seattle_restaurants.csv")

seattle = pd.read_csv("../datasets/seattle_demographics.csv")

##### Formatting

In [35]:
yelp.head()

Unnamed: 0.1,Unnamed: 0,name,price,categories,coordinates,cra
0,0,Pike Place Chowder,2,"['seafood', 'soup']","[47.60939, -122.34112]",Downtown Commercial Core
1,1,Piroshky Piroshky,1,"['bakeries', 'russian']","[47.60991, -122.34231]",Belltown
2,2,The Pink Door,2,"['italian', 'wine_bars', 'seafood']","[47.61028, -122.3425]",Belltown
3,3,Paseo,2,"['caribbean', 'cuban', 'sandwiches']","[47.65849, -122.35031]",Fremont
4,4,Ellenos Real Greek Yogurt,1,['icecream'],"[47.608912, -122.34058]",Downtown Commercial Core


In [43]:
yelp.drop(columns = ["Unnamed: 0"], inplace = True)

In [49]:
seattle.head()

Unnamed: 0,OBJECTID,CRA_NO,CRA_GRP,GEN_ALIAS,DETL_NAMES,NEIGHDIST,AREA_SQMI,DISPLAY_NAME,TOTAL_POPULATION,MEDIAN_AGE,...,SHAPE_Area,PCT_ALL_FAMILY_UNDER_POVERTY,PCT_POPULATION_UNDER_POVERTY,HU_VALUE_MEDIAN_DOLLARS,AVERAGE_HOUSEHOLD_SIZE,CIVILIAN_LABOR_FORCE_UNEMPLOYD,CIVILIAN_LABOR_FORCE,CIVILIAN_LABOR_FORCE_EMPLOYED,NOT_IN_LABOR_FORCE,MEDIAN_HH_INC_PAST_12MO_DOLLAR
0,1,10.4,10,Ballard,"Ballard, West Woodland, Adams",Ballard,0.77,CRA - Ballard,8649,34.3,...,21472310.0,0.0,7.0,543200,1.62,324,6313,5989,1736,79162
1,2,10.1,10,North Beach/Blue Ridge,"Crown Hill, North Beach, Blue Ridge",Ballard,2.01,CRA - North Beach-Blue Ridge,12701,42.6,...,55950010.0,6.6,7.8,658600,2.38,268,7787,7519,2666,94804
2,3,7.1,7,Montlake/Portage Bay,"Montlake, Portage Bay, Interlaken Park, Eastla...",Northeast,1.49,CRA - Montlake-Portage Bay,9732,37.3,...,41429080.0,2.0,4.6,821250,2.09,110,6518,6408,1941,132573
3,4,12.2,12,Interbay,Interbay,Magnolia/Queen Anne,1.9,CRA - Interbay,11024,34.4,...,52907760.0,4.5,8.7,571300,1.92,334,7885,7551,1675,74679
4,5,6.3,6,North Capitol Hill,"North Capitol Hill, Capitol Hill, North Broadway",East,0.44,CRA - North Capitol Hill,4807,36.1,...,12356840.0,1.2,2.3,896200,1.93,149,3415,3266,759,96220


In [50]:
keeps = ["GEN_ALIAS", "AREA_SQMI", "MEDIAN_GROSS_RENT"]

seattle_rent = seattle.loc[:, keeps].copy()

In [58]:
y = yelp["cra"].unique()
s = seattle["GEN_ALIAS"].unique()

[print(x) for x in y if x not in s]

[]

In [57]:
yelp = yelp[yelp["cra"] != "Not Found"].copy()

In [62]:
seattle_rent.head()

Unnamed: 0,GEN_ALIAS,AREA_SQMI,MEDIAN_GROSS_RENT,1 dollar
0,Ballard,0.77,1542,0
1,North Beach/Blue Ridge,2.01,1476,0
2,Montlake/Portage Bay,1.49,1723,0
3,Interbay,1.9,1490,0
4,North Capitol Hill,0.44,1576,0


In [63]:
filler = [0 for i in range(seattle_rent.shape[0])]

for i in range(1, 5): 
    seattle_rent[f"{i} dollar"] = filler

In [131]:
# add columns in seattle df for metrics 
# proportion of each tier of dollar sign rating 

def dollar_rating(seattle_df, yelp_df, neighborhood, dollar_tier): 
    yelp_hood = yelp_df[yelp_df["cra"] == neighborhood]
    
    if yelp_df[yelp_df["cra"] == neighborhood].shape[0] == 0: 
        proportion = 0
    else: 
        num_businesses = yelp_df[yelp_df["cra"] == neighborhood].shape[0]

        proportion = (yelp_hood[yelp_hood["price"] == dollar_tier].shape[0])/num_businesses
    
    hood_index = seattle_df.index[seattle_df["GEN_ALIAS"] == neighborhood].tolist()
    
    seattle_df.loc[hood_index, [f"{dollar_tier} dollar"]] = proportion
    

In [132]:
hoods = list(seattle_rent["GEN_ALIAS"].unique())

for hood in hoods: 
    for i in range(1, 5): 
        dollar_rating(seattle_rent, yelp, hood, i)

In [133]:
seattle_rent.head()

Unnamed: 0,GEN_ALIAS,AREA_SQMI,MEDIAN_GROSS_RENT,1 dollar,2 dollar,3 dollar,4 dollar
0,Ballard,0.77,1542,0.229167,0.75,0.020833,0.0
1,North Beach/Blue Ridge,2.01,1476,0.0,1.0,0.0,0.0
2,Montlake/Portage Bay,1.49,1723,0.142857,0.714286,0.142857,0.0
3,Interbay,1.9,1490,0.4,0.6,0.0,0.0
4,North Capitol Hill,0.44,1576,0.25,0.75,0.0,0.0


In [None]:
yelp.reset_index(inplace = True)

##### Functions
Defining functions for finding businesses within a certain radius of a selected location

In [136]:
yelp.head()

Unnamed: 0,name,price,categories,coordinates,cra
0,Pike Place Chowder,2,"['seafood', 'soup']","[47.60939, -122.34112]",Downtown Commercial Core
1,Piroshky Piroshky,1,"['bakeries', 'russian']","[47.60991, -122.34231]",Belltown
2,The Pink Door,2,"['italian', 'wine_bars', 'seafood']","[47.61028, -122.3425]",Belltown
3,Paseo,2,"['caribbean', 'cuban', 'sandwiches']","[47.65849, -122.35031]",Fremont
4,Ellenos Real Greek Yogurt,1,['icecream'],"[47.608912, -122.34058]",Downtown Commercial Core


In [141]:
yelp.dtypes

name           object
price           int64
categories     object
coordinates    object
cra            object
dtype: object

In [148]:
filler = [0 for x in range(yelp.shape[0])]

yelp["latitude"] = filler 
yelp["longitude"] = filler

In [164]:
def separate_coordinates(yelp_df): 
    for coordinate in yelp_df["coordinates"]: 
        index = yelp.index[yelp["coordinates"] == coordinate]
        
        split = coordinate.split(",")
        lat = split[0].replace("[", "")
        long = split[1].replace(" ", "").replace("]", "")
        yelp.loc[index, ["latitude"]] = lat
        yelp.loc[index, ["longitude"]] = long

In [166]:
separate_coordinates(yelp)

In [178]:
yelp["latitude"] = yelp["latitude"].astype("float")

yelp["longitude"] = yelp["longitude"].astype("float")

In [167]:
yelp.head()

Unnamed: 0,name,price,categories,coordinates,cra,latitude,longitude
0,Pike Place Chowder,2,"['seafood', 'soup']","[47.60939, -122.34112]",Downtown Commercial Core,47.60939,-122.34112
1,Piroshky Piroshky,1,"['bakeries', 'russian']","[47.60991, -122.34231]",Belltown,47.60991,-122.34231
2,The Pink Door,2,"['italian', 'wine_bars', 'seafood']","[47.61028, -122.3425]",Belltown,47.61028,-122.3425
3,Paseo,2,"['caribbean', 'cuban', 'sandwiches']","[47.65849, -122.35031]",Fremont,47.65849,-122.35031
4,Ellenos Real Greek Yogurt,1,['icecream'],"[47.608912, -122.34058]",Downtown Commercial Core,47.608912,-122.34058


In [204]:
# generate gps locations 

gps_dict = {"latitude": [], "longitude": [], "neighborhood": []}

for i in range(1300): 
    lat = round(random.uniform(yelp["latitude"].min(), yelp["latitude"].max()), 6)
    gps_dict["latitude"].append(lat)
    
    long = round(random.uniform(yelp["longitude"].min(), yelp["longitude"].max()), 6)
    gps_dict["longitude"].append(long)
    
    n = c.to_cra([long, lat])
    gps_dict["neighborhood"].append(n)

In [205]:
gps_df = pd.DataFrame(gps_dict)

gps_df = gps_df[gps_df["neighborhood"] != "Not Found"].copy()

gps_df.shape

(993, 3)

In [213]:
d.distance(home, sasha).miles

4.5074044982716455

In [217]:
for i in range(1, 5): 
    gps_df[f"0.5mi {i} dollar"] = [0 for x in range(gps_df.shape[0])]
    gps_df[f"1.0mi {i} dollar"] = [0 for x in range(gps_df.shape[0])]

In [220]:
gps_df.head()

Unnamed: 0,latitude,longitude,neighborhood,0.5mi 1 dollar,1.0mi 1 dollar,0.5mi 2 dollar,1.0mi 2 dollar,0.5mi 3 dollar,1.0mi 3 dollar,0.5mi 4 dollar,1.0mi 4 dollar
1,47.702117,-122.346505,Licton Springs,0,0,0,0,0,0,0,0
2,47.7161,-122.365286,Broadview/Bitter Lake,0,0,0,0,0,0,0,0
4,47.64924,-122.400311,Magnolia,0,0,0,0,0,0,0,0
6,47.610194,-122.320842,Capitol Hill,0,0,0,0,0,0,0,0
7,47.571082,-122.379656,West Seattle Junction/Genesee Hill,0,0,0,0,0,0,0,0


In [245]:
type((gps_df.loc[0, ["latitude"]], gps_df.loc[0, ["longitude"]]))

tuple

In [253]:
bread = (47.679642, -122.290494)

type(bread)

tuple

In [308]:
def radius_dollar_proportion(location, df, radius, dollar_tier):
    length = df.shape[0]
    
    indices = []
    
    for i in range(length): 
        coordinates = (float(df.loc[i, ["latitude"]]), float(df.loc[i, ["longitude"]]))
        if d.distance(location, coordinates).miles <= radius: 
            indices.append(i)
    
    if len(indices) == 0: 
        proportion = 0
    else: 
        surrounding_businesses = df.iloc[indices]
        total_businesses = surrounding_businesses.shape[0]
        proportion = (surrounding_businesses[surrounding_businesses["price"] == dollar_tier].shape[0]/total_businesses)
    
    return proportion
    

In [309]:
radius_dollar_proportion(sasha, yelp, 0.5, 3)

0.0234375

In [311]:
for i in range(gps_df.shape[0]): 
    for dollar in range(1, 5): 
        for radius in [0.5, 1.0]: 
            location = (float(gps_df.loc[i, ["latitude"]]), float(gps_df.loc[i, ["longitude"]]))
            gps_df.loc[i, [f"{radius}mi {dollar} dollar"]] = radius_dollar_proportion(location, yelp, radius, dollar)

In [314]:
gps_df.shape

(993, 12)