In [17]:
# import libraries

import pandas as pd 
import numpy as np 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize, scale
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [18]:
# Read in dataframe

df_comb = pd.read_csv('../data/df_comb.csv')
df_comb.head()

Unnamed: 0,neighborhood,ATM,Accessories Store,Adult Boutique,African Restaurant,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,...,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio,price_group
0,Anderson Mill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Barton Hills,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
2,Bouldin Creek,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,1
3,Brentwood,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,Bryker Woods,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Modeling

#### Prepocessing

In [84]:
# Define X and y

X = df_comb.drop(columns=['neighborhood', 'price_group'])
y = df_comb['price_group']

In [85]:
# Train/test split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [86]:
# # Scale data

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### PCA Modeling

In [90]:
# Instantiate and fit model
pca = PCA(svd_solver='auto', random_state=0)
# X_pca = pca.fit(X_train) 
X_pca = pca.fit_transform(scale(X))

In [91]:
# Code adapted from Medium article on PCA 

n_component_list = range(1, 51)
R2_list = []
MSE_list = []

# Linear Regression
for i in n_component_list:
    lr = LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X_pca[:,:i], y, test_size=0.2, random_state=0)
    model = lr.fit(X_train, y_train)
    # check the result
    y_pred = lr.predict(X_test)
    r2 = r2_score(y_test, y_pred) # r2 score
    mse = mean_squared_error(y_test, y_pred) # mse
    R2_list.append(r2)
    MSE_list.append(mse)
    
scores_df = pd.DataFrame.from_dict(dict([('NComponents', n_component_list),
                                        ('R2', R2_list),
                                        ('MSE', MSE_list)]))
scores_df.set_index('NComponents', inplace=True)

In [92]:
# View best scores

r2_max = scores_df['R2'].idxmax()
print("Best n:", r2_max, ", R2 score:", scores_df['R2'][r2_max])

mse_min = scores_df['MSE'].idxmin()
print("Best n:", mse_min, ", MSE:", scores_df['MSE'][mse_min])

Best n: 7 , R2 score: 0.23021842891698496
Best n: 7 , MSE: 0.18041755572258164


In [103]:
# Use the best n_components parameter

# Instantiate and fit
lr = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X_pca[:,:r2_max], y, test_size=0.2, random_state=0)
model = lr.fit(X_train, y_train)

In [104]:
# View scores

y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred) # r2 score
mse = mean_squared_error(y_test, y_pred) # mse
print("R2 score:", r2)
print("MSE:", mse)

R2 score: 0.23021842891698496
MSE: 0.18041755572258164


In [105]:
# Project the coefs back to the original number of features

eigenvectors = pca.components_
pcr_coefs = eigenvectors[:r2_max, :].T @ lr.coef_

In [106]:
# View with venue types have most and least effect

print('\nMax positive coefs:', pcr_coefs[np.argsort(-pcr_coefs)[:5]])
print('\nVenue types with most positive effect:', X.columns[np.argsort(-pcr_coefs)[:5]].values)
print('\nMax negative coefs:', pcr_coefs[np.argsort(pcr_coefs)[:5]])
print('\nVenue types with most negative effect:', X.columns[np.argsort(pcr_coefs)[:5]].values)
coef_abs = abs(pcr_coefs)
print('\nMin coefs:', pcr_coefs[np.argsort(coef_abs)[:5]])
print('\nVenue types with least effect:', X.columns[np.argsort(coef_abs)[:5]].values)


Max positive coefs: [0.01259863 0.01207248 0.01133876 0.0107604  0.01068665]

Venue types with most positive effect: ['Italian Restaurant' 'Art Gallery' 'Seafood Restaurant' 'Ice Cream Shop'
 'Spa']

Max negative coefs: [-0.01051495 -0.01051495 -0.01051495 -0.01044824 -0.00926685]

Venue types with most negative effect: ['Pool Hall' 'Camera Store' 'Auto Dealership' 'Bagel Shop'
 'Indian Restaurant']

Min coefs: [-7.67178138e-05 -9.50939824e-05 -1.33901987e-04 -1.55390064e-04
 -1.55390064e-04]

Venue types with least effect: ['Playground' 'Platform' 'Tennis Court' 'Kitchen Supply Store'
 'Golf Driving Range']


#### SVM Model

In [38]:
# Instantiate and fit model

svm_model = svm.SVC()
svm_model.fit(X_train, y_train) 

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [39]:
# Generate predictions

y_pred = svm_model.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 0, 0])

In [40]:
# Get accuracy scores

print("SVM Training Accuracy Score: %.3f" % svm_model.score(X_train, y_train))
print("SVM Testing Accuracy Score: %.3f" % svm_model.score(X_test, y_test)) 

SVM Training Accuracy Score: 0.966
SVM Testing Accuracy Score: 0.625


#### Decision Tree

In [97]:
# Instantiate and fit model

dt_model = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
dt_model.fit(X_train,y_train)
# dt_model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [98]:
# Generate predictions

y_pred = dt_model.predict(X_test)
y_pred

array([1, 1, 0, 1, 1, 1, 1, 0])

In [99]:
print("Decision Tree Training Accuracy Score: %.3f" % dt_model.score(X_train, y_train))
print("Decision Tree Testing Accuracy Score: %.3f" % dt_model.score(X_test, y_test))

Decision Tree Training Accuracy Score: 1.000
Decision Tree Testing Accuracy Score: 0.875


#### Logistic Regression

In [100]:
# Instantiate and fit model

logreg_model = LogisticRegression(C=0.01).fit(X_train,y_train)
logreg_model

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [101]:
# Generate predictions

y_pred = logreg_model.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 0, 0])

In [102]:
print("Logreg Training Accuracy Score: %.3f" % logreg_model.score(X_train, y_train))
print("Logreg Tree Testing Accuracy Score: %.3f" % logreg_model.score(X_test, y_test))

Logreg Training Accuracy Score: 0.862
Logreg Tree Testing Accuracy Score: 0.625
