In [1]:
#Support Vector Regression


In [2]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

### Data Import 

In [3]:
#Data Import
zipCSVPath = '../../data/Zip_Code_Data.csv'
zipDF = pd.read_csv(zipCSVPath)
zipDF['Zip'] = zipDF['Zip'].astype(str)
laDF = zipDF[(zipDF.City=="Los Angeles")]
laDFTarget = laDF[['Zip',
                    'Total_Pop',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]

la_data = pd.read_csv('../../data/la_combined.csv').drop(columns='Unnamed: 0')
la_data['Zip'] = la_data['Zip'].astype(str)


la_data = la_data.merge(laDFTarget)

In [4]:
la_data_model = la_data.iloc[:103,:]
la_data_test = la_data.iloc[-20:,:]

In [32]:
la_data_model.head()

Unnamed: 0,Zip,Cluster_Labels,LAT,LNG,Num_Disp,Total_Households,Household_Median_Income,Total_Pop,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied
0,90001,2,33.973274,-118.24897,7,13669,43360,59832,63.3,10.9,51.7,35.9,64.1
1,90002,2,33.948951,-118.24698,1,12917,37285,53302,62.8,10.4,47.7,35.0,65.0
2,90003,2,33.96335,-118.273936,1,17484,40598,73730,63.0,10.0,49.0,28.3,71.7
3,90004,2,34.075721,-118.303017,1,22004,49675,60541,77.5,15.8,49.3,16.6,83.4
4,90005,3,34.059071,-118.303017,0,16781,38491,39732,78.6,16.8,50.4,7.7,92.3


In [31]:
la_data_test.head()

Unnamed: 0,Zip,Cluster_Labels,LAT,LNG,Num_Disp,Total_Households,Household_Median_Income,Total_Pop,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
103,91352,1,34.22551,-118.376517,26,12436,57145,47076,72.9,17.9,49.3,53.4,46.6,5.087365,0.804332
104,91356,2,34.169903,-118.540489,3,11802,76929,29822,78.7,27.1,48.6,56.8,43.2,3.623057,0.207686
105,91364,2,34.162012,-118.598469,4,10648,106225,27971,77.6,25.7,49.4,66.8,33.2,3.415476,0.146131
106,91367,2,34.178584,-118.609579,3,18433,96085,45970,78.9,22.7,47.3,53.1,46.9,5.242073,0.747358
107,91401,1,34.181152,-118.434827,3,14933,53882,39755,76.4,18.7,49.0,33.8,66.2,4.245392,0.415131


### Scale Data

In [7]:
#Map Domain data
X = la_data_model[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
#Map Range data
y = la_data_model['Num_Disp'].to_numpy().reshape((-1,1))

In [8]:
#Scale Model Data
sc_X = StandardScaler()
sc_y = StandardScaler()

X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y).ravel()


In [9]:
#Scale Test Data
X_test = la_data_test[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
y_test = la_data_test['Num_Disp'].to_numpy().reshape((-1,1))

X_test = sc_X.fit_transform(X_test)
y_test = sc_y.fit_transform(y_test).ravel()

### Grid Search to find optimal parameters for model

In [10]:
# Use grid Search to find the best parameters for the data

parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'),'C':[0.5,1,5,10,50],'epsilon':[0.01,0.1,0.5,1,5,10,50]}

regr = svm.SVR()
clf = GridSearchCV(regr, parameters)
clf.fit(X,y)

clf.best_params_

{'C': 0.5, 'epsilon': 0.1, 'kernel': 'rbf'}

In [11]:
#Create a model with grid search parameters
regr = svm.SVR(**clf.best_params_)
regr.fit(X,y)

SVR(C=0.5)

### Testing the Model


In [64]:
def percentDiff(x1, y1):
    dummy = []
    for i,e in enumerate(x1):
        if e != 0:
            foo = abs(y1[i] - e) / e
        else:
            foo = y1[i]
        dummy.append(round(100*foo,2))
    return dummy

In [65]:
# Predict Values from model
y_pred = regr.predict(X_test)

y_pred = list(sc_y.inverse_transform(y_pred))
y_test = la_data_test['Num_Disp']


la_data_test['Predicted_Value'] = y_pred

la_data_test['Percent_Difference'] = percentDiff(y_test, y_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [67]:
la_data_test[la_data_test['Percent_Difference'] < 40][['Num_Disp',
                                                       'Total_Pop',
                                                       'Total_Households', 
                                                       'Household_Median_Income',
                                                       'Percent_21_and_Older',
                                                       'Percent_60_and_Older',
                                                       'Percent_Male',
                                                       'Pct_Owner_Occupied',
                                                       'Pct_Renter_Occupied',
                                                       'Predicted_Value',
                                                       'Percent_Difference'
                                                      ]].describe()

Unnamed: 0,Num_Disp,Total_Pop,Total_Households,Household_Median_Income,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,5.666667,31998.666667,12440.5,80690.833333,78.2,20.233333,48.883333,42.683333,57.316667,4.594312,24.35
std,2.42212,12336.908489,3398.772764,25443.843659,5.155967,5.320777,0.397073,17.000873,17.000873,1.294741,9.271377
min,3.0,19672.0,9493.0,47037.0,72.5,14.5,48.5,25.1,33.2,3.415476,14.61
25%,4.0,26576.0,10021.75,63186.0,74.075,15.8,48.525,28.6,45.6,3.637111,16.885
50%,5.0,28896.5,11225.0,80468.5,78.15,19.65,48.85,40.15,59.85,4.151531,23.38
75%,7.5,32138.0,13931.25,100670.75,80.8,24.4,49.175,54.4,71.4,5.328227,29.3425
max,9.0,55506.0,18246.0,111341.0,86.0,27.1,49.4,66.8,74.9,6.661236,38.68


In [68]:
la_data_test[la_data_test['Percent_Difference'] > 40][['Num_Disp',
                                                       'Total_Pop',
                                                       'Total_Households', 
                                                       'Household_Median_Income',
                                                       'Percent_21_and_Older',
                                                       'Percent_60_and_Older',
                                                       'Percent_Male',
                                                       'Pct_Owner_Occupied',
                                                       'Pct_Renter_Occupied',
                                                       'Predicted_Value',
                                                       'Percent_Difference'
                                                      ]].describe()

Unnamed: 0,Num_Disp,Total_Pop,Total_Households,Household_Median_Income,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,6.285714,39705.785714,14314.428571,77122.142857,77.457143,19.128571,49.3,42.257143,57.742857,4.912129,203.829286
std,8.835493,14566.543855,3829.25654,34076.289451,4.323943,3.596243,1.261867,14.575887,14.575887,0.733813,180.92742
min,0.0,15292.0,5600.0,45796.0,70.3,14.7,47.3,17.9,21.9,3.691204,41.51
25%,0.25,30683.75,12552.5,53889.25,74.525,16.45,48.45,33.575,48.425,4.361159,64.81
50%,2.5,38298.0,15009.0,65581.5,76.5,18.95,49.4,38.8,61.2,4.99419,112.74
75%,7.0,46799.5,16600.25,89982.75,80.7,20.575,50.05,51.575,66.425,5.312092,356.81
max,26.0,72059.0,19839.0,174085.0,84.9,27.8,51.7,78.1,82.1,6.160512,512.77


### Sacramento  data 


In [69]:
#Incorperate Other City Data to model

sacDF = zipDF[zipDF.City=="Sacramento"]
sacDFTarget = sacDF[['Zip', 
                   'LAT', 
                   'LNG', 
                   "Num_Disp",
                   'Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]
sacDFTarget['Zip'] = sacDFTarget['Zip'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [70]:
#Scale City data
X2_test = sacDFTarget[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()

X2_test = sc_X.fit_transform(X2_test)



In [71]:
#Create prediction from previous model
y2_pred = regr.predict(X2_test)

In [72]:
y2_pred = list(sc_y.inverse_transform(y2_pred))
y2_test = sacDFTarget['Num_Disp']


sacDFTarget['Predicted_Value'] = y2_pred

sacDFTarget['Percent_Difference'] = percentDiff(y2_test, y2_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [73]:
sacDFTarget[sacDFTarget['Percent_Difference'] <= 30][['Num_Disp',
                                                       'Total_Pop',
                                                       'Total_Households', 
                                                       'Household_Median_Income',
                                                       'Percent_21_and_Older',
                                                       'Percent_60_and_Older',
                                                       'Percent_Male',
                                                       'Pct_Owner_Occupied',
                                                       'Pct_Renter_Occupied',
                                                       'Predicted_Value',
                                                       'Percent_Difference'
                                                      ]].describe()

Unnamed: 0,Num_Disp,Total_Pop,Total_Households,Household_Median_Income,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,
std,,,,,,,,,,,
min,,,,,,,,,,,
25%,,,,,,,,,,,
50%,,,,,,,,,,,
75%,,,,,,,,,,,
max,,,,,,,,,,,


In [74]:
#These zip codes least accuratly predict the number of dispensaries in Sac, so if our hypothesis that LA is the gold star 
#to which all cities should base their dispensary locations, then these would be ideal locations for new dispensaries. 
sacDFTarget[sacDFTarget['Percent_Difference'] > 30][['Num_Disp',
                                                       'Total_Pop',
                                                       'Total_Households', 
                                                       'Household_Median_Income',
                                                       'Percent_21_and_Older',
                                                       'Percent_60_and_Older',
                                                       'Percent_Male',
                                                       'Pct_Owner_Occupied',
                                                       'Pct_Renter_Occupied',
                                                       'Predicted_Value',
                                                       'Percent_Difference'
                                                      ]].describe()

Unnamed: 0,Num_Disp,Total_Pop,Total_Households,Household_Median_Income,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,1.45,33087.35,11854.7,60847.35,74.285,18.055,49.05,45.15,54.85,5.254556,301.279
std,1.431782,18059.943732,5308.868272,20053.753323,8.19077,4.291053,2.670403,15.379361,15.379361,1.405417,207.357054
min,0.0,6294.0,3147.0,33938.0,63.7,13.3,45.7,7.2,30.6,2.277022,51.96
25%,0.0,19217.25,8802.5,48200.25,68.725,13.85,47.05,38.35,45.775,4.64311,135.095
50%,1.0,33366.5,10834.5,56522.5,72.2,18.1,48.7,47.5,52.5,5.551679,212.89
75%,2.0,40901.0,15405.25,66814.5,77.925,20.45,49.925,54.225,61.65,5.990743,468.4725
max,4.0,79440.0,23710.0,106514.0,92.0,29.3,57.9,69.4,92.8,7.700432,770.04


In [75]:
location = '../../data/sacDispDataSVR.csv'
sacDFTarget.to_csv(location)

In [76]:
sacDFTarget

Unnamed: 0,Zip,LAT,LNG,Num_Disp,Total_Pop,Total_Households,Household_Median_Income,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied,Predicted_Value,Percent_Difference
123,95814,38.58167,-121.49591,1.0,11908,6709,33938,92.0,21.5,57.9,7.2,92.8,5.593368,459.34
124,95815,38.6057,-121.44588,4.0,25673,9226,34583,69.2,13.7,51.5,33.7,66.3,6.391143,59.78
125,95824,38.51773,-121.44093,0.0,30296,9103,38985,67.1,15.7,49.8,39.5,60.5,5.173192,517.32
126,95832,38.44796,-121.49676,0.0,12114,3147,47341,65.5,15.8,50.7,49.3,50.7,4.958662,495.87
127,95823,38.47424,-121.4431,0.0,79440,23710,47553,67.3,15.7,48.6,45.7,54.3,7.700432,770.04
128,95838,38.64462,-121.44059,2.0,39053,11056,48416,66.5,13.3,50.3,50.3,49.7,5.722382,186.12
129,95817,38.55166,-121.45074,2.0,13758,6148,50925,81.6,18.0,45.7,40.7,59.3,3.427095,71.35
130,95820,38.53825,-121.44668,2.0,36437,12964,51068,73.7,18.7,48.9,53.0,47.0,5.509989,175.5
131,95828,38.48881,-121.40283,4.0,58717,17852,53229,70.9,19.8,48.8,61.9,38.1,7.68883,92.22
132,95616,38.56296,-121.81601,1.0,52212,17302,55510,63.7,13.6,46.3,34.9,65.1,4.826902,382.69
