## Support Vector Regression


Libraries Used
* Sklearn: https://scikit-learn.org/stable/modules/svm.html#svm-regression
* Numpy:  https://numpy.org/doc/stable/
* Pandas: https://pandas.pydata.org/docs/user_guide/index.html

In [10]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

### Data Import 

In [11]:
#Import Census data from previous notebook
zipCSVPath = '../../data/Zip_Code_Data.csv'
zipDF = pd.read_csv(zipCSVPath)
zipDF['Zip'] = zipDF['Zip'].astype(str)
laDF = zipDF[(zipDF.City=="Los Angeles")]
laDFTarget = laDF[['Zip',
                    'Total_Pop',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]

In [12]:
#Import Dispensary data from previous notebook
la_data = pd.read_csv('../../data/la_combined.csv').drop(columns='Unnamed: 0')
la_data['Zip'] = la_data['Zip'].astype(str)


In [13]:
#Merge data by zip code
la_data = la_data.merge(laDFTarget)

In [31]:
#Split Data to test model
la_data_model = la_data.iloc[:103,:]
la_data_test = la_data.iloc[-20:,:]

In [33]:
la_data_model.head(10)

Unnamed: 0,Zip,Cluster_Labels,LAT,LNG,Num_Disp,Total_Households,Household_Median_Income,Total_Pop,Percent_21_and_Older,Percent_60_and_Older,Percent_Male,Pct_Owner_Occupied,Pct_Renter_Occupied
0,90001,2,33.973274,-118.24897,7,13669,43360,59832,63.3,10.9,51.7,35.9,64.1
1,90002,2,33.948951,-118.24698,1,12917,37285,53302,62.8,10.4,47.7,35.0,65.0
2,90003,2,33.96335,-118.273936,1,17484,40598,73730,63.0,10.0,49.0,28.3,71.7
3,90004,2,34.075721,-118.303017,1,22004,49675,60541,77.5,15.8,49.3,16.6,83.4
4,90005,3,34.059071,-118.303017,0,16781,38491,39732,78.6,16.8,50.4,7.7,92.3
5,90006,2,34.050102,-118.292689,0,19337,37072,59576,74.0,15.6,49.5,9.0,91.0
6,90007,3,34.028334,-118.285077,4,11919,27406,42433,69.4,10.8,51.6,12.3,87.7
7,90008,2,34.010971,-118.341588,0,14858,43364,31754,80.6,24.8,43.8,33.0,67.0
8,90010,2,34.061391,-118.3097,0,2029,63112,3822,86.8,23.8,47.7,17.6,82.4
9,90011,2,34.007904,-118.259037,4,24433,40940,111165,63.7,9.8,50.6,26.0,74.0


### Scale Data

In [36]:
#numpyify model data set domain and range data
X = la_data_model[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
y = la_data_model['Num_Disp'].to_numpy().reshape((-1,1))

In [37]:
#numpyify test data set domain and range data
X_test = la_data_test[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
y_test = la_data_test['Num_Disp'].to_numpy().reshape((-1,1))


In [38]:
#Create Scalers
sc_X = StandardScaler()
sc_y = StandardScaler()

In [39]:
#Scale domain and range data for model
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y).ravel()

In [40]:
#Scale domain and range data for test
X_test = sc_X.fit_transform(X_test)
y_test = sc_y.fit_transform(y_test).ravel()

### Grid Search to find optimal parameters for model

In [41]:
# Use grid Search to find the best parameters for the data

#Establish parameters
parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'),'C':[0.5,1,5,10,50],'epsilon':[0.01,0.1,0.5,1,5,10,50]}

#create model
regr = svm.SVR()
clf = GridSearchCV(regr, parameters)

#Fit model with data
clf.fit(X,y)

#Read out best parameters by gridsearch
clf.best_params_

{'C': 0.5, 'epsilon': 0.1, 'kernel': 'rbf'}

In [42]:
#Create a model with grid search parameters
regr = svm.SVR(**clf.best_params_)
regr.fit(X,y)

SVR(C=0.5)

### Testing the Model


In [44]:
#Percent difference function
def percentDiff(current, previous):
    if current == previous:
        return 100.0
    try:
        return (abs(current - previous) / previous) * 100.0
    except ZeroDivisionError:
        return 0

In [45]:
# Predict Values from model
y_pred = regr.predict(X_test)

In [46]:
#Reshape data for function load
y_pred = list(sc_y.inverse_transform(y_pred))
y_test = la_data_test['Num_Disp']
y_zip = list(zip(y_pred,y_test))

In [47]:
#Add data to dataframe
la_data_test['Predicted_Value'] = y_pred

la_data_test['Percent_Difference'] = [percentDiff(y_zip[i][1],y_zip[i][0]) for i,e in enumerate(y_zip)]

In [50]:
#Display Number of Dispensaries with model predicted number 
la_data_test[la_data_test['Num_Disp'] != 0][['Zip','Num_Disp','Predicted_Value','Percent_Difference']]

Unnamed: 0,Zip,Num_Disp,Predicted_Value,Percent_Difference
103,91352,26,5.087365,411.070077
104,91356,3,3.623057,17.19699
105,91364,4,3.415476,17.113995
106,91367,3,5.242073,42.770729
107,91401,3,4.245392,29.335153
108,91402,4,5.907136,32.285295
109,91403,2,4.901016,59.192135
110,91405,6,3.679273,63.075713
111,91406,16,6.160512,159.718665
112,91411,9,6.661236,35.11006


## Sacramento Data Analysis

<img width=50% src="https://www.terracon.com/offices/sacramento/2021-siterework-office-sacramento/">

In [51]:
#Load Data from previous notebook
sacDF = zipDF[zipDF.City=="Sacramento"]
sacDFTarget = sacDF[['Zip', 
                   'LAT', 
                   'LNG', 
                   "Num_Disp",
                   'Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]
sacDFTarget['Zip'] = sacDFTarget['Zip'].astype(str)

In [52]:
#Scale City data with StandardScaler
X2_actual = sacDFTarget[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()

X2_actual = sc_X.fit_transform(X2_actual)


In [53]:
#Create a prediction using new data and our previous model
y2_pred = regr.predict(X2_actual)

In [54]:
#transform data to original scale, store in list of tuples
y2_pred = list(sc_y.inverse_transform(y2_pred))
y2_actual = sacDFTarget['Num_Disp']
y2_zip = list(zip(y2_pred,y2_actual))

In [55]:
#Add to dataframe
sacDFTarget['Predicted_Value'] = y2_pred
sacDFTarget['Percent_Difference'] = [percentDiff(y2_zip[i][1],y2_zip[i][0]) for i,e in enumerate(y2_zip)]

In [58]:
#Display Number of Dispensaries with model predicted number 
sacDFTarget[sacDFTarget['Num_Disp'] != 0][['Zip','Num_Disp','Predicted_Value','Percent_Difference']]

Unnamed: 0,Zip,Num_Disp,Predicted_Value,Percent_Difference
123,95814,1.0,5.593368,82.121685
124,95815,4.0,6.391143,37.413387
128,95838,2.0,5.722382,65.049518
129,95817,2.0,3.427095,41.64153
130,95820,2.0,5.509989,63.702288
131,95828,4.0,7.68883,47.976483
132,95616,1.0,4.826902,79.282776
133,95822,2.0,5.961569,66.451787
134,95811,3.0,5.888622,49.054293
135,95833,1.0,6.359241,84.274853


In [65]:
#Display visualization from tableau
from IPython.display import IFrame
IFrame('https://public.tableau.com/app/profile/andrew.bly/viz/SacramentoDispensaries/Dashboard1', width=700, height=700)

In [57]:
#Save Data
location = '../../data/sacDispDataSVR.csv'
sacDFTarget.to_csv(location)