## Support Vector Regression


Libraries Used
* Sklearn: https://scikit-learn.org/stable/modules/svm.html#svm-regression
* Numpy:  https://numpy.org/doc/stable/
* Pandas: https://pandas.pydata.org/docs/user_guide/index.html

In [89]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

### Data Import 

In [58]:
#Import Census data from previous notebook
zipCSVPath = '../../data/Zip_Code_Data.csv'
zipDF = pd.read_csv(zipCSVPath)
zipDF['Zip'] = zipDF['Zip'].astype(str)
laDF = zipDF[(zipDF.City=="Los Angeles")]
laDFTarget = laDF[['Zip',
                    'Total_Pop',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]

#Import Dispensary data from previous notebook
la_data = pd.read_csv('../../data/la_combined.csv').drop(columns='Unnamed: 0')
la_data['Zip'] = la_data['Zip'].astype(str)

#Merge data by zip code
la_data = la_data.merge(laDFTarget)

In [90]:
#Split Data to test model
la_data_model = la_data.iloc[:103,:]
la_data_test = la_data.iloc[-20:,:]

### Scale Data

In [92]:
#numpyify model data set domain and range data
X = la_data_model[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
y = la_data_model['Num_Disp'].to_numpy().reshape((-1,1))

In [93]:
#numpyify test data set domain and range data
X_test = la_data_test[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()
y_test = la_data_test['Num_Disp'].to_numpy().reshape((-1,1))


In [94]:
#Create Scalers
sc_X = StandardScaler()
sc_y = StandardScaler()

In [95]:
#Scale domain and range data for model
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y).ravel()

In [96]:
#Scale domain and range data for test
X_test = sc_X.fit_transform(X_test)
y_test = sc_y.fit_transform(y_test).ravel()

### Grid Search to find optimal parameters for model

In [97]:
# Use grid Search to find the best parameters for the data

#Establish parameters
parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'),'C':[0.5,1,5,10,50],'epsilon':[0.01,0.1,0.5,1,5,10,50]}

#create model
regr = svm.SVR()
clf = GridSearchCV(regr, parameters)

#Fit model with data
clf.fit(X,y)

#Read out best parameters by gridsearch
clf.best_params_

{'C': 0.5, 'epsilon': 0.1, 'kernel': 'rbf'}

In [98]:
#Create a model with grid search parameters
regr = svm.SVR(**clf.best_params_)
regr.fit(X,y)

SVR(C=0.5)

### Testing the Model


In [102]:
#Percent difference function
def percentDiff(current, previous):
    if current == previous:
        return 100.0
    try:
        return (abs(current - previous) / previous) * 100.0
    except ZeroDivisionError:
        return 0

In [103]:
# Predict Values from model
y_pred = regr.predict(X_test)

In [104]:
#Reshape data for function load
y_pred = list(sc_y.inverse_transform(y_pred))
y_test = la_data_test['Num_Disp']
y_zip = list(zip(y_pred,y_test))

In [105]:
#Add data to dataframe
la_data_test['Predicted_Value'] = y_pred

la_data_test['Percent_Difference'] = [percentDiff(y_zip[i][1],y_zip[i][0]) for i,e in enumerate(y_zip)]

In [106]:
#Display Number of Dispensaries with model predicted number 
la_data_test[la_data_test['Num_Disp'] != 0][['Zip','Num_Disp','Predicted_Value','Percent_Difference']]

Unnamed: 0,Zip,Num_Disp,Predicted_Value,Percent_Difference
103,91352,26,5.087365,411.070077
104,91356,3,3.623057,17.19699
105,91364,4,3.415476,17.113995
106,91367,3,5.242073,42.770729
107,91401,3,4.245392,29.335153
108,91402,4,5.907136,32.285295
109,91403,2,4.901016,59.192135
110,91405,6,3.679273,63.075713
111,91406,16,6.160512,159.718665
112,91411,9,6.661236,35.11006


## Sacramento Data Analysis

<img width=50% src="https://www.terracon.com/offices/sacramento/2021-siterework-office-sacramento/">

In [107]:
#Load Data
sacDF = zipDF[zipDF.City=="Sacramento"]
sacDFTarget = sacDF[['Zip', 
                   'LAT', 
                   'LNG', 
                   "Num_Disp",
                   'Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied'
                  ]]
sacDFTarget['Zip'] = sacDFTarget['Zip'].astype(str)

In [108]:
#Scale City data with StandardScaler
X2_test = sacDFTarget[['Total_Pop',
                   'Total_Households', 
                   'Household_Median_Income',
                   'Percent_21_and_Older',
                   'Percent_60_and_Older',
                   'Percent_Male',
                   'Pct_Owner_Occupied',
                   'Pct_Renter_Occupied']].to_numpy()

X2_test = sc_X.fit_transform(X2_test)



In [109]:
#Create a prediction using new data and our previous model
y2_pred = regr.predict(X2_test)

In [110]:
#transform data to original scale, store in list of tuples
y2_pred = list(sc_y.inverse_transform(y2_pred))
y2_test = sacDFTarget['Num_Disp']
y2_zip = list(zip(y2_pred,y2_test))

In [111]:
#Add to dataframe
sacDFTarget['Predicted_Value'] = y2_pred
sacDFTarget['Percent_Difference'] = [percentDiff(y2_zip[i][1],y2_zip[i][0]) for i,e in enumerate(y2_zip)]

In [112]:
#Display Number of Dispensaries with model predicted number 
sacDFTarget[sacDFTarget['Num_Disp'] != 0][['Zip','Num_Disp','Predicted_Value','Percent_Difference']]

Unnamed: 0,Zip,Num_Disp,Predicted_Value,Percent_Difference
123,95814,1.0,5.593368,82.121685
124,95815,4.0,6.391143,37.413387
128,95838,2.0,5.722382,65.049518
129,95817,2.0,3.427095,41.64153
130,95820,2.0,5.509989,63.702288
131,95828,4.0,7.68883,47.976483
132,95616,1.0,4.826902,79.282776
133,95822,2.0,5.961569,66.451787
134,95811,3.0,5.888622,49.054293
135,95833,1.0,6.359241,84.274853


In [30]:
#Save Data
location = '../../data/sacDispDataSVR.csv'
sacDFTarget.to_csv(location)