# Building a support vector regression model

We will now build a support vector regression for the medical data set on diabetes pregression.

# Dataset

In [26]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler

dataset = load_diabetes()

X = pd.DataFrame(data=dataset['data'],columns = dataset['feature_names'])

# Again, best to scale the input variables
X = StandardScaler().fit_transform(X)

y = pd.DataFrame(data=dataset['target'], columns = ['progression'])

# Building the regression

In [29]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)

svr = SVR(gamma='auto')
svr.fit(X_train,y_train.values.ravel())
prediction = svr.predict(X_test)
print('RMSE:', np.sqrt(mse(y_test,prediction)))

RMSE: 69.27776392088394


We can also change the parameters

In [32]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':['linear','poly','rbf'],'C':[0.2,0.5,1.0]}

grid_search = GridSearchCV(SVR(gamma='auto'), parameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train.values.ravel())

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
# The MSE is return as a negative, so we multiple it with -1 before squaring it
# scoring='neg_mean_squared_error'
          % (np.sqrt(-1*mean), np.sqrt(std), params))

Mean RMSE (+/- standard deviation), for parameters
59.798 (+/- 20.532) for {'C': 0.2, 'kernel': 'linear'}
74.861 (+/- 29.443) for {'C': 0.2, 'kernel': 'poly'}
77.420 (+/- 31.542) for {'C': 0.2, 'kernel': 'rbf'}
57.127 (+/- 19.360) for {'C': 0.5, 'kernel': 'linear'}
71.501 (+/- 27.264) for {'C': 0.5, 'kernel': 'poly'}
75.648 (+/- 30.790) for {'C': 0.5, 'kernel': 'rbf'}
56.533 (+/- 19.587) for {'C': 1.0, 'kernel': 'linear'}
69.119 (+/- 25.576) for {'C': 1.0, 'kernel': 'poly'}
73.251 (+/- 30.198) for {'C': 1.0, 'kernel': 'rbf'}


 The confidence intervals are quite wide, however, so the results might not be very reliable.