#  SVM Regression (scikit-learn)

In [16]:
%%javascript
// Clear the stored output of the notebook's cells whenever this cell is run.
// NOTE(review): relies on the front end exposing a global `IPython.notebook`
// object — confirm the Jupyter front end in use provides it.
IPython.notebook.clear_all_output();

<IPython.core.display.Javascript object>

# Import Dataset
## UCI bike sharing dataset

The example is from https://www.python-course.eu/Regression_Trees.php

We will use all 731 instances as well as a subset of the original 16 features.

We use the features {'season', 'holiday', 'weekday', 'workingday', 'weathersit', 'cnt'}, where 'cnt' serves as our target feature and represents the total number of bikes rented per day.

The first rows of the dataset look as follows:

In [17]:
import pandas as pd
# Only these columns are read from the CSV; 'cnt' is the regression target.
columns = ['season', 'holiday', 'weekday', 'workingday', 'weathersit', 'cnt']
dataset = pd.read_csv("day.csv", usecols=columns)
print(type(dataset))
dataset

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,season,holiday,weekday,workingday,weathersit,cnt
0,1,0,6,0,2,985
1,1,0,0,0,2,801
2,1,0,1,1,1,1349
3,1,0,2,1,1,1562
4,1,0,3,1,1,1600
5,1,0,4,1,1,1606
6,1,0,5,1,2,1510
7,1,0,6,0,2,959
8,1,0,0,0,1,822
9,1,0,1,1,1,1321


In [18]:
# Shuffle all rows so that the positional 70/30 split that follows is random.
# A fixed random_state makes the shuffle -- and therefore the train/test split
# and every score derived from it -- reproducible across kernel restarts.
dataset = dataset.sample(frac=1, random_state=42)
dataset.head()

Unnamed: 0,season,holiday,weekday,workingday,weathersit,cnt
538,3,0,5,1,1,5823
117,2,0,4,1,2,4058
575,3,0,0,0,1,6597
254,3,0,1,1,1,4713
618,3,0,1,1,1,7525


In [19]:
# Train on the first 70% of the shuffled rows; the remaining 30% are kept for testing.
n_train = int(0.7 * len(dataset))
training_data = dataset.iloc[:n_train]
training_data.head()

Unnamed: 0,season,holiday,weekday,workingday,weathersit,cnt
538,3,0,5,1,1,5823
117,2,0,4,1,2,4058
575,3,0,0,0,1,6597
254,3,0,1,1,1,4713
618,3,0,1,1,1,7525


In [20]:
# Discard the shuffled row labels so the training rows are numbered from 0 again
# (drop=True keeps the old index from being added as a column).
training_data = training_data.reset_index(drop=True)
training_data.head()

Unnamed: 0,season,holiday,weekday,workingday,weathersit,cnt
0,3,0,5,1,1,5823
1,2,0,4,1,2,4058
2,3,0,0,0,1,6597
3,3,0,1,1,1,4713
4,3,0,1,1,1,7525


In [21]:
# Everything after the first 70% of the rows is the test set, re-indexed from 0.
split_at = int(0.7 * len(dataset))
testing_data = dataset.iloc[split_at:].reset_index(drop=True)

# Features are every column except the last one; the last column is the target.
X_train, y_train = training_data.iloc[:, :-1], training_data.iloc[:, -1]
X_test, y_test = testing_data.iloc[:, :-1], testing_data.iloc[:, -1]

# Report the shapes in the order: X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(511, 5)
(220, 5)
(511,)
(220,)


# sklearn.svm.SVR

# 1. Linear Kernel: 

## Training
**C : float, optional (default=1.0):** Penalty parameter C of the error term.

**epsilon : float, optional (default=0.1):** Radius of the epsilon-tube: predictions that lie within a distance epsilon of the actual value incur no penalty.

In [22]:
from sklearn import svm
# training: build the linear-kernel SVR and fit it in one step
# (fit() returns the fitted estimator, so the call can be chained)
model = svm.SVR(kernel='linear', C=1, epsilon=0.1).fit(X_train, y_train)

# testing
y_predict = model.predict(X_test)

# Show the first few predictions (truncated to integers) above the true counts.
n_report = 10
first_predictions = y_predict[range(n_report)].astype(int)
first_targets = y_test.head(n_report).values
print(first_predictions)
print(first_targets)

[4359 4700 4559 4650 4275 4218 4479 4690 4874 4445]
[7132 6536 4118 7030 3922 1204 2425 1817 4839 3487]


In [23]:
import numpy as np

# compute root-mean-square error: sqrt( sum((y - y_hat)^2) / n )
squared_errors = ((y_test - y_predict) ** 2).values
RMSE = np.sqrt(np.sum(squared_errors) / len(y_test))
print(RMSE)

1874.1702365578335


## Grid Search for Hyperparameters

See how to use `sklearn.model_selection.GridSearchCV(.)` :-)

In [24]:
from sklearn.model_selection import GridSearchCV
import numpy as np
# Base estimator: the kernel is fixed, C and epsilon are left to the search.
model = svm.SVR(kernel='linear')

# Candidate values shared by both hyperparameters:
# 10^-exp ... 10^exp, one value per decade.
exp = 4
vals = np.logspace(-exp, exp, num=2 * exp + 1)  # base 10
print(vals)
params = dict(C=vals, epsilon=vals)

# grid search over every (C, epsilon) combination
# (set verbose = 2 if you want to see the progress of each model training)
gridsearch = GridSearchCV(estimator=model, param_grid=params, verbose=1)
gridsearch.fit(X_train, y_train)

# print results: one (mean score, std of score, parameters) tuple per candidate
cv_results = gridsearch.cv_results_
print("====== Scores for all models ======")
display(['mean', 'std', 'params'])
# IPython.display, a better looking function than print()
display([
    (mean_score, std_score, candidate)
    for mean_score, std_score, candidate in zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
])
print()

print("====== Parameters for the best model ======")
print(gridsearch.best_params_)
print()

print("====== Score for the best model ======")
print(gridsearch.best_score_)

[1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]
Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:    1.7s finished


['mean', 'std', 'params']

[(-0.002701735241714325,
  0.0022633996546322812,
  {'C': 0.0001, 'epsilon': 0.0001}),
 (-0.002701735241714325,
  0.0022633996546322812,
  {'C': 0.0001, 'epsilon': 0.001}),
 (-0.002701735241714325,
  0.0022633996546322812,
  {'C': 0.0001, 'epsilon': 0.01}),
 (-0.002701735241714325, 0.0022633996546322812, {'C': 0.0001, 'epsilon': 0.1}),
 (-0.002701735241714325, 0.0022633996546322812, {'C': 0.0001, 'epsilon': 1.0}),
 (-0.0028673440279164794,
  0.002567995863887652,
  {'C': 0.0001, 'epsilon': 10.0}),
 (-0.003083553129543749,
  0.002448787742937153,
  {'C': 0.0001, 'epsilon': 100.0}),
 (-0.004810197617623922,
  0.005845230446545947,
  {'C': 0.0001, 'epsilon': 1000.0}),
 (-0.00014661204270983172,
  4.044676477097476e-05,
  {'C': 0.0001, 'epsilon': 10000.0}),
 (-0.00263877493072443, 0.002255621818759232, {'C': 0.001, 'epsilon': 0.0001}),
 (-0.00263877493072443, 0.002255621818759232, {'C': 0.001, 'epsilon': 0.001}),
 (-0.00263877493072443, 0.002255621818759232, {'C': 0.001, 'epsilon': 0.01}),


{'C': 10000.0, 'epsilon': 1000.0}

0.2516590175478076


## Put in the best parameters

In [25]:
# Continue with the estimator that the grid search selected as best.
model = gridsearch.best_estimator_

# testing
y_predict = model.predict(X_test)

# Show the first few predictions (truncated to integers) above the true counts.
n_report = 10
first_predictions = y_predict[range(n_report)].astype(int)
first_targets = y_test.head(n_report).values
print(first_predictions)
print(first_targets)

# report error: root-mean-square error, sqrt( sum((y - y_hat)^2) / n )
squared_errors = ((y_test - y_predict) ** 2).values
RMSE = np.sqrt(np.sum(squared_errors) / len(y_test))
print(RMSE)

[4122 4778 4028 5110 2708 3372 4782 3744 6501 3036]
[7132 6536 4118 7030 3922 1204 2425 1817 4839 3487]
1675.2743482830806


THE END