In [1]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

| Model                     | DataSet | Time  | RMSE   | Regression | Scaling | Cross Score |
|---------------------------|---------|-------|--------|------------|---------|-------------|
| LinearRegression          | 20640   | 18ms  | 0.7455 | Y          | No      | 0.7204      |
| Ridge                     | 20640   | 17ms  | 0.7455 | Y          | No      | 0.7204      |
| Lasso                     | 20640   | 45ms  | 0.9685 | Y          | No      | 0.9740      |
| ElasticNet                | 20640   | 25ms  | 0.8723 | Y          | No      | 0.8751      |
| LogisticRegression        | 20640   | N/A   | N/A    | N          | No      | N/A         |
| DecisionTreeRegressor     | 20640   | 355ms | 0.7037 | Y          | No      | 0.7303      |
| RandomForestRegressor     | 20640   | 24s   | 0.5053 | Y          | No      | 0.5060      |
| GradientBoostingRegressor | 20640   | 5s    | 0.5222 | Y          | No      | 0.5313      |
| SVR                       | 20640   | 26s   | 0.9615 | Y          | Yes     | 0.5965      |
| SVR (Tuned)               | 20640   | 26s   | 0.9615 | Y          | Yes     | 0.5634      |
| LinearSVR                 | 20640   | 1s    | 0.95   | Y          | No      | 1.54        |

SMV
kernels: # TODO: update kernel list
- linear - better for
- poly - better for
- rbf - better for
params: # TODO: add desciption and possible values
- C [0.01 - 1000] min - soft margin, more - hard margin, for kernels:
- loss hinge, for kernels:
- dual true, false , for kernels:
- gamma [ , for kernels:
- epsilon , for kernels:
- degree , for kernels:

In [2]:
housing_data = fetch_california_housing()
X = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)
y = pd.Series(housing_data.target, name='MedHouseVal')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [4]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)

In [10]:
def get_stats(model, X_test, y_test, X_train, y_train):
    y_pred = model.predict(X_test)
    rmse =  np.sqrt(mean_squared_error(y_test, y_pred))
    cross_score = np.sqrt(-cross_val_score(model,
                                           X_train,
                                           y_train,
                                           scoring="neg_mean_squared_error",
                                           cv=3))
    y_train_pred = model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    print('Cross RMSE mean:', cross_score.mean())
    print("Test RMSE:", rmse)
    print("Train RMSE:", train_rmse)

In [11]:
lin_rg_model = LinearRegression()
lin_rg_model.fit(X_train, y_train)
get_stats(lin_rg_model, X_test, y_test, X_train, y_train)

Cross RMSE mean: 0.7209854547933086
Test RMSE: 0.7455813830127768
Train RMSE: 0.7196757085831575


In [12]:
ridge_reg_model = Ridge(random_state=42)
ridge_reg_model.fit(X_train, y_train)
get_stats(ridge_reg_model, X_test, y_test, X_train, y_train)

Cross RMSE mean: 0.720986044355974
Test RMSE: 0.7455222779992703
Train RMSE: 0.7196757706930822


In [13]:
lasso_reg_model = Lasso(random_state=42)
lasso_reg_model.fit(X_train, y_train)
get_stats(lasso_reg_model, X_test, y_test, X_train, y_train)

Cross RMSE mean: 0.9740735107052373
Test RMSE: 0.9685214254184276
Train RMSE: 0.9739446869489576


In [14]:
elasticnet_reg_model = ElasticNet(random_state=42)
elasticnet_reg_model.fit(X_train, y_train)
get_stats(elasticnet_reg_model, X_test, y_test, X_train, y_train)

Cross RMSE mean: 0.8751908303022821
Test RMSE: 0.8743887238506185
Train RMSE: 0.8752184193646253


In [15]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
get_stats(tree_reg, X_test, y_test, X_train, y_train) # Model in underfitted

Cross RMSE mean: 0.7407730901033215
Test RMSE: 0.7037294974840077
Train RMSE: 3.218325866275131e-16


In [16]:
grad_regressor = GradientBoostingRegressor(random_state=42)
grad_regressor.fit(X_train_scaled, y_train)
get_stats(grad_regressor, X_test, y_test, X_train, y_train)

Cross RMSE mean: 0.5332110983615022
Test RMSE: 2.269329396732592
Train RMSE: 2.259087160522356


In [17]:
ran_forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
ran_forest_reg.fit(X_train, y_train)
get_stats(ran_forest_reg, X_test, y_test, X_train, y_train) # model overfitted

Cross RMSE mean: 0.5165147304574792
Test RMSE: 0.5053399773665033
Train RMSE: 0.18795619089802199


In [18]:
svr_reg = SVR()
svr_reg.fit(X_train_scaled, y_train)
get_stats(svr_reg, X_test_scaled, y_test, X_train_scaled, y_train) # model overfitted

Cross RMSE mean: 0.5965186232335512
Test RMSE: 0.9615568224843896
Train RMSE: 0.5797686954586869


In [19]:
lin_svr_model = LinearSVR()
lin_svr_model.fit(X_train_scaled, y_train)
get_stats(lin_svr_model, X_test_scaled, y_test, X_train_scaled, y_train)



Cross RMSE mean: 1.5396731691786576
Test RMSE: 0.9529981620728445
Train RMSE: 0.979818839656626




In [20]:
# Try use linear kernel for SVR and C=1.0, epsilon=0.1
svr_reg = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr_reg.fit(X_train_scaled, y_train)
get_stats(svr_reg, X_test_scaled, y_test, X_train_scaled, y_train)

Cross RMSE mean: 1.5460802606467177
Test RMSE: 0.9218275619580523
Train RMSE: 0.9475397693037826


In [21]:
# Try use linear kernel for SVR and C=1.0, epsilon=0.1
svr_reg = SVR(kernel='rbf', C=0.5, epsilon=0.1)
svr_reg.fit(X_train_scaled, y_train)
get_stats(svr_reg, X_test_scaled, y_test, X_train_scaled, y_train) # model overfitted

Cross RMSE mean: 0.6084152771342692
Test RMSE: 0.887449829709035
Train RMSE: 0.5938231114429178


In [5]:
estimator = SVR(kernel='rbf')
param_grid = {
    'C': [1, 25, 50],
    'epsilon': [0.26, 0.265]
}
search = RandomizedSearchCV(estimator,
                            param_grid,
                            n_iter=5,
                            scoring='neg_mean_squared_error',
                            cv=3,
                            random_state=42,
                            verbose=0,
                            return_train_score=False)

search.fit(X_train_scaled, y_train)
print("Best parameters found: ", search.best_params_)
print("Best cross-validation score: ", np.sqrt(-search.best_score_))

Best parameters found:  {'epsilon': 0.26, 'C': 25}
Best cross-validation score:  0.5634554213786135


In [7]:
best_model = search.best_estimator_

In [8]:
y_pred_svr = best_model.predict(X_test_scaled)
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_rmse = np.sqrt(svr_mse)
print(f'RMSE: {svr_rmse}')

RMSE: 1.5566573465989348


In [9]:
# model if overfitted
y_pred_svr = best_model.predict(X_train_scaled)
svr_mse = mean_squared_error(y_train, y_pred_svr)
svr_rmse = np.sqrt(svr_mse)
print(f'RMSE: {svr_rmse}')

RMSE: 0.5282284610961608


In [None]:
# TODO: Predict MNIST training set using SVM

In [None]:
# TODO: Predict California Housing training set using SVM