In [1]:
from sklearn.datasets import load_wine
wine = load_wine()
X_wine, y_wine = wine.data, wine.target

In [2]:
import numpy as np

FRACTION = 10
X_train_usps = np.genfromtxt('zip.train', delimiter=' ')[::FRACTION]
X_test_usps = np.genfromtxt('zip.test', delimiter=' ')[::FRACTION]

y_train_usps = X_train_usps[:, 0]
y_test_usps = X_test_usps[:, 0]
X_train_usps = X_train_usps[:, 1:]
X_test_usps = X_test_usps[:, 1:]

X_usps = np.concatenate((X_train_usps, X_test_usps), axis=0)
y_usps = np.concatenate((y_train_usps, y_test_usps), axis=0)


In [3]:
from sklearn.model_selection import train_test_split

X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(X_wine, y_wine, random_state=2909)
X_train_usps, X_test_usps, y_train_usps, y_test_usps = train_test_split(X_usps, y_usps, random_state=2909)


In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

CV = 5

svm = SVC()
scores_wine = cross_val_score(svm, X_train_wine, y_train_wine, cv=CV)
generalization_accuracy_wine = np.mean(scores_wine)

scores_usps = cross_val_score(svm, X_train_usps, y_train_usps, cv=CV)
generalization_accuracy_usps = np.mean(scores_usps)


In [5]:
svm.fit(X_train_wine, y_train_wine)
test_error_rate_wine = 1 - svm.score(X_test_wine, y_test_wine)

svm.fit(X_train_usps, y_train_usps)
test_error_rate_usps = 1 - svm.score(X_test_usps, y_test_usps)


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn.svm import SVC


param_grid = {'SVC__C': [0.01, 0.1, 1, 10], 'SVC__gamma': [0.001, 0.01, 0.1, 1]}
n_jobs = -1  # Use all available CPU cores


scalers = {'StandardScaler': StandardScaler(),
           'MinMaxScaler': MinMaxScaler(),
           'Normalizer': Normalizer(),
           'RobustScaler': RobustScaler()}

best_estimators_wine = {}
best_estimators_usps = {}



In [7]:
print("Processing Wine dataset with different scalers...")
for scaler_name, scaler in scalers.items():
    wine_pipeline = Pipeline([('scaler', scaler), ('SVC', SVC())])
    wine_grid_search = GridSearchCV(wine_pipeline, param_grid, cv=CV, n_jobs=n_jobs)
    wine_grid_search.fit(X_train_wine, y_train_wine)


    best_estimators_wine[scaler_name] = wine_grid_search.best_estimator_


    wine_best_score = wine_grid_search.best_score_
    wine_test_score = wine_grid_search.score(X_test_wine, y_test_wine)
    wine_best_params = wine_grid_search.best_params_
    wine_test_error_rate = 1 - wine_test_score

    print(f"Wine - {scaler_name}: Best CV Accuracy: {wine_best_score}, Test Set Accuracy: {wine_test_score}, Best Params: {wine_best_params}, Test Error Rate: {wine_test_error_rate}")

Processing Wine dataset with different scalers...
Wine - StandardScaler: Best CV Accuracy: 0.9925925925925926, Test Set Accuracy: 1.0, Best Params: {'SVC__C': 1, 'SVC__gamma': 0.01}, Test Error Rate: 0.0
Wine - MinMaxScaler: Best CV Accuracy: 0.9851851851851852, Test Set Accuracy: 0.9777777777777777, Best Params: {'SVC__C': 0.1, 'SVC__gamma': 1}, Test Error Rate: 0.022222222222222254
Wine - Normalizer: Best CV Accuracy: 0.6168091168091168, Test Set Accuracy: 0.6888888888888889, Best Params: {'SVC__C': 10, 'SVC__gamma': 1}, Test Error Rate: 0.3111111111111111
Wine - RobustScaler: Best CV Accuracy: 0.9849002849002849, Test Set Accuracy: 0.9777777777777777, Best Params: {'SVC__C': 1, 'SVC__gamma': 0.1}, Test Error Rate: 0.022222222222222254


In [8]:
print("\nProcessing USPS dataset with different scalers...")
for scaler_name, scaler in scalers.items():
    usps_pipeline = Pipeline([('scaler', scaler), ('SVC', SVC())])
    usps_grid_search = GridSearchCV(usps_pipeline, param_grid, cv=CV, n_jobs=n_jobs)
    usps_grid_search.fit(X_train_usps, y_train_usps)


    best_estimators_usps[scaler_name] = usps_grid_search.best_estimator_


    usps_best_score = usps_grid_search.best_score_
    usps_test_score = usps_grid_search.score(X_test_usps, y_test_usps)
    usps_best_params = usps_grid_search.best_params_
    usps_test_error_rate = 1 - usps_test_score

    print(f"USPS - {scaler_name}: Best CV Accuracy: {usps_best_score}, Test Set Accuracy: {usps_test_score}, Best Params: {usps_best_params}, Test Error Rate: {usps_test_error_rate}")



Processing USPS dataset with different scalers...
USPS - StandardScaler: Best CV Accuracy: 0.9168653648509764, Test Set Accuracy: 0.9012875536480687, Best Params: {'SVC__C': 10, 'SVC__gamma': 0.001}, Test Error Rate: 0.09871244635193133
USPS - MinMaxScaler: Best CV Accuracy: 0.9183556012332991, Test Set Accuracy: 0.9141630901287554, Best Params: {'SVC__C': 10, 'SVC__gamma': 0.01}, Test Error Rate: 0.0858369098712446
USPS - Normalizer: Best CV Accuracy: 0.9255292908530318, Test Set Accuracy: 0.9098712446351931, Best Params: {'SVC__C': 10, 'SVC__gamma': 1}, Test Error Rate: 0.09012875536480691
USPS - RobustScaler: Best CV Accuracy: 0.7033915724563207, Test Set Accuracy: 0.7296137339055794, Best Params: {'SVC__C': 10, 'SVC__gamma': 0.001}, Test Error Rate: 0.2703862660944206


In [9]:
best_scaler_name_wine = max(best_estimators_wine, key=lambda k: best_estimators_wine[k].score(X_test_wine, y_test_wine))
best_estimator_wine = best_estimators_wine[best_scaler_name_wine]
wine_predictions = best_estimator_wine.predict(X_test_wine)
wine_test_error_rate = 1 - np.mean(wine_predictions == y_test_wine)
print(f"\nWine Dataset - Best Scaler: {best_scaler_name_wine}, Test Error Rate: {wine_test_error_rate:.4f}")


Wine Dataset - Best Scaler: StandardScaler, Test Error Rate: 0.0000


In [10]:
best_scaler_name_usps = max(best_estimators_usps, key=lambda k: best_estimators_usps[k].score(X_test_usps, y_test_usps))
best_estimator_usps = best_estimators_usps[best_scaler_name_usps]
usps_predictions = best_estimator_usps.predict(X_test_usps)
usps_test_error_rate = 1 - np.mean(usps_predictions == y_test_usps)
print(f"USPS Dataset - Best Scaler: {best_scaler_name_usps}, Test Error Rate: {usps_test_error_rate:.4f}")

USPS Dataset - Best Scaler: MinMaxScaler, Test Error Rate: 0.0858
