# TP3
## 1 - Validation tests and cross validation

In [145]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV

### 1.1 - Valisation sets

In [211]:
# Load the wine dataset
wine_db = load_wine()
print(wine_db.data.shape)

# Randomly select 150 samples
random_subset = np.random.choice(wine_db.data.shape[0], 178, replace=False)
wine_db.data = wine_db.data[random_subset]
wine_db.target = wine_db.target[random_subset]

# Train set
X_train, X_test, y_train, y_test = train_test_split(wine_db.data, wine_db.target, test_size=0.2, random_state=42)
# Validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Build a grid of parameters to compare hyperparameters
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
'''
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}
'''
grid = ParameterGrid(param_grid)

# Train a decision tree for each parameter combination
best_score = 0
best_params = None
for params in grid:
    # Train a decision tree
    clf = DecisionTreeClassifier(**params)
    #clf = SVC(**params)
    clf.fit(X_train, y_train)
    # Evaluate the decision tree
    score = clf.score(X_val, y_val)
    # Keep the best model
    if score > best_score:
        best_score = score
        best_params = params

# Train the best model on the whole training set
clf = DecisionTreeClassifier(**best_params)
#clf = SVC(**best_params)
clf.fit(X_train, y_train)

print('Best parameters:', best_params)
print('Best validation score:', best_score)
print('Test score:', clf.score(X_test, y_test))


(178, 13)
Best parameters: {'max_depth': 3, 'min_samples_split': 3}
Best validation score: 0.9310344827586207
Test score: 0.8888888888888888


In [144]:
# Load the digits dataset
digits_db = load_digits()

# Randomly select 300 samples
random_subset = np.random.choice(digits_db.data.shape[0], 300, replace=False)
digits_db.data = digits_db.data[random_subset]
digits_db.target = digits_db.target[random_subset]

# Train set
X_train, X_test, y_train, y_test = train_test_split(digits_db.data, digits_db.target, test_size=0.2, random_state=42)
# Validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

# Build a grid of parameters to compare hyperparameters
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
grid = ParameterGrid(param_grid)

# Train a decision tree for each parameter combination
best_score = 0
best_params = None
for params in grid:
    # Train a decision tree
    clf = DecisionTreeClassifier(**params)
    clf.fit(X_train, y_train)
    # Evaluate the decision tree
    score = clf.score(X_val, y_val)
    # Keep the best model
    if score > best_score:
        best_score = score
        best_params = params

# Train the best model on the whole training set
clf = DecisionTreeClassifier(**best_params)
clf = clf.fit(X_train, y_train)

print('Best parameters:', best_params)
print('Best validation score:', best_score)
print('Test score:', clf.score(X_test, y_test))

Best parameters: {'max_depth': 10, 'min_samples_split': 4}
Best validation score: 0.6916666666666667
Test score: 0.7166666666666667


### 1.2 - Cross validation

In [208]:
# Cross-validation digits
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
#clf = SVC(C=10, gamma=0.01, kernel='rbf')
scores = cross_val_score(clf, digits_db.data, digits_db.target, cv=5)
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [0.73333333 0.56666667 0.58333333 0.71666667 0.7       ]
Mean cross-validation score: 0.6599999999999999


In [209]:
# Cross-validation wine
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
#clf = SVC(C=10, gamma=0.01, kernel='rbf')
scores = cross_val_score(clf, wine_db.data, wine_db.target, cv=5)
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [0.91666667 0.88888889 0.97222222 0.91428571 0.82857143]
Mean cross-validation score: 0.9041269841269841


In [254]:
# Grid search wine
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
#param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
#clf = GridSearchCV(SVC(), param_grid, cv=5)
clf.fit(wine_db.data, wine_db.target)
print('Best parameters:', clf.best_params_)
print('Best cross-validation score:', clf.best_score_)
print('Test score:', clf.score(X_test, y_test))

Best parameters: {'max_depth': 6, 'min_samples_split': 3}
Best cross-validation score: 0.9384126984126985
Test score: 1.0


In [255]:
# Ridge regression
clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
clf.fit(X_train, y_train)
print('Best alpha:', clf.alpha_)
print('Test score:', clf.score(X_test, y_test))

Best alpha: 1.0
Test score: 0.8568641238806292
