In [3]:
from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, ParameterGrid, train_test_split

import utilities
import visualization

# Dataset
The dataset consists of handwritten digits (0-9). Both training datasets contain 7330 labeled patterns.
- _"pendigits_tr.txt"_ contains 16-dimensional patterns (x, y coordinates of eight equidistant points, obtained after normalization and resampling);
- _"pendigits_tr_Pca_K2.txt"_ is a two-dimensional version obtained by PCA, used to visualize the results.

In [8]:
# Dataset selection: exactly one (feature_count, dataset_path) pair must be
# active. The default is the full 16-feature version of the training data.
feature_count = 16
dataset_path = 'DBs/PenDigits/pendigits_tr.txt'

# Alternative: the 2-feature PCA projection. To use it, replace the two
# assignments above with:
#   feature_count = 2
#   dataset_path = 'DBs/PenDigits/pendigits_tr_Pca_K2.txt'

In [10]:
# Read the labeled training file selected in the configuration cell; the
# loader returns the patterns and their labels as two separate arrays.
dataset_patterns, dataset_labels = utilities.load_labeled_dataset_from_txt(
    dataset_path,
    feature_count,
)

# Sanity check: report the shape of both arrays.
for caption, array in (('Shape dataset:', dataset_patterns),
                       ('Shape labels:', dataset_labels)):
    print(caption, array.shape)

Shape dataset: (7330, 16)
Shape labels: (7330,)


I split the dataset into a training set (60%) and a validation set (40%):

In [6]:
# Hold out 40% of the labeled patterns as a validation set.
# train_test_split shuffles the data before splitting; fixing random_state
# makes that shuffle -- and therefore the resulting split -- identical on
# every "Restart Kernel -> Run All" execution.
train_x, validation_x, train_y, validation_y = train_test_split(
    dataset_patterns,
    dataset_labels,
    test_size=0.40,
    random_state=42,
)
print('Shape training set:', train_x.shape)
print('Shape validation set:', validation_x.shape)

Shape training set: (4398, 2)
Shape validation set: (2932, 2)


# Grid Search and Cross-Validation
A grid search with cross-validation is used to choose the type of classifier and to set its hyperparameters:

In [None]:
# Estimator whose hyperparameters are tuned. To try k-NN instead, import
# KNeighborsClassifier and use: clf = KNeighborsClassifier()
clf = SVC()

# Number of folds used by the cross-validation.
n_folds = 5

# Hyperparameter grid: every combination of the listed values is evaluated.
# A linear-kernel alternative that can be appended to the list:
#   {'kernel': ['linear'], 'C': [0.05, 0.5]}
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [9, 9.5],
        'gamma': [1/9000, 1/10000],
    },
]

# Build the search object and run it on the whole labeled dataset.
grid_search_cv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=n_folds)
grid_search_cv.fit(dataset_patterns, dataset_labels)

# Report: all tried combinations, their mean cross-validated score, and the
# winning combination together with its score.
cv_results = grid_search_cv.cv_results_
report = (
    ('Combination of parameters:\n', cv_results['params']),
    ('Average accuracy for each combination:\n', cv_results['mean_test_score']),
    ('Best combination:\n', grid_search_cv.best_params_),
)
for heading, value in report:
    print(heading, value)
print('Average accuracy for the best combination: %.4f' % grid_search_cv.best_score_)

# Plot the result; this branch only runs for the two-feature dataset.
if feature_count == 2:
    # Note: with GridSearchCV's default refit=True, best_estimator_ is the
    # estimator refitted on the data passed to fit() with the best combination.
    labeled_data = (dataset_patterns, dataset_labels, 'Data')
    visualization.show_2D_results(grid_search_cv.best_estimator_,
                                  labeled_data,
                                  figsize=(9, 8))

Finally, I train the final model on the whole labeled dataset and use it to predict the labels of the test set, contained in the files _"pendigits_te.txt"_ and _"pendigits_te_Pca_K2.txt"_.

In [37]:
# Loading of test set.
# Map each supported feature count to its test file. An unsupported value is
# rejected here with a clear error, instead of leaving test_path undefined
# and failing on the load call with an unrelated NameError.
test_paths = {
    16: 'DBs/PenDigits/pendigits_te.txt',
    2: 'DBs/PenDigits/pendigits_te_Pca_K2.txt',
}
if feature_count not in test_paths:
    raise ValueError(
        'Unsupported feature_count: %r (expected 16 or 2)' % (feature_count,)
    )
test_path = test_paths[feature_count]

test_x = utilities.load_unlabeled_dataset_from_txt(test_path, feature_count)

# Creation of the classifier.
# NOTE(review): C and gamma are hard-coded; both values appear among the
# candidates of the grid search cell, so they are presumably its best
# combination -- confirm and keep in sync if the grid changes.
clf = SVC(kernel='rbf', C=9.5, gamma=1/9000)

# The final model is fitted on the whole labeled dataset.
clf.fit(dataset_patterns, dataset_labels)

# Compute the predictions:
predictions = clf.predict(test_x)

# Saving the predictions: one integer label per line.
with open("Predictions.txt", "w") as f:
    f.writelines(str(int(prediction)) + '\n' for prediction in predictions)
print('Predictions correctly saved.')


Predictions correctly saved.
