In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data preprocessing

### Read Data

In [2]:
data_path = "./data/letter-recognition.data"
data = pd.read_csv(data_path, header=None)

### Sample Filtering

In [3]:
HK = data.loc[(data[0] == 'H') | (data[0] == 'K')]
MY = data.loc[(data[0] == 'M') | (data[0] == 'Y')]
AB = data.loc[(data[0] == 'A') | (data[0] == 'B')]

### Data Standardization & Train/Test Sets Preparing

In [4]:
def preprocess(data, percent):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    idx = int(data.shape[0] * percent)
    train = data[:idx]
    test = data[idx:]
    train_X = scaler.fit_transform(train.to_numpy()[:,1:])   
    train_y = train.to_numpy()[:,0]
    test_X = scaler.transform(test.to_numpy()[:,1:])
    test_y = test.to_numpy()[:,0]

    return train_X, train_y, test_X, test_y

In [5]:
train_X, train_y, test_X, test_y = preprocess(HK, 0.9)

# Model Fitting

### Cross-Validation & Feature Selection

In [6]:
from sklearn.model_selection import GridSearchCV
def searchCV(X, y, params, estimator, folds=5):
    clf = GridSearchCV(estimator=estimator, param_grid=params, cv=folds)
    clf.fit(X, y)
    return clf, clf.best_estimator_, clf.best_score_, \
        clf.cv_results_.get('params'), clf.cv_results_.get('mean_test_score'), \
        clf.cv_results_.get('rank_test_score')


### K-Nearest Neighbors
> https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [121]:
from sklearn.neighbors import KNeighborsClassifier
params = {
    'n_neighbors':(1,2,3,4,5),
    'algorithm':('ball_tree', 'kd_tree', 'brute')
}

searchCV(X=train_X, y=train_y, params=params, estimator=KNeighborsClassifier())

(GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
              param_grid={'algorithm': ('ball_tree', 'kd_tree', 'brute'),
                          'n_neighbors': (1, 2, 3, 4, 5)}),
 KNeighborsClassifier(algorithm='ball_tree', n_neighbors=3),
 0.9388679245283018,
 [{'algorithm': 'ball_tree', 'n_neighbors': 1},
  {'algorithm': 'ball_tree', 'n_neighbors': 2},
  {'algorithm': 'ball_tree', 'n_neighbors': 3},
  {'algorithm': 'ball_tree', 'n_neighbors': 4},
  {'algorithm': 'ball_tree', 'n_neighbors': 5},
  {'algorithm': 'kd_tree', 'n_neighbors': 1},
  {'algorithm': 'kd_tree', 'n_neighbors': 2},
  {'algorithm': 'kd_tree', 'n_neighbors': 3},
  {'algorithm': 'kd_tree', 'n_neighbors': 4},
  {'algorithm': 'kd_tree', 'n_neighbors': 5},
  {'algorithm': 'brute', 'n_neighbors': 1},
  {'algorithm': 'brute', 'n_neighbors': 2},
  {'algorithm': 'brute', 'n_neighbors': 3},
  {'algorithm': 'brute', 'n_neighbors': 4},
  {'algorithm': 'brute', 'n_neighbors': 5}],
 array([0.93584906, 0.9245283 , 0.93886

### Decision Tree
> https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [23]:
from sklearn.tree import DecisionTreeClassifier
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'max_features': ['auto', 'sqrt', 'log2']
}
searchCV(X=train_X, y=train_y, params=params, estimator=DecisionTreeClassifier())

(GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
              param_grid={'max_depth': [2, 4, 6, 8, 10],
                          'max_features': ['auto', 'sqrt', 'log2']}),
 DecisionTreeClassifier(max_depth=10, max_features='sqrt'),
 0.9305660377358491,
 [{'max_depth': 2, 'max_features': 'auto'},
  {'max_depth': 2, 'max_features': 'sqrt'},
  {'max_depth': 2, 'max_features': 'log2'},
  {'max_depth': 4, 'max_features': 'auto'},
  {'max_depth': 4, 'max_features': 'sqrt'},
  {'max_depth': 4, 'max_features': 'log2'},
  {'max_depth': 6, 'max_features': 'auto'},
  {'max_depth': 6, 'max_features': 'sqrt'},
  {'max_depth': 6, 'max_features': 'log2'},
  {'max_depth': 8, 'max_features': 'auto'},
  {'max_depth': 8, 'max_features': 'sqrt'},
  {'max_depth': 8, 'max_features': 'log2'},
  {'max_depth': 10, 'max_features': 'auto'},
  {'max_depth': 10, 'max_features': 'sqrt'},
  {'max_depth': 10, 'max_features': 'log2'}],
 array([0.78188679, 0.81735849, 0.78037736, 0.84603774, 0.87471698,
    

### SVM
> https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [38]:
from sklearn.svm import SVC
params = {
    'C': [3.0, 4.0, 5.0, 6.0, 7.0],
    'kernel':['linear','poly', 'rbf']
}
searchCV(X=train_X, y=train_y, params=params, estimator=SVC())

(GridSearchCV(cv=5, estimator=SVC(),
              param_grid={'C': [3.0, 4.0, 5.0, 6.0, 7.0],
                          'kernel': ['linear', 'poly', 'rbf']}),
 SVC(C=5.0),
 0.9826415094339623,
 [{'C': 3.0, 'kernel': 'linear'},
  {'C': 3.0, 'kernel': 'poly'},
  {'C': 3.0, 'kernel': 'rbf'},
  {'C': 4.0, 'kernel': 'linear'},
  {'C': 4.0, 'kernel': 'poly'},
  {'C': 4.0, 'kernel': 'rbf'},
  {'C': 5.0, 'kernel': 'linear'},
  {'C': 5.0, 'kernel': 'poly'},
  {'C': 5.0, 'kernel': 'rbf'},
  {'C': 6.0, 'kernel': 'linear'},
  {'C': 6.0, 'kernel': 'poly'},
  {'C': 6.0, 'kernel': 'rbf'},
  {'C': 7.0, 'kernel': 'linear'},
  {'C': 7.0, 'kernel': 'poly'},
  {'C': 7.0, 'kernel': 'rbf'}],
 array([0.93433962, 0.96226415, 0.97811321, 0.93358491, 0.9645283 ,
        0.98037736, 0.93283019, 0.96150943, 0.98264151, 0.93283019,
        0.96679245, 0.98188679, 0.93283019, 0.96754717, 0.98037736]),
 array([11,  9,  5, 12,  8,  3, 13, 10,  1, 13,  7,  2, 13,  6,  3]))

### Random Forest
> https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [39]:
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 2, 4, 6, 8]
}
searchCV(X=train_X, y=train_y, params=params, estimator=RandomForestClassifier())

(GridSearchCV(cv=5, estimator=RandomForestClassifier(),
              param_grid={'max_depth': [None, 2, 4, 6, 8],
                          'n_estimators': [100, 200, 300, 400, 500]}),
 RandomForestClassifier(n_estimators=300),
 0.9735849056603774,
 [{'max_depth': None, 'n_estimators': 100},
  {'max_depth': None, 'n_estimators': 200},
  {'max_depth': None, 'n_estimators': 300},
  {'max_depth': None, 'n_estimators': 400},
  {'max_depth': None, 'n_estimators': 500},
  {'max_depth': 2, 'n_estimators': 100},
  {'max_depth': 2, 'n_estimators': 200},
  {'max_depth': 2, 'n_estimators': 300},
  {'max_depth': 2, 'n_estimators': 400},
  {'max_depth': 2, 'n_estimators': 500},
  {'max_depth': 4, 'n_estimators': 100},
  {'max_depth': 4, 'n_estimators': 200},
  {'max_depth': 4, 'n_estimators': 300},
  {'max_depth': 4, 'n_estimators': 400},
  {'max_depth': 4, 'n_estimators': 500},
  {'max_depth': 6, 'n_estimators': 100},
  {'max_depth': 6, 'n_estimators': 200},
  {'max_depth': 6, 'n_estimators': 300

### Artificial Neural Network
> https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

In [40]:
from sklearn.neural_network import MLPClassifier

params = {
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}
searchCV(X=train_X, y=train_y, params=params, estimator=MLPClassifier(learning_rate='adaptive', max_iter=2000))

(GridSearchCV(cv=5,
              estimator=MLPClassifier(learning_rate='adaptive', max_iter=2000),
              param_grid={'activation': ['relu', 'tanh', 'logistic'],
                          'learning_rate': ['constant', 'invscaling',
                                            'adaptive'],
                          'solver': ['lbfgs', 'sgd', 'adam']}),
 MLPClassifier(activation='tanh', learning_rate='invscaling', max_iter=2000),
 0.9796226415094338,
 [{'activation': 'relu', 'learning_rate': 'constant', 'solver': 'lbfgs'},
  {'activation': 'relu', 'learning_rate': 'constant', 'solver': 'sgd'},
  {'activation': 'relu', 'learning_rate': 'constant', 'solver': 'adam'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'lbfgs'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'sgd'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'adam'},
  {'activation': 'relu', 'learning_rate': 'adaptive', 'solver': 'lbfgs'},
  {'activation': 'relu

# Dimension Reduction

> https://scikit-learn.org/stable/modules/feature_selection.html
> 
> https://scikit-learn.org/stable/modules/unsupervised_reduction.html

## Feature Extration (3)

### PCA

In [94]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
train_pca_X = pca.fit_transform(train_X)
test_pca_X = pca.transform(test_X)
train_pca_X.shape

(1325, 4)

### SVD

In [95]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=4)
train_svd_X = svd.fit_transform(train_X)
test_svd_X = svd.transform(test_X)
train_svd_X.shape

(1325, 4)

### NMF

In [96]:
from sklearn.decomposition import NMF
nmf = TruncatedSVD(n_components=4)
train_nmf_X = nmf.fit_transform(train_X)
test_nmf_X = nmf.transform(test_X)
train_nmf_X.shape

(1325, 4)

## Warpper Feature Selection (2)

### Forward Feature Construction

In [92]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
ffs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=4)
train_ffs_X = sfs.fit_transform(train_X, train_y)
test_ffs_X = sfs.transform(test_X)
train_ffs_X.shape

(1325, 4)

### Backward Feature Elimination

In [93]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
bfe = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=4, direction='backward')
train_bfe_X = bfe.fit_transform(train_X, train_y)
test_bfe_X = bfe.transform(test_X)
train_bfe_X.shape

(1325, 4)

## Embedded Methods (3)

### Decision Trees


In [91]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

tree = SelectFromModel(estimator=DecisionTreeClassifier(), max_features=4)
train_tree_X = tree.fit_transform(train_X, train_y)
test_tree_X = tree.transform(test_X)
train_tree_X.shape

(1325, 4)

### Random Forest

In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

forest = SelectFromModel(estimator=RandomForestClassifier(), max_features=4)
train_forest_X = forest.fit_transform(train_X, train_y)
test_forest_X = forest.transform(test_X)
train_forest_X.shape

(1325, 4)

### LASSO Regression (L1-Based Feature Selection)

In [89]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lr = SelectFromModel(LinearSVC(C=0.0026, penalty="l1", dual=False).fit(train_X, train_y), prefit=True)
train_lr_X = lr.transform(train_X)
test_lr_X = lr.transform(test_X)
train_lr_X.shape

(1325, 4)