In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Location of the raw letter-recognition file, relative to the working directory.
data_path = "./data/letter-recognition.data"
# header=None: the file is read without a header row, so columns get integer labels.
data = pd.read_csv(filepath_or_buffer=data_path, header=None)

In [3]:
# Column 0 holds the letter; keep one two-letter subset per binary classification task.
HK = data.loc[data[0].isin(['H', 'K'])]
MY = data.loc[data[0].isin(['M', 'Y'])]
AB = data.loc[data[0].isin(['A', 'B'])]

In [4]:
def preprocess(data, percent):
    """Split a labelled frame into standardized train/test arrays.

    The first ``percent`` fraction of rows, taken in their existing order
    (nothing is shuffled), forms the training split and the remaining rows
    form the test split. Column 0 is used as the label and all other columns
    as features. The scaler is fitted on the training features only and then
    applied unchanged to the test features, so no test-set statistics are
    used during fitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is the label and whose remaining columns
        are numeric features.
    percent : float
        Fraction of the rows to place in the training split.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y)`` where the ``*_X`` entries are
        the scaled feature arrays and the ``*_y`` entries are the label
        arrays taken from column 0.

    Raises
    ------
    ValueError
        If the requested split would leave the training or test set empty.
    """
    n_rows = data.shape[0]
    idx = int(n_rows * percent)
    # Validate before doing any work: an empty split would otherwise only
    # fail later inside the scaler, with a message unrelated to `percent`.
    if not 0 < idx < n_rows:
        raise ValueError(
            f"percent={percent!r} splits {n_rows} rows into {idx} training rows; "
            "both the training and the test split must be non-empty"
        )

    # Kept as a function-local import, matching the original cell.
    from sklearn.preprocessing import StandardScaler

    # Convert each split once; iloc makes the positional row slicing explicit.
    train = data.iloc[:idx].to_numpy()
    test = data.iloc[idx:].to_numpy()

    scaler = StandardScaler()
    train_X = scaler.fit_transform(train[:, 1:])
    train_y = train[:, 0]
    test_X = scaler.transform(test[:, 1:])
    test_y = test[:, 0]

    return train_X, train_y, test_X, test_y

In [5]:
# H-vs-K task: the first 90% of the rows are used for training, the rest are held out.
train_X, train_y, test_X, test_y = preprocess(data=HK, percent=0.9)

In [6]:
from sklearn.model_selection import GridSearchCV
def searchCV(X, y, params, estimator, folds=5):
    """Fit a cross-validated grid search and unpack the parts of interest.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    params : dict
        Parameter grid forwarded as ``param_grid``.
    estimator
        Estimator instance to tune.
    folds : int, default 5
        Forwarded as ``cv``.

    Returns
    -------
    tuple
        ``(search, best_estimator, best_score, candidate_params,
        mean_test_scores, ranks)`` where the last three entries are read from
        ``cv_results_`` under the keys ``'params'``, ``'mean_test_score'``
        and ``'rank_test_score'``.
    """
    search = GridSearchCV(estimator=estimator, param_grid=params, cv=folds)
    search.fit(X, y)
    results = search.cv_results_
    return (
        search,
        search.best_estimator_,
        search.best_score_,
        results.get('params'),
        results.get('mean_test_score'),
        results.get('rank_test_score'),
    )


> https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [121]:
from sklearn.neighbors import KNeighborsClassifier
# Grid: neighbourhood sizes 1-5 crossed with each neighbour-search backend.
# NOTE(review): `algorithm` chooses how neighbours are looked up and is not
# expected to change predictions, so this axis mostly multiplies fit time — confirm.
params = dict(
    n_neighbors=tuple(range(1, 6)),
    algorithm=('ball_tree', 'kd_tree', 'brute'),
)

searchCV(estimator=KNeighborsClassifier(), params=params, X=train_X, y=train_y)

(GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
              param_grid={'algorithm': ('ball_tree', 'kd_tree', 'brute'),
                          'n_neighbors': (1, 2, 3, 4, 5)}),
 KNeighborsClassifier(algorithm='ball_tree', n_neighbors=3),
 0.9388679245283018,
 [{'algorithm': 'ball_tree', 'n_neighbors': 1},
  {'algorithm': 'ball_tree', 'n_neighbors': 2},
  {'algorithm': 'ball_tree', 'n_neighbors': 3},
  {'algorithm': 'ball_tree', 'n_neighbors': 4},
  {'algorithm': 'ball_tree', 'n_neighbors': 5},
  {'algorithm': 'kd_tree', 'n_neighbors': 1},
  {'algorithm': 'kd_tree', 'n_neighbors': 2},
  {'algorithm': 'kd_tree', 'n_neighbors': 3},
  {'algorithm': 'kd_tree', 'n_neighbors': 4},
  {'algorithm': 'kd_tree', 'n_neighbors': 5},
  {'algorithm': 'brute', 'n_neighbors': 1},
  {'algorithm': 'brute', 'n_neighbors': 2},
  {'algorithm': 'brute', 'n_neighbors': 3},
  {'algorithm': 'brute', 'n_neighbors': 4},
  {'algorithm': 'brute', 'n_neighbors': 5}],
 array([0.93584906, 0.9245283 , 0.93886

> https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [122]:
from sklearn.tree import DecisionTreeClassifier
# Nothing is tuned here: with an empty grid the search evaluates a single
# candidate, i.e. it cross-validates the default decision tree.
# NOTE(review): the tree has no fixed random_state, so the score may vary
# between runs — consider seeding.
params = {}
searchCV(estimator=DecisionTreeClassifier(), params=params, X=train_X, y=train_y)

(GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), param_grid={}),
 DecisionTreeClassifier(),
 0.9403773584905661,
 [{}],
 array([0.94037736]),
 array([1]))

> https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [123]:
from sklearn.svm import SVC
# Grid: five regularisation strengths crossed with three kernels.
params = dict(
    C=(1.0, 2.0, 3.0, 4.0, 5.0),
    kernel=('poly', 'rbf', 'sigmoid'),
)
searchCV(estimator=SVC(), params=params, X=train_X, y=train_y)

(GridSearchCV(cv=5, estimator=SVC(),
              param_grid={'C': (1.0, 2.0, 3.0, 4.0, 5.0),
                          'kernel': ('poly', 'rbf', 'sigmoid')}),
 SVC(C=5.0),
 0.9826415094339623,
 [{'C': 1.0, 'kernel': 'poly'},
  {'C': 1.0, 'kernel': 'rbf'},
  {'C': 1.0, 'kernel': 'sigmoid'},
  {'C': 2.0, 'kernel': 'poly'},
  {'C': 2.0, 'kernel': 'rbf'},
  {'C': 2.0, 'kernel': 'sigmoid'},
  {'C': 3.0, 'kernel': 'poly'},
  {'C': 3.0, 'kernel': 'rbf'},
  {'C': 3.0, 'kernel': 'sigmoid'},
  {'C': 4.0, 'kernel': 'poly'},
  {'C': 4.0, 'kernel': 'rbf'},
  {'C': 4.0, 'kernel': 'sigmoid'},
  {'C': 5.0, 'kernel': 'poly'},
  {'C': 5.0, 'kernel': 'rbf'},
  {'C': 5.0, 'kernel': 'sigmoid'}],
 array([0.94867925, 0.96830189, 0.79018868, 0.95924528, 0.97962264,
        0.77962264, 0.96226415, 0.97811321, 0.77660377, 0.9645283 ,
        0.98037736, 0.77735849, 0.96150943, 0.98264151, 0.77660377]),
 array([10,  5, 11,  9,  3, 12,  7,  4, 14,  6,  2, 13,  8,  1, 14]))

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [124]:
from sklearn.ensemble import RandomForestClassifier

# Grid: only the number of trees in the forest is varied.
# NOTE(review): the forest has no fixed random_state, so scores may vary
# between runs — consider seeding.
params = dict(n_estimators=(50, 100, 150, 200, 250, 1000))
searchCV(estimator=RandomForestClassifier(), params=params, X=train_X, y=train_y)

(GridSearchCV(cv=5, estimator=RandomForestClassifier(),
              param_grid={'n_estimators': (50, 100, 150, 200, 250, 1000)}),
 RandomForestClassifier(n_estimators=150),
 0.9728301886792453,
 [{'n_estimators': 50},
  {'n_estimators': 100},
  {'n_estimators': 150},
  {'n_estimators': 200},
  {'n_estimators': 250},
  {'n_estimators': 1000}],
 array([0.96981132, 0.96981132, 0.97283019, 0.97207547, 0.96981132,
        0.96981132]),
 array([3, 3, 1, 2, 3, 3]))

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

In [131]:
from sklearn.neural_network import MLPClassifier

# Grid: only the hidden-layer activation function is varied.
# NOTE(review): scikit-learn documents `learning_rate` as used only when
# solver='sgd', so 'adaptive' presumably has no effect with 'adam' — confirm.
params = dict(activation=['relu', 'tanh', 'logistic'])
searchCV(
    X=train_X,
    y=train_y,
    params=params,
    estimator=MLPClassifier(solver='adam', learning_rate='adaptive', max_iter=1000),
)



(GridSearchCV(cv=5,
              estimator=MLPClassifier(learning_rate='adaptive', max_iter=1000),
              param_grid={'activation': ['relu', 'tanh', 'logistic']}),
 MLPClassifier(learning_rate='adaptive', max_iter=1000),
 0.9773584905660379,
 [{'activation': 'relu'}, {'activation': 'tanh'}, {'activation': 'logistic'}],
 array([0.97735849, 0.97735849, 0.97358491]),
 array([1, 1, 3]))

# Dimensionality Reduction


> https://scikit-learn.org/stable/modules/feature_selection.html
> 
> https://scikit-learn.org/stable/modules/unsupervised_reduction.html

### PCA

In [10]:
from sklearn.decomposition import PCA
# Reduce the feature matrix to its first 4 principal components.
pca = PCA(n_components=4)
# Fit the projection on the training split only, then reuse it for the test split.
train_pca_X = pca.fit_transform(train_X)
test_pca_X = pca.transform(test_X)
# Bare expression: displayed as the cell output.
train_pca_X

array([[ 0.85377952, -0.20091656, -0.58070488,  0.46947561],
       [-2.12332457, -0.42530788,  0.29009342, -0.21664127],
       [ 1.67435311, -0.21890295, -1.78748695,  0.93066566],
       ...,
       [-3.93224347, -0.32109104, -0.71702807,  0.05498993],
       [ 1.86916309,  0.7972506 , -1.84798423,  1.20137316],
       [ 2.31320111, -1.40238815,  2.28021878,  1.91639989]])

### Greedy Forward Feature Selection

In [14]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
# Greedily add features one at a time (forward is the selector's default
# direction), scoring each candidate subset with a 3-nearest-neighbour
# classifier, until 4 features are kept.
sfs = SequentialFeatureSelector(
    estimator=KNeighborsClassifier(n_neighbors=3),
    n_features_to_select=4,
)
# The selection is learned from the training split and reused for the test split.
train_ffs_X = sfs.fit_transform(train_X, train_y)
test_ffs_X = sfs.transform(test_X)
train_ffs_X

array([[ 0.19075371,  0.5402138 , -0.70484094, -0.50930649],
       [ 0.19075371,  0.98701985, -0.70484094,  0.11553712],
       [-0.07390458,  0.5402138 , -0.70484094, -0.50930649],
       ...,
       [-0.86787942,  1.4338259 ,  1.80062987, -0.50930649],
       [-0.60322114,  0.09340775,  1.17426216, -0.50930649],
       [ 2.57267824, -1.69381643, -0.70484094, -0.50930649]])

### Tree-Based Feature Selection


In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

# Keep the 4 features with the highest decision-tree importances.
# With the default threshold, SelectFromModel first drops features below the
# mean importance and only then applies max_features as a cap, so it can
# return fewer than 4 columns. threshold=-np.inf disables that cut so the
# selection depends on max_features alone and always yields 4 features,
# matching the PCA and sequential-selection cells.
# NOTE(review): the tree has no fixed random_state, so the chosen features
# may differ between runs — consider seeding.
tree = SelectFromModel(
    estimator=DecisionTreeClassifier(),
    threshold=-np.inf,
    max_features=4,
)
# The selection is learned from the training split and reused for the test split.
train_tree_X = tree.fit_transform(train_X, train_y)
test_tree_X = tree.transform(test_X)
train_tree_X

array([[ 0.19075371, -0.70484094, -0.50930649, -0.64905441],
       [ 0.19075371, -0.70484094,  0.11553712,  1.40835491],
       [-0.07390458, -0.70484094, -0.50930649,  0.7225518 ],
       ...,
       [-0.86787942,  1.80062987, -0.50930649,  0.03674869],
       [-0.60322114,  1.17426216, -0.50930649, -0.64905441],
       [ 2.57267824, -0.70484094, -0.50930649, -0.64905441]])