# CSE 514A Assignment 2

In [3]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data preprocessing

### Read Data

In [4]:
data_path = "./data/letter-recognition.data"
data = pd.read_csv(data_path, header=None)

### Sample Filtering

In [41]:
HK = data.loc[(data[0] == 'H') | (data[0] == 'K')]
MY = data.loc[(data[0] == 'M') | (data[0] == 'Y')]
UV = data.loc[(data[0] == 'U') | (data[0] == 'V')]

### Data Standardization & Train/Test Sets Preparing

In [37]:
def preprocess(data, percent):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    idx = int(data.shape[0] * percent)
    train = data[:idx]
    test = data[idx:]
    train_X = scaler.fit_transform(train.to_numpy()[:,1:])   
    train_y = train.to_numpy()[:,0]
    test_X = scaler.transform(test.to_numpy()[:,1:])
    test_y = test.to_numpy()[:,0]

    return train_X, train_y, test_X, test_y

In [42]:
train_X, train_y, test_X, test_y = preprocess(UV, 0.9)

## Model Fitting

### Cross-Validation & Feature Selection

In [17]:
from sklearn.model_selection import GridSearchCV
def searchCV(X, y, params, estimator, folds=5):
    clf = GridSearchCV(estimator=estimator, param_grid=params, cv=folds)
    clf.fit(X, y)
    return clf, clf.best_estimator_, clf.best_score_, \
        clf.cv_results_.get('params'), clf.cv_results_.get('mean_test_score'), \
        clf.cv_results_.get('rank_test_score')


### K-Nearest Neighbors
> https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [18]:
from sklearn.neighbors import KNeighborsClassifier
params = {
    'n_neighbors':(1,2,3,4,5),
    'algorithm':('ball_tree', 'kd_tree', 'brute')
}

searchCV(X=train_X, y=train_y, params=params, estimator=KNeighborsClassifier())

(GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
              param_grid={'algorithm': ('ball_tree', 'kd_tree', 'brute'),
                          'n_neighbors': (1, 2, 3, 4, 5)}),
 KNeighborsClassifier(algorithm='ball_tree', n_neighbors=1),
 0.9569811320754716,
 [{'algorithm': 'ball_tree', 'n_neighbors': 1},
  {'algorithm': 'ball_tree', 'n_neighbors': 2},
  {'algorithm': 'ball_tree', 'n_neighbors': 3},
  {'algorithm': 'ball_tree', 'n_neighbors': 4},
  {'algorithm': 'ball_tree', 'n_neighbors': 5},
  {'algorithm': 'kd_tree', 'n_neighbors': 1},
  {'algorithm': 'kd_tree', 'n_neighbors': 2},
  {'algorithm': 'kd_tree', 'n_neighbors': 3},
  {'algorithm': 'kd_tree', 'n_neighbors': 4},
  {'algorithm': 'kd_tree', 'n_neighbors': 5},
  {'algorithm': 'brute', 'n_neighbors': 1},
  {'algorithm': 'brute', 'n_neighbors': 2},
  {'algorithm': 'brute', 'n_neighbors': 3},
  {'algorithm': 'brute', 'n_neighbors': 4},
  {'algorithm': 'brute', 'n_neighbors': 5}],
 array([0.95698113, 0.94415094, 0.95698

### Decision Tree
> https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [8]:
from sklearn.tree import DecisionTreeClassifier
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'max_features': ['auto', 'sqrt', 'log2']
}
searchCV(X=train_X, y=train_y, params=params, estimator=DecisionTreeClassifier())

(GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
              param_grid={'max_depth': [2, 4, 6, 8, 10],
                          'max_features': ['auto', 'sqrt', 'log2']}),
 DecisionTreeClassifier(max_depth=10, max_features='auto'),
 0.9320754716981131,
 [{'max_depth': 2, 'max_features': 'auto'},
  {'max_depth': 2, 'max_features': 'sqrt'},
  {'max_depth': 2, 'max_features': 'log2'},
  {'max_depth': 4, 'max_features': 'auto'},
  {'max_depth': 4, 'max_features': 'sqrt'},
  {'max_depth': 4, 'max_features': 'log2'},
  {'max_depth': 6, 'max_features': 'auto'},
  {'max_depth': 6, 'max_features': 'sqrt'},
  {'max_depth': 6, 'max_features': 'log2'},
  {'max_depth': 8, 'max_features': 'auto'},
  {'max_depth': 8, 'max_features': 'sqrt'},
  {'max_depth': 8, 'max_features': 'log2'},
  {'max_depth': 10, 'max_features': 'auto'},
  {'max_depth': 10, 'max_features': 'sqrt'},
  {'max_depth': 10, 'max_features': 'log2'}],
 array([0.80603774, 0.7909434 , 0.79169811, 0.85811321, 0.85811321,
    

### SVM
> https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [19]:
from sklearn.svm import SVC
params = {
    'C': [3.0, 4.0, 5.0, 6.0, 7.0],
    'kernel':['linear','poly', 'rbf']
}
searchCV(X=train_X, y=train_y, params=params, estimator=SVC())

(GridSearchCV(cv=5, estimator=SVC(),
              param_grid={'C': [3.0, 4.0, 5.0, 6.0, 7.0],
                          'kernel': ['linear', 'poly', 'rbf']}),
 SVC(C=7.0),
 0.9826415094339623,
 [{'C': 3.0, 'kernel': 'linear'},
  {'C': 3.0, 'kernel': 'poly'},
  {'C': 3.0, 'kernel': 'rbf'},
  {'C': 4.0, 'kernel': 'linear'},
  {'C': 4.0, 'kernel': 'poly'},
  {'C': 4.0, 'kernel': 'rbf'},
  {'C': 5.0, 'kernel': 'linear'},
  {'C': 5.0, 'kernel': 'poly'},
  {'C': 5.0, 'kernel': 'rbf'},
  {'C': 6.0, 'kernel': 'linear'},
  {'C': 6.0, 'kernel': 'poly'},
  {'C': 6.0, 'kernel': 'rbf'},
  {'C': 7.0, 'kernel': 'linear'},
  {'C': 7.0, 'kernel': 'poly'},
  {'C': 7.0, 'kernel': 'rbf'}],
 array([0.9290566 , 0.97433962, 0.97886792, 0.93056604, 0.97207547,
        0.98037736, 0.92981132, 0.97056604, 0.98188679, 0.92981132,
        0.96981132, 0.98188679, 0.93056604, 0.97207547, 0.98264151]),
 array([15,  6,  5, 11,  7,  4, 13,  9,  2, 13, 10,  2, 12,  8,  1]))

### Random Forest
> https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [20]:
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 2, 4, 6, 8]
}
searchCV(X=train_X, y=train_y, params=params, estimator=RandomForestClassifier())

(GridSearchCV(cv=5, estimator=RandomForestClassifier(),
              param_grid={'max_depth': [None, 2, 4, 6, 8],
                          'n_estimators': [100, 200, 300, 400, 500]}),
 RandomForestClassifier(n_estimators=200),
 0.9743396226415095,
 [{'max_depth': None, 'n_estimators': 100},
  {'max_depth': None, 'n_estimators': 200},
  {'max_depth': None, 'n_estimators': 300},
  {'max_depth': None, 'n_estimators': 400},
  {'max_depth': None, 'n_estimators': 500},
  {'max_depth': 2, 'n_estimators': 100},
  {'max_depth': 2, 'n_estimators': 200},
  {'max_depth': 2, 'n_estimators': 300},
  {'max_depth': 2, 'n_estimators': 400},
  {'max_depth': 2, 'n_estimators': 500},
  {'max_depth': 4, 'n_estimators': 100},
  {'max_depth': 4, 'n_estimators': 200},
  {'max_depth': 4, 'n_estimators': 300},
  {'max_depth': 4, 'n_estimators': 400},
  {'max_depth': 4, 'n_estimators': 500},
  {'max_depth': 6, 'n_estimators': 100},
  {'max_depth': 6, 'n_estimators': 200},
  {'max_depth': 6, 'n_estimators': 300

### Artificial Neural Network
> https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

In [19]:
from sklearn.neural_network import MLPClassifier

params = {
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}
searchCV(X=train_X, y=train_y, params=params, estimator=MLPClassifier(max_iter=2000))

(GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=2000),
              param_grid={'activation': ['relu', 'tanh', 'logistic'],
                          'learning_rate': ['constant', 'invscaling',
                                            'adaptive'],
                          'solver': ['lbfgs', 'sgd', 'adam']}),
 MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=2000),
 0.9781132075471698,
 [{'activation': 'relu', 'learning_rate': 'constant', 'solver': 'lbfgs'},
  {'activation': 'relu', 'learning_rate': 'constant', 'solver': 'sgd'},
  {'activation': 'relu', 'learning_rate': 'constant', 'solver': 'adam'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'lbfgs'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'sgd'},
  {'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'adam'},
  {'activation': 'relu', 'learning_rate': 'adaptive', 'solver': 'lbfgs'},
  {'activation': 'relu', 'learning_rate': 'adaptive', 'solver': 

### AdaBoost
https://scikit-learn.org/stable/modules/ensemble.html#adaboost

In [18]:
from sklearn.ensemble import AdaBoostClassifier
params = {
    'n_estimators': [25, 50, 100, 200, 400],
    'learning_rate': [.25, .5, 1, 2, 4]
}
searchCV(X=train_X, y=train_y, params=params, estimator=AdaBoostClassifier())

(GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
              param_grid={'learning_rate': [0.25, 0.5, 1, 2, 4],
                          'n_estimators': [25, 50, 100, 200, 400]}),
 AdaBoostClassifier(learning_rate=1, n_estimators=100),
 0.9532075471698114,
 [{'learning_rate': 0.25, 'n_estimators': 25},
  {'learning_rate': 0.25, 'n_estimators': 50},
  {'learning_rate': 0.25, 'n_estimators': 100},
  {'learning_rate': 0.25, 'n_estimators': 200},
  {'learning_rate': 0.25, 'n_estimators': 400},
  {'learning_rate': 0.5, 'n_estimators': 25},
  {'learning_rate': 0.5, 'n_estimators': 50},
  {'learning_rate': 0.5, 'n_estimators': 100},
  {'learning_rate': 0.5, 'n_estimators': 200},
  {'learning_rate': 0.5, 'n_estimators': 400},
  {'learning_rate': 1, 'n_estimators': 25},
  {'learning_rate': 1, 'n_estimators': 50},
  {'learning_rate': 1, 'n_estimators': 100},
  {'learning_rate': 1, 'n_estimators': 200},
  {'learning_rate': 1, 'n_estimators': 400},
  {'learning_rate': 2, 'n_estimators': 25},

## Dimension Reduction

> https://scikit-learn.org/stable/modules/feature_selection.html
> 
> https://scikit-learn.org/stable/modules/unsupervised_reduction.html

### Feature Extration (3)

#### PCA

In [11]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
train_pca_X = pca.fit_transform(train_X)
test_pca_X = pca.transform(test_X)
train_pca_X.shape

(1325, 4)

#### SVD

In [12]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=4)
train_svd_X = svd.fit_transform(train_X)
test_svd_X = svd.transform(test_X)
train_svd_X.shape

(1325, 4)

#### NMF

In [15]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=4, max_iter=1000)
train_nmf_X = nmf.fit_transform(train_X)
test_nmf_X = nmf.transform(test_X)
train_nmf_X.shape



(1325, 4)

### Warpper Feature Selection (2)

#### Forward Feature Construction

In [16]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
ffs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=4)
train_ffs_X = ffs.fit_transform(train_X, train_y)
test_ffs_X = ffs.transform(test_X)
train_ffs_X.shape

(1325, 4)

#### Backward Feature Elimination

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
bfe = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=4, direction='backward')
train_bfe_X = bfe.fit_transform(train_X, train_y)
test_bfe_X = bfe.transform(test_X)
train_bfe_X.shape

(1325, 4)

### Embedded Methods (3)

#### Decision Trees


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

tree = SelectFromModel(estimator=DecisionTreeClassifier(), max_features=4)
train_tree_X = tree.fit_transform(train_X, train_y)
test_tree_X = tree.transform(test_X)
train_tree_X.shape

(1325, 4)

#### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

forest = SelectFromModel(estimator=RandomForestClassifier(), max_features=4)
train_forest_X = forest.fit_transform(train_X, train_y)
test_forest_X = forest.transform(test_X)
train_forest_X.shape

(1325, 4)

#### LASSO Regression (L1-Based Feature Selection)

In [44]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lr = SelectFromModel(LinearSVC(C=0.013, penalty="l1", dual=False).fit(train_X, train_y), prefit=True)
train_lr_X = lr.transform(train_X)
test_lr_X = lr.transform(test_X)
train_lr_X.shape

(1419, 4)