In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

## Ex. 6.12

In [2]:
def get_data(path):
    data=np.loadtxt(path)
    X=data[:, 1:]
    y=data[:, 0]
    return X, y

X_train, y_train=get_data('./data/zip.train')
X_test, y_test=get_data('./data/zip.test')
print(X_train.shape, X_test.shape)
print(np.unique(y_train, return_counts=True)[1])
print(np.unique(y_test, return_counts=True)[1])

(7291, 256) (2007, 256)
[1194 1005  731  658  652  556  664  645  542  644]
[359 264 198 166 200 160 170 147 166 177]


In [3]:
fit1 = LDA().fit(X_train, y_train)
fit2 = QDA().fit(X_train, y_train)
fit3 = LR(random_state=0).fit(X_train, y_train)
result1 = fit1.score(X_train, y_train), fit1.score(X_test, y_test)
result2 = fit2.score(X_train, y_train), fit2.score(X_test, y_test)
result3 = fit3.score(X_train, y_train), fit3.score(X_test, y_test)

print(result1)
print(result2)
print(result3)

(0.9380057605266767, 0.885401096163428)
(0.920175558908243, 0.8156452416542103)
(0.9938280071320806, 0.9108121574489287)


In [4]:
fit0 = LDA(n_components=9).fit(X_train, y_train)
X_train_t = fit0.transform(X_train)
X_test_t = fit0.transform(X_test)

fit1 = LDA().fit(X_train_t, y_train)
fit2 = QDA().fit(X_train_t, y_train)
fit3 = LR(random_state=0).fit(X_train_t, y_train)
result1 = fit1.score(X_train_t, y_train), fit1.score(X_test_t, y_test)
result2 = fit2.score(X_train_t, y_train), fit2.score(X_test_t, y_test)
result3 = fit3.score(X_train_t, y_train), fit3.score(X_test_t, y_test)

print(result1)
print(result2)
print(result3)

(0.9380057605266767, 0.885401096163428)
(0.9502125908654505, 0.8923766816143498)
(0.9478809491153477, 0.8928749377179871)


In [5]:
def fit(X_train, y_train, space):
    nums = []
    kdes = []
    for i in range(10):
        nums.append(np.sum(y_train == i))
        X = X_train[y_train == i]
        grid = GridSearchCV(KernelDensity(), {'bandwidth': space}).fit(X)
        kdes.append(grid.best_estimator_)
        print('{}th ({}) best bandwidth: {}'.format(i+1, nums[-1], kdes[-1].bandwidth))
    
    return np.log(nums), kdes

def score(pri, lik, X, y):
    Y = []
    for i, kde in enumerate(lik):
        Y.append(pri[i] + kde.score_samples(X))
        
    yh = np.argmax(Y, axis=0)
    return (y == yh).mean()

In [6]:
pri1, lik1 = fit(X_train, y_train, np.linspace(0.05, 0.4, 50))

1th (1194) best bandwidth: 0.3
2th (1005) best bandwidth: 0.1
3th (731) best bandwidth: 0.27142857142857146
4th (658) best bandwidth: 0.3142857142857143
5th (652) best bandwidth: 0.2571428571428572
6th (556) best bandwidth: 0.30714285714285716
7th (664) best bandwidth: 0.25
8th (645) best bandwidth: 0.1928571428571429
9th (542) best bandwidth: 0.34285714285714286
10th (644) best bandwidth: 0.2357142857142857


In [7]:
pri2, lik2 = fit(X_train_t, y_train, np.linspace(0.2, 0.9, 50))

1th (1194) best bandwidth: 0.5714285714285714
2th (1005) best bandwidth: 0.2285714285714286
3th (731) best bandwidth: 0.7285714285714286
4th (658) best bandwidth: 0.6428571428571428
5th (652) best bandwidth: 0.6428571428571428
6th (556) best bandwidth: 0.8
7th (664) best bandwidth: 0.6142857142857143
8th (645) best bandwidth: 0.5285714285714286
9th (542) best bandwidth: 0.7
10th (644) best bandwidth: 0.5285714285714286


In [8]:
result4 = score(pri1, lik1, X_train, y_train), score(pri1, lik1, X_test, y_test)
result5 = score(pri2, lik2, X_train_t, y_train), score(pri2, lik2, X_test_t, y_test)
print(result4)
print(result5)
pri0 = np.zeros(10)
result4 = score(pri0, lik1, X_train, y_train), score(pri0, lik1, X_test, y_test)
result5 = score(pri0, lik2, X_train_t, y_train), score(pri0, lik2, X_test_t, y_test)
print(result4)
print(result5)

(0.6250171444246331, 0.2640757349277529)
(0.9982169798381566, 0.8963627304434479)
(0.6246056782334385, 0.2640757349277529)
(0.9983541352352215, 0.8943697060288989)


In [10]:
X_train = pd.read_csv('./data/vowel.train', index_col=0)
X_test = pd.read_csv('./data/vowel.test', index_col=0)
y_train = X_train.pop('y')
y_test = X_test.pop('y')
X_train = X_train.values
y_train = y_train.values - 1
X_test = X_test.values
y_test = y_test.values - 1

print(X_train.shape, X_test.shape)
print(np.unique(y_train, return_counts=True)[1])
print(np.unique(y_test, return_counts=True)[1])

fit1 = LDA().fit(X_train, y_train)
fit2 = QDA().fit(X_train, y_train)
fit3 = LR(random_state=0).fit(X_train, y_train)
result1 = fit1.score(X_train, y_train), fit1.score(X_test, y_test)
result2 = fit2.score(X_train, y_train), fit2.score(X_test, y_test)
result3 = fit3.score(X_train, y_train), fit3.score(X_test, y_test)

print(result1)
print(result2)
print(result3)

pri, lik = fit(X_train, y_train, np.linspace(0.3, 0.7, 50))
result4 = score(pri, lik, X_train, y_train), score(pri, lik, X_test, y_test)
print(result4)

(528, 10) (462, 10)
[48 48 48 48 48 48 48 48 48 48 48]
[42 42 42 42 42 42 42 42 42 42 42]
(0.6837121212121212, 0.44372294372294374)
(0.9886363636363636, 0.47186147186147187)
(0.7253787878787878, 0.461038961038961)
1th (48) best bandwidth: 0.6591836734693877
2th (48) best bandwidth: 0.44693877551020406
3th (48) best bandwidth: 0.34897959183673466
4th (48) best bandwidth: 0.3979591836734694
5th (48) best bandwidth: 0.4061224489795918
6th (48) best bandwidth: 0.43061224489795913
7th (48) best bandwidth: 0.4877551020408163
8th (48) best bandwidth: 0.3571428571428571
9th (48) best bandwidth: 0.4714285714285714
10th (48) best bandwidth: 0.5693877551020408
(0.8674242424242424, 0.5194805194805194)
