In [1]:
import numpy as np
%matplotlib inline

A = np.array([[3,2000],[2,3000],[4,5000],[5,8000],[1,2000]],dtype = 'float')

A

array([[3.e+00, 2.e+03],
       [2.e+00, 3.e+03],
       [4.e+00, 5.e+03],
       [5.e+00, 8.e+03],
       [1.e+00, 2.e+03]])

In [2]:
mean =np.mean(A,axis = 0)
norm = A - mean
scope = np.max(norm,axis = 0)- np.min(norm,axis = 0)
norm = norm/scope
norm

array([[ 0.        , -0.33333333],
       [-0.25      , -0.16666667],
       [ 0.25      ,  0.16666667],
       [ 0.5       ,  0.66666667],
       [-0.5       , -0.33333333]])

In [3]:
U,S,V = np.linalg.svd(np.dot(norm.T,norm))
U

array([[-0.67710949, -0.73588229],
       [-0.73588229,  0.67710949]])

In [4]:
U_reduce = U[:,0].reshape(2,1)
U_reduce


array([[-0.67710949],
       [-0.73588229]])

In [5]:

R = np.dot(norm,U_reduce)
R

array([[ 0.2452941 ],
       [ 0.29192442],
       [-0.29192442],
       [-0.82914294],
       [ 0.58384884]])

In [6]:
Z = np.dot(R,U_reduce.T)
Z

array([[-0.16609096, -0.18050758],
       [-0.19766479, -0.21482201],
       [ 0.19766479,  0.21482201],
       [ 0.56142055,  0.6101516 ],
       [-0.39532959, -0.42964402]])

In [7]:
np.multiply(Z,scope)+mean

array([[2.33563616e+00, 2.91695452e+03],
       [2.20934082e+00, 2.71106794e+03],
       [3.79065918e+00, 5.28893206e+03],
       [5.24568220e+00, 7.66090960e+03],
       [1.41868164e+00, 1.42213588e+03]])

In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

def std_PCA(**argv):
    scaler = MinMaxScaler()
    pca = PCA(**argv)
    pipeline = Pipeline([('scalar',scaler),('pca',pca)])
    return pipeline

pca = std_PCA(n_components=1)
R2 = pca.fit_transform(A)
R2

array([[-0.2452941 ],
       [-0.29192442],
       [ 0.29192442],
       [ 0.82914294],
       [-0.58384884]])

In [9]:
pca.inverse_transform(R2)

array([[2.33563616e+00, 2.91695452e+03],
       [2.20934082e+00, 2.71106794e+03],
       [3.79065918e+00, 5.28893206e+03],
       [5.24568220e+00, 7.66090960e+03],
       [1.41868164e+00, 1.42213588e+03]])

In [10]:
import time 
import logging

from sklearn.datasets import fetch_olivetti_faces

logging.basicConfig(level=logging.INFO,format='%(asctime)s %(message)s')

data_home = '/'
logging.info('Start to load dataset')
faces = fetch_olivetti_faces(data_home = data_home)
logging.info('Done with load dataset')

2020-12-01 13:06:10,056 Start to load dataset
2020-12-01 13:06:10,092 Done with load dataset


In [11]:
X = faces.data
Y = faces.target
targets = np.unique(faces.target)

target_names = np.array(['c%d' % t for t in targets])
n_targets = target_names.shape[0]
n_samples,h,w = faces.images.shape
print('Sample count:{}\nTarget count:{}'.format(n_samples,n_targets))
print('Image size:{}x{}\nDataset shape:{}\n'.format(w,h,X.shape))


Sample count:400
Target count:40
Image size:64x64
Dataset shape:(400, 4096)



In [12]:
%matplotlib
from matplotlib import pyplot as plt

def plot_gallery(images,titles,h,w,n_row = 2,n_col = 5):
    plt.figure(figsize = (2*n_col,2.2*n_row),dpi = 144)
    plt.subplots_adjust(bottom = 0,left = .01,right = .99,top=.90,hspace = 0.01)
    for i in range(n_row*n_col):
        plt.subplot(n_row,n_col,i+1)
        plt.imshow(images[i].reshape((h,w)),cmap = plt.cm.gray)
        plt.title(titles[i])
        plt.axis('off')
        #plt.show()

Using matplotlib backend: Qt5Agg


In [13]:

n_row = 2
n_col = 6
sample_images = None
sample_titles = []

for i in range(n_targets):
    people_images = X[Y==i]
    people_sample_index = np.random.randint(0,people_images.shape[0],1)
    people_sample_image = people_images[people_sample_index,:]
    if sample_images is not None:
        sample_images =np.concatenate((sample_images,people_sample_image),axis =0 )
    else:
        sample_images = people_sample_image
    sample_titles.append(target_names[i])
    
plot_gallery(sample_images,sample_titles,h,w,n_row,n_col)
        
    

In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 4)

In [15]:
from sklearn.svm import SVC

start = time.perf_counter()
print('Fitting train databases ...')
clf = SVC(class_weight = 'balanced')
clf.fit(X_train,Y_train)
print('Done in {0:.2f}s'.format(time.perf_counter()-start))


Fitting train databases ...
Done in 0.70s


In [16]:
start = time.clock()
print('predicing test dataset ...')
Y_pred = clf.predict(X_test)
print('Done in {0:.2f}s'.format(time.perf_counter()-start))

predicing test dataset ...
Done in 0.08s


  """Entry point for launching an IPython kernel.


In [17]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test,Y_pred,labels = range(n_targets))
np.set_printoptions(threshold=np.nan)

print(cm)

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [18]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred,target_names = target_names))

             precision    recall  f1-score   support

         c0       0.00      0.00      0.00         1
         c1       0.00      0.00      0.00         3
         c2       0.00      0.00      0.00         2
         c3       0.00      0.00      0.00         1
         c4       0.00      0.00      0.00         1
         c5       0.00      0.00      0.00         1
         c6       0.00      0.00      0.00         4
         c7       0.00      0.00      0.00         2
         c8       0.00      0.00      0.00         4
         c9       0.00      0.00      0.00         2
        c10       0.00      0.00      0.00         1
        c11       0.00      0.00      0.00         0
        c12       0.00      0.00      0.00         4
        c13       0.00      0.00      0.00         4
        c14       0.00      0.00      0.00         1
        c15       0.00      0.00      0.00         1
        c16       0.00      0.00      0.00         3
        c17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [19]:
from sklearn.decomposition import PCA

print('Exploring explained variance radio for dataset ...')

candidate_components = range(10,300,30)
explained_ratios = []
start = time.clock()
for c in candidate_components:
    pca = PCA(n_components=c)
    X_pca = pca.fit_transform(X)
    explained_ratios.append(np.sum(pca.explained_variance_ratio_))
print('done in {0:.2f} s'.format(time.clock()-start))

Exploring explained variance radio for dataset ...


  import sys


done in 1.41 s


  if sys.path[0] == '':


In [20]:
from matplotlib import pyplot as plt

plt.figure(figsize=(10,6),dpi = 144)
plt.grid()
plt.plot(candidate_components,explained_ratios)
plt.xlabel('Number of PCA Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained variance ratio for PCA')
plt.yticks(np.arange(0.5,1.05,0.05))
plt.xticks(np.arange(0,300,20))

([<matplotlib.axis.XTick at 0x2b1f6feab70>,
  <matplotlib.axis.XTick at 0x2b1f6fea4a8>,
  <matplotlib.axis.XTick at 0x2b1f6fea390>,
  <matplotlib.axis.XTick at 0x2b1f74c97f0>,
  <matplotlib.axis.XTick at 0x2b1f74c9d30>,
  <matplotlib.axis.XTick at 0x2b1f74d22b0>,
  <matplotlib.axis.XTick at 0x2b1f74d27f0>,
  <matplotlib.axis.XTick at 0x2b1f74d2d30>,
  <matplotlib.axis.XTick at 0x2b1f74db2b0>,
  <matplotlib.axis.XTick at 0x2b1f74d2668>,
  <matplotlib.axis.XTick at 0x2b1f74bb978>,
  <matplotlib.axis.XTick at 0x2b1f74db6a0>,
  <matplotlib.axis.XTick at 0x2b1f74dbc50>,
  <matplotlib.axis.XTick at 0x2b1f74e41d0>,
  <matplotlib.axis.XTick at 0x2b1f74e4710>],
 <a list of 15 Text xticklabel objects>)

In [36]:

def title_prefix(prefix, title):
    return "{}: {}".format(prefix, title)

n_row = 1
n_col = 5

sample_images = sample_images[0:5]
sample_titles = sample_titles[0:5]

plotting_images = sample_images
plotting_titles = [title_prefix('orig', t) for t in sample_titles]
candidate_components = [140, 75, 37, 19, 8]
for c in candidate_components:
    print("Fitting and projecting on PCA(n_components={}) ...".format(c))
    start = time.clock()
    pca = PCA(n_components=c)
    pca.fit(X)
    X_sample_pca = pca.transform(sample_images)
    X_sample_inv = pca.inverse_transform(X_sample_pca)
    plotting_images = np.concatenate((plotting_images, X_sample_inv), axis=0)
    sample_title_pca = [title_prefix('{}'.format(c), t) for t in sample_titles]
    plotting_titles = np.concatenate((plotting_titles, sample_title_pca), axis=0)
    print("Done in {0:.2f}s".format(time.clock() - start))

print("Plotting sample image with different number of PCA conpoments ...")
plot_gallery(plotting_images, plotting_titles, h, w,
    n_row * (len(candidate_components) + 1), n_col)

Fitting and projecting on PCA(n_components=140) ...
Done in 0.14s
Fitting and projecting on PCA(n_components=75) ...


  app.launch_new_instance()


Done in 0.08s
Fitting and projecting on PCA(n_components=37) ...
Done in 0.11s
Fitting and projecting on PCA(n_components=19) ...
Done in 0.06s
Fitting and projecting on PCA(n_components=8) ...
Done in 0.07s
Plotting sample image with different number of PCA conpoments ...


In [70]:
n_components = 140
print('fitting PCA by using training data ...')
start = time.clock()
pca = PCA(n_components=n_components,svd_solver='randomized',whiten = True).fit(X_train)
print("Done in {0:.2f}s".format(time.clock() - start))

fitting PCA by using training data ...
Done in 0.12s


  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [71]:
from sklearn.model_selection import GridSearchCV

print("Searching the best parameters for SVC ...")
param_grid = {'C': [1, 5, 10, 50, 100],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid, verbose=2, n_jobs=4)
clf = clf.fit(X_train_pca, Y_train)
print("Best parameters found by grid search:")
print(clf.best_params_)


Searching the best parameters for SVC ...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters found by grid search:
{'C': 10, 'gamma': 0.001}


[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    1.5s finished


In [72]:
start = time.perf_counter()
print("Predict test dataset ...")
y_pred = clf.best_estimator_.predict(X_test_pca)
cm = confusion_matrix(Y_test,y_pred,labels = range(n_targets))
print("Done in {0:.2f}.\n".format(time.perf_counter()-start))
print("confusion matrix:")
np.set_printoptions(threshold=np.nan)
print(cm)

Predict test dataset ...
Done in 0.00.

confusion matrix:
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0

In [74]:
print(classification_report(Y_test,y_pred,target_names=target_names))

             precision    recall  f1-score   support

         c0       0.50      1.00      0.67         1
         c1       1.00      0.67      0.80         3
         c2       1.00      0.50      0.67         2
         c3       1.00      1.00      1.00         1
         c4       1.00      1.00      1.00         1
         c5       1.00      1.00      1.00         1
         c6       1.00      0.75      0.86         4
         c7       1.00      1.00      1.00         2
         c8       1.00      1.00      1.00         4
         c9       1.00      1.00      1.00         2
        c10       1.00      1.00      1.00         1
        c11       1.00      1.00      1.00         4
        c12       1.00      1.00      1.00         4
        c13       1.00      1.00      1.00         1
        c14       1.00      1.00      1.00         1
        c15       0.75      1.00      0.86         3
        c16       1.00      1.00      1.00         2
        c17       1.00      1.00      1.00   

  .format(len(labels), len(target_names))
