In [2]:
from time import time
import logging
import matplotlib.pyplot as plt
from tensorflow import keras 
from tensorflow.keras import layers

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn import metrics



In [3]:
# print(__doc__)

# Send progress logs to stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


# #############################################################################
# Fetch the LFW faces (downloaded on first use) and load them as numpy arrays

lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# dimensions of the image stack: number of images, height, width
# (h and w are needed later to reshape rows back into pictures)
n_samples, h, w = lfw_people.images.shape

# the models consume the flattened pixel rows directly; the relative
# position of the pixels is ignored by them
X = lfw_people.data
n_features = X.shape[-1]

# the value to predict is the integer id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = len(target_names)

print(
    "Total dataset size:",
    "n_samples: %d" % n_samples,
    "n_features: %d" % n_features,
    "n_classes: %d" % n_classes,
    sep="\n",
)
# bare expression in the middle of a cell: evaluated but not displayed
target_names

# #############################################################################
# Hold out 25% of the samples as a test set (seeded shuffle; no `stratify`
# argument is passed, so the split is not stratified)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


# #############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
# The randomized SVD solver is stochastic: pin its seed so the eigenfaces, and
# every score computed from the projected features, are the same on each run.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True, random_state=42).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# each principal component is a flattened image; reshape to (h, w) for plotting
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
# the projection is fitted on the training split only and applied to both splits
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))


# #############################################################################
# Fit an RBF-kernel SVM, choosing C and gamma by cross-validated grid search

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(
    estimator=SVC(kernel='rbf', class_weight='balanced'),
    param_grid=param_grid,
).fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


# #############################################################################
# Score the tuned model on the held-out test set

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# per-class precision / recall / f1, then the raw confusion counts
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


# #############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a grid of grayscale portraits, one titled panel per image.

    Parameters
    ----------
    images : sized, indexable collection
        Each ``images[i]`` must support ``.reshape((h, w))``.
    titles : sized, indexable collection of str
        ``titles[i]`` is drawn above ``images[i]``.
    h, w : int
        Height and width each image is reshaped to before display.
    n_row, n_col : int, optional
        Grid dimensions; at most ``n_row * n_col`` panels are drawn.

    Notes
    -----
    Only as many panels as there are images *and* titles are drawn, so a
    short input leaves the trailing grid cells empty instead of raising
    ``IndexError``.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # never index past the end of either input
    n_panels = min(n_row * n_col, len(images), len(titles))
    for i in range(n_panels):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # hide the axis ticks: the panels are pictures, not charts
        plt.xticks(())
        plt.yticks(())


# plot the result of the prediction on a portion of the test set

def title(y_pred, y_test, target_names, i):
    """Build the two-line caption (predicted vs. true name) for sample ``i``.

    Only the part of each name after its last space is shown.
    """
    def last_word(label_id):
        return target_names[label_id].rsplit(' ', 1)[-1]

    shown = (last_word(y_pred[i]), last_word(y_test[i]))
    return 'predicted: %s\ntrue:      %s' % shown

# caption every test prediction, then draw the first panels of the test set
prediction_titles = [
    title(y_pred, y_test, target_names, idx) for idx in range(len(y_pred))
]
plot_gallery(X_test, prediction_titles, h, w)

# gallery of the leading eigenfaces (PCA components)
eigenface_titles = ["eigenface %d" % idx for idx in range(len(eigenfaces))]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()

In [4]:
#Task 1 Random Forest########################################################
# Random forest of depth-2 trees (fixed seed), trained on the PCA features
lf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train_pca, y_train)
predict = lf.predict(X_test_pca)
forest_accuracy = accuracy_score(y_test, predict)
print("Accuracy:  {:.2f}".format(forest_accuracy))
#############################################################################

In [5]:
#Task 2 MLP ###############################################################################

# Train an MLP classification model on the PCA features

t0 = time()
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(15, 10), random_state=1)

clf = clf.fit(X_train_pca, y_train)
# score the held-out split exactly once; scoring is part of the timed region
print(clf.score(X_test_pca, y_test))
print("done in %0.3fs" % (time() - t0))

##########################################################################################

In [6]:
#######Task 2 extended: hyper-parameter search ####################################################
mlp = MLPClassifier(max_iter=100)
para = {
    'hidden_layer_sizes': [(15, 10)],
    # 'adam' is a solver, not an activation function, so it is not a candidate here
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}
# exhaustive search over every combination in `para`, 3-fold CV, all CPU cores
clf = GridSearchCV(mlp, para, n_jobs=-1, cv=3)
# tune on the training split only, so the test split stays unseen for the final score
clf.fit(X_train_pca, y_train)

# best parameter set
print('Best parameters found:\n', clf.best_params_)
#######################################################################################

In [7]:
#Task 2 extended: MLP with the selected hyper-parameters ################################################
# Train an MLP classification model on the PCA features
t0 = time()
clf = MLPClassifier(solver='adam', alpha=0.0001, hidden_layer_sizes=(15, 10), random_state=1, activation='tanh', learning_rate='constant')

clf = clf.fit(X_train_pca, y_train)
# score the held-out split exactly once; scoring is part of the timed region
print(clf.score(X_test_pca, y_test))
print("done in %0.3fs" % (time() - t0))
#################################################################################################

In [8]:
#Task 3 CNN model #####################################################################

# Give every image an explicit trailing channel axis: (samples, height, width, 1).
# h and w are taken from the loaded image stack instead of being hard-coded, so the
# reshape keeps working if the `resize` factor of the loader is changed.
X_train = X_train.reshape(len(X_train), h, w, 1)
X_test = X_test.reshape(len(X_test), h, w, 1)

input_shape = (h, w, 1)
# one output unit per person actually present in the dataset
output_classes = n_classes
num_classes = n_classes

## Build the model
#ANN##############################################################################
# Dense baseline: flatten the image, one hidden layer, raw logits as output
model = keras.Sequential([
    layers.Flatten(input_shape=input_shape),
    layers.Dense(128, activation='relu'),
    layers.Dense(output_classes),
])

# the last layer has no activation, hence from_logits=True in the loss
model.compile(
    optimizer='adam',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

## Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128)


## Evaluate the trained model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print('\n Test accuracy:', test_acc)
##################################################################################

# Convolutional network: two conv + max-pool stages, dropout, softmax classifier
model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

# The last layer already applies softmax, so the loss must treat the model output as
# probabilities (from_logits=False) rather than as raw logits.
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
model.fit(X_train, y_train, batch_size=128, epochs=30)

# evaluate on the held-out split: score is [loss, accuracy]
score = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", score[0])
print("test accuracy:", score[1])
###############################################################################################

In [15]:
#Task 4 Clustering ###################################################################
# Bring pixel values into [0, 1]. The divisor is chosen from the data so that re-running
# this cell, or starting from pixels that are already in [0, 1], does not divide by 255
# a second time. Both splits share the divisor decided on the training split.
pixel_scale = 255.0 if X_train.max() > 1.0 else 1.0
X_train = X_train / pixel_scale
X_test = X_test / pixel_scale

print(X_train.min())
print(X_train.max())

# K-means needs one flat feature row per sample
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

print(X_train.shape)
print(X_test.shape)

# seeded so the cluster assignment is repeatable from run to run
kmeans = KMeans(n_clusters=7, random_state=0)
kmeans.fit(X_train)

kmeans.labels_
######################################################################################

In [10]:
#Task 4 Clustering ###################################################################
# Candidate numbers of K-means clusters; the loops in this notebook iterate over this list
cluster_number = [10,16,36,64,144,256]

def retrieve_info(cluster_labels, y_train):
    """Map each cluster id to the most frequent true label among its members.

    Parameters
    ----------
    cluster_labels : array-like of int
        Cluster id assigned to each sample.
    y_train : array-like of non-negative int
        True label of each sample, aligned with ``cluster_labels``.

    Returns
    -------
    dict
        ``{cluster_id: majority_label}`` with one entry per cluster id that
        occurs in ``cluster_labels``. Ties go to the smallest label.

    Notes
    -----
    The mapping is computed only from the arguments (no global model is
    read), and only over ids that actually occur, so a gap in the cluster
    ids cannot produce an empty selection.
    """
    cluster_labels = np.asarray(cluster_labels)
    y_train = np.asarray(y_train)
    reference_labels = {}
    for cluster_id in np.unique(cluster_labels):
        members = y_train[cluster_labels == cluster_id]
        # bincount counts occurrences per label value; argmax picks the mode
        reference_labels[int(cluster_id)] = np.bincount(members).argmax()
    return reference_labels
# cluster id -> most frequent true label, for the K-means model currently bound to `kmeans`
reference_labels = retrieve_info(kmeans.labels_,y_train)

def metrics(model,output):
    """Print the number of clusters ``model`` was configured with.

    ``output`` is accepted but not used.

    NOTE(review): this definition rebinds the name ``metrics`` that the
    import cell binds to the ``sklearn.metrics`` module.
    """
    message = "Number of clusters is {}".format(model.n_clusters)
    print(message)
    
# Fit K-means once per candidate cluster count and report how well the
# "majority label of my cluster" rule reproduces the training labels.
for n_clusters in cluster_number:
    # seeded so the reported accuracies are repeatable
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X_train)
    metrics(kmeans, y_train)

    # cluster id -> majority true label, then one derived label per training sample
    reference_labels = retrieve_info(kmeans.labels_, y_train)
    number_labels = np.array([reference_labels[c] for c in kmeans.labels_])

    # accuracy_score takes (y_true, y_pred)
    print("Accuracy score : {}".format(accuracy_score(y_train, number_labels)))
    print("\n")
    
   ######################################################################################

In [11]:
#Task 4 Clustering ###################################################################
# Elbow analysis: K-means inertia (within-cluster sum of squares) per cluster count.
# Every model is seeded so the curve is the same on each run.
wcss = []
for n_clusters in cluster_number:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X_test)
    wcss.append(kmeans.inertia_)

number_clusters = cluster_number
plt.plot(number_clusters, wcss)
plt.title('')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

# 3-cluster model fitted on the test split; `cl` holds its cluster id per test sample
kmeans = KMeans(n_clusters=3, random_state=0)
cl = kmeans.fit_predict(X_test)

######################################################################################

In [12]:
#Task 4 Clustering ###################################################################
# Cluster-derived labels of the training samples whose true class id is 3
# (presumably George W Bush, going by the variable name; confirm against target_names)
Xbush = np.array(
    [label for idx, label in enumerate(number_labels) if y_train[idx] == 3]
)
Xbush



In [16]:
# Test images whose true class id is 3 (presumably George W Bush, going by the
# variable name; confirm against target_names). The mask has to come from y_test:
# the rows of X_test are aligned with y_test, not with y_train.
X_bush = X_test[y_test == 3]

# X_test was already rescaled in the clustering cell, so only flatten here;
# dividing by 255 again would put these rows on a different scale.
X_bush_norm = X_bush.reshape(len(X_bush), -1)
# show the size of the selection rather than dumping every pixel value
print(X_bush.shape)


In [17]:
#Task 4 Clustering ###################################################################
# X_bush is cut from X_test, which is already rescaled; dividing by 255 again would
# put these rows on a different scale from the data `kmeans` was fitted on.
X_bush = X_bush.reshape(len(X_bush), -1)


predicted = kmeans.predict(X_bush)

# Translate cluster ids into person labels via the majority true label of each cluster.
# `kmeans` was last fitted on X_test, so its labels_ line up with y_test; fail loudly
# if the cells were executed in an order that breaks this.
assert len(kmeans.labels_) == len(y_test), "kmeans was not fitted on X_test"
cluster_to_label = {
    c: np.bincount(y_test[kmeans.labels_ == c]).argmax()
    for c in np.unique(kmeans.labels_)
}
np.array([cluster_to_label[c] for c in predicted])

