In [6]:
# install necessary packages to access google drive for colab
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

!mkdir -p drive
!google-drive-ocamlfuse drive


E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package libfuse2:amd64.
(Reading database ... 22280 files and directories currently installed.)
Preparing to unpack .../libfuse2_2.9.7-1ubuntu1_amd64.deb ...
Unpacking libfuse2:amd64 (2.9.7-1ubuntu1) ...
Selecting previously unselected package fuse.
Preparing to unpack .../fuse_2.9.7-1ubuntu1_amd64.deb ...
Unpacking fuse (2.9.7-1ubuntu1) ...
Selecting previously unselected package google-drive-ocamlfuse.
Preparing to unpack .../google-drive-ocamlfuse_0.7.0-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.0-0ubuntu1~ubuntu18.04.1) ...
Setting up libfuse2:amd64 (2.9.7-1ubuntu1) ...
Processing triggers for libc-bin (2.27-3ubuntu1) ...
Setting up fuse (2.9.7-1ubuntu1) ...
Setting up google-drive-ocamlfuse (0.7.0-0ubuntu1~ubuntu18.04.1) ...
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleus

In [7]:
! ls './drive/Colab Notebooks/5002'a

images	vgg16_features.csv


In [0]:
import numpy as np
import pandas as pd
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input as vgg16_input_pp
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input as vgg19_input_pp
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input as resnet50_input_pp
from keras.applications.mobilenet import MobileNet
from keras.applications.mobilenet import preprocess_input as mobilenet_input_pp
from keras.applications.densenet import DenseNet169
from keras.applications.densenet import preprocess_input as densenet_input_pp

from sklearn.cluster import KMeans, DBSCAN

import os
from sklearn.utils.random import sample_without_replacement
from sklearn.metrics import silhouette_samples, silhouette_score
from matplotlib import pyplot as plt
from matplotlib import cm

In [0]:
# Folder that receives the generated CSV files.
output_folder = './drive/Colab Notebooks/5002/'

# Folder the input images are read from.
# Alternative location used previously: './drive/Colab Notebooks/5002/images/'
image_folder = './drive/images/'

## Collect image file data

In [0]:
# Walk the image folder recursively and collect the path of every file found.
# Each path is built from `root` (the directory currently being visited) so that
# files located in sub-folders resolve correctly; joining with `image_folder`
# is only right for files that sit directly in the top-level folder.
image_paths = [
    os.path.join(root, file)
    for root, _folders, files in os.walk(image_folder)
    for file in files
]

In [0]:

def gen_img_features(fe_model, fe_input_prossesor, img_path_list, resample=False, sample_size=0.3):
    """Extract one flattened feature vector per image.

    Each image is loaded at 224x224, converted to an array, given a leading
    batch axis, run through ``fe_input_prossesor`` and then through
    ``fe_model.predict``. The prediction is flattened to one dimension so the
    result can be fed to KMeans.

    Parameters
    ----------
    fe_model : object
        Feature extractor exposing ``predict(batch)``.
    fe_input_prossesor : callable
        Preprocessing function applied to the single-image batch before
        ``fe_model.predict``.
    img_path_list : sequence of str
        Paths of the image files to process.
    resample : bool, default False
        When True, only a random subset of ``img_path_list`` (drawn without
        replacement) is processed.
    sample_size : float, default 0.3
        Fraction of ``img_path_list`` to draw when ``resample`` is True; the
        number of images is ``int(len(img_path_list) * sample_size)``.

    Returns
    -------
    pandas.DataFrame
        One row per processed image, indexed by image path, with integer
        column labels ``0..n_features-1``. An empty DataFrame is returned when
        there is nothing to process (empty input, or a sample that rounds
        down to zero images).
    """
    population_size = len(img_path_list)
    if resample and population_size:
        n_samples = int(population_size * sample_size)
        sample_index = sample_without_replacement(
            n_population=population_size, n_samples=n_samples, method='auto')
        paths = [img_path_list[i] for i in sample_index]
    else:
        paths = list(img_path_list)

    # Nothing to process: return early instead of failing on `shape[1]` below,
    # which does not exist for an empty feature array.
    if not paths:
        return pd.DataFrame(index=paths)

    img_features = []
    for count, img_path in enumerate(paths, start=1):
        print(f'{count}th image starts.')

        img_data = image.load_img(img_path, target_size=(224, 224))
        img_data = image.img_to_array(img_data)
        # The model expects a batch, so add a leading axis before preprocessing.
        img_data = fe_input_prossesor(np.expand_dims(img_data, axis=0))

        features = fe_model.predict(img_data)
        # Collapse the features into one dimension for KMeans.
        img_features.append(np.array(features).flatten())

    img_features = np.array(img_features)
    return pd.DataFrame(
        img_features,
        columns=list(range(img_features.shape[1])),
        index=paths,
    )

## Evaluate pre-trained models

In [0]:
%%time
# to find the best K for KMeans

# generate dataset firstly

# fe_model = VGG16(weights='imagenet', include_top=False)
# fe_model.summary()
# fe_model = VGG19(weights='imagenet', include_top=False)
# fe_model.summary()
# fe_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
# fe_model.summary()
# fe_model = DenseNet169(weights='imagenet', include_top=False, pooling='avg')
# fe_model.summary()
# fe_model = MobileNet(weights='imagenet', include_top=False, pooling='avg')
# fe_model.summary()

img_df = gen_img_features(fe_model, mobilenet_input_pp, image_paths, resample=True, sample_size=0.2)

In [0]:
# Quick scan: average silhouette score of KMeans on the sampled features
# for a handful of cluster counts.
img_data = img_df.values

for n_clusters in (8, 9, 10):
    clt = KMeans(random_state=0, n_clusters=n_clusters).fit(img_data)
    labels = clt.labels_
    silhouette_scores = silhouette_score(img_data, labels)

    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_scores)

In [0]:
# Evaluate clustering quality for each candidate n_clusters.
# One figure per value: the left panel is the per-sample silhouette plot grouped
# by cluster, the right panel scatters the first two feature columns coloured by
# cluster label.

img_data = img_df.values

for n_clusters in [10, 11, 12, 13, 14, 15]:

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    ax1.set_xlim([-0.1, 1])
    # Extra vertical room for the 10-row gaps inserted between cluster bands.
    ax1.set_ylim([0, len(img_data) + (n_clusters + 1) * 10])

    clt = KMeans(n_clusters=n_clusters, random_state=0)
    labels = clt.fit_predict(img_data)

    # Average silhouette over all samples. Report the computed value, not the
    # `silhouette_score` function object.
    silhouette_scores = silhouette_score(img_data, labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_scores)

    # Silhouette coefficient of every individual sample.
    silhouette_values = silhouette_samples(img_data, labels)

    y_lower = 10
    for i in range(n_clusters):

        # Sorted coefficients of the samples assigned to cluster i.
        i_cluster_silhouette_values = silhouette_values[labels == i]
        i_cluster_silhouette_values.sort()
        i_cluster_size = i_cluster_silhouette_values.shape[0]
        y_upper = y_lower + i_cluster_size

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, i_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * i_cluster_size, str(i))
        # Leave a 10-row gap before the next cluster band.
        y_lower = y_upper + 10

    ax1.set_title("Silhouette plot for the various clusters.")
    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical reference line at the average silhouette score.
    ax1.axvline(x=silhouette_scores, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: samples and cluster centres in the first two feature columns.
    colors = cm.nipy_spectral(labels.astype(float) / n_clusters)
    ax2.scatter(img_data[:, 0], img_data[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')
    centers = clt.cluster_centers_

    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster number on top of its centre marker.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("Visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

## Apply the chosen strategy to the complete dataset

In [12]:
%%time
# model = VGG16(weights='imagenet', include_top=False)
# model.summary()
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
model.summary()


img_features = gen_img_features(model, resnet50_input_pp, image_paths, resample=False)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256     

In [0]:
# Persist the extracted features in the output folder.
img_features.to_csv(
    path_or_buf=os.path.join(output_folder, 'resnet_features.csv')
)

In [16]:
%%time
img_data = img_features.values

for n_clusters in [6, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20]:
    results = KMeans(n_clusters=n_clusters).fit_predict(img_data)

    avg_silhouette_score = silhouette_score(img_features.values, results)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", avg_silhouette_score)


For n_clusters = 12 The average silhouette_score is : 0.047678523
For n_clusters = 14 The average silhouette_score is : 0.043069534
For n_clusters = 16 The average silhouette_score is : 0.044690378
For n_clusters = 18 The average silhouette_score is : 0.04406558
For n_clusters = 20 The average silhouette_score is : 0.041952822
CPU times: user 1min 54s, sys: 8.39 s, total: 2min 2s
Wall time: 1min 36s


In [17]:
# Final clustering with 12 clusters. The seed is fixed (same value as the earlier
# experiments) so the labels written to disk do not change from run to run.
results = KMeans(n_clusters=12, random_state=0).fit_predict(img_data)

# NOTE: this adds the label column to img_features in place; `img_data` was
# captured before the column existed, so it still holds features only.
img_features['lable'] = results
# One-column frame: index = image path, 'lable' = assigned cluster id.
result = img_features[['lable']]

result.head()

Unnamed: 0,lable
./drive/images/04775.jpg,2
./drive/images/00275.jpg,4
./drive/images/04007.jpg,11
./drive/images/03768.jpg,1
./drive/images/00261.jpg,8


In [32]:
# Number of rows assigned to each cluster label.
(
    result
    .reset_index()
    .groupby('lable')
    .count()
)

Unnamed: 0_level_0,level_0,index
lable,Unnamed: 1_level_1,Unnamed: 2_level_1
0,266,266
1,537,537
2,209,209
3,570,570
4,445,445
5,655,655
6,279,279
7,508,508
8,383,383
9,378,378


## Output results

In [0]:
# Group image ids by cluster: {'Cluster <label + 1>': [image id, ...]}.
# `result` holds the image path in its index and the cluster id in its single
# 'lable' column, so both are read from there directly. Positional access via
# `result.values` only has one column and cannot supply the path.
result_dict = {}
for img_path, label in result['lable'].items():
    # Image id = file name without directory and without extension.
    image_id = img_path.split('/')[-1].split('.')[0]
    result_dict.setdefault(f'Cluster {label + 1}', []).append(image_id)

In [0]:
# Size of the largest cluster (0 when there are no clusters).
max_len = max(map(len, result_dict.values()), default=0)

In [0]:
# Pad every cluster list in place with empty strings up to the common length.
for value in result_dict.values():
    padding = [''] * (max_len - len(value))
    value.extend(padding)

In [0]:
# One column per cluster, 'Cluster 1' .. 'Cluster 12', in that order.
cluster_columns = [f'Cluster {i}' for i in range(1, 13)]
df = pd.DataFrame(
    {col: result_dict[col] for col in cluster_columns},
    columns=cluster_columns,
)

In [0]:
# Write the per-cluster table next to the other outputs.
df.to_csv(
    path_or_buf=os.path.join(output_folder, 'result.csv')
)

In [None]:
from xgboost import XGBClassifier