In [1]:
import numpy as np
import pandas as pd

In [2]:
train   = pd.read_csv('../input/shopee-product-matching/train.csv')
test    = pd.read_csv('../input/shopee-product-matching/test.csv')

print(train.shape)
train.head(3)

(34250, 5)


Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891


## Evaluation Metics(F1 Score)

In [None]:
def f1_score(col):
    def score(row):
        intersection = len(np.intersect1d(row['target'], row[col]))
        return 2 * intersection / (len(row['target']) + len(row[col]))
    return score

In [3]:
pid_mapping = train.groupby('image_phash')['posting_id'].agg('unique').to_dict()
train['matches_phash'] = train['image_phash'].map(pid_mapping)

train['f1_phash'] = train.apply(f1_score('matches_phash'), axis=1)
print('Train F1 Score - method:pHash =', train['f1_phash'].mean())

Train F1 Score - method:pHash = 0.5530933399167943


## EfficientNet Embedding

In [4]:
import tensorflow as tf
import cv2

class DataLoader(tf.keras.utils.Sequence):
    def __init__(self, data, img_dim=256, batch=32, path='/'): 
        self.data = data
        self.img_dim = img_dim
        self.batch = batch
        self.path = path
        self.indexes = np.arange(len(self.data))
    
    def __getitem__(self, index):
        indexes = self.indexes[self.batch * index: self.batch * (index + 1)]
        X = self.__data_generation(indexes)
        return X
        
    def __data_generation(self, indexes): 
        X = np.zeros((len(indexes), self.img_dim, self.img_dim, 3),
                        dtype=np.float32)
        data = self.data.iloc[indexes]

        for i, (_, row) in enumerate(data.iterrows()):
            img = cv2.imread(self.path + row.image)
            X[i,] = cv2.resize(img, (self.img_dim, self.img_dim))
        return X

    def __len__(self):
        n_batch = len(self.data) // self.batch
        n_batch += int(((len(self.data)) % self.batch) != 0)
        return n_batch

In [None]:
import gc
from tensorflow.keras.applications import EfficientNetB0

embeds = []
CHUNK = 1024 * 4
n_chunk = len(train) // CHUNK
if len(train) % CHUNK != 0:
    n_chunk += 1

In [5]:
model = EfficientNetB0(weights='../input/effnetb0/efficientnetb0_notop.h5',
                       include_top=False,
                       pooling='avg',
                       input_shape=None,
                      )

print('Computing image embeddings...')
for i, j in enumerate(range(n_chunk)):
    a = j * CHUNK
    b = (j + 1) * CHUNK
    b = min(b, len(train))
    print('chunk', a, 'to', b)
    
    train_gen = DataLoader(data=train.iloc[a:b],
                           batch_size=32,
                           path='../input/shopee-product-matching/train_images/')
    image_embeddings = model.predict(train_gen, verbose=1)

    embeds.append(image_embeddings)
    
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)

print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 4096
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
image embeddings shape (34250, 1280)


## Clustering(Kmeans & DBSCAN)

In [6]:
from sklearn.cluster import KMeans

def kmeans_fit_match(train, embeddings, n_dims=200, n_clusters=50, max_n=50):
    train_0 = train.copy()
    embed_0 = embeddings.copy()
    
    print('fitting kmeans using {} samples and params: n_dims={}, n_clusters={}'\
        .format(len(train_0), n_dims, n_clusters))

    kmeans = KMeans(n_clusters).fit(embed_0[:, :n_dims])
    train_0['clusters'] = kmeans.labels_

    clustered = (train_0['clusters'] != -1)
    pid_mapping = train_0.loc[clustered].groupby('clusters')['posting_id']\
                    .agg('unique').to_dict()
    pid_mapping[-1] = []
    for key in pid_mapping:
        if len(pid_mapping[key]) > max_n:
            pid_mapping[key] = pid_mapping[key][:max_n]
    train_0['matches_kmeans'] = train_0['clusters'].map(pid_mapping)\
                                        .apply(match_self, axis=1)
    
    train_0['f1_kmeans'] = train_0.apply(f1_score('matches_kmeans'), axis=1)
    return db, train_0['clusters'], train_0['matches_kmeans'], train_0['f1_kmeans']

def match_self(row):
    if row['posting_id'] not in row['matches_kmeans']:
        return [row['posting_id']] + row['matches_kmeans']
    else:
        return row['matches_kmeans']

def combine_for_cv(row, match_cols):
    x = np.concatenate([row[col] for col in match_cols])
    return np.unique(x)

def combine_for_sub(row, match_cols):
    x = np.concatenate([row[col] for col in match_cols])
    return ' '.join(np.unique(x))

In [7]:
from sklearn.cluster import DBSCAN

def dbscan_fit_match(train, embeddings, n_dims=200, eps=2, min_samp=2, max_n=50):
    train_0 = train.copy()
    embed_0 = embeddings.copy()
    
    print('fitting dbscan using {} samples and params: n_dims={}, eps={}'\
        .format(len(train_0), n_dims, eps))

    db = DBSCAN(eps=eps, 
                min_samples=min_samp, 
                metric='euclidean', 
                n_jobs=-1).fit(embed_0[:, :n_dims])
    train_0['clusters'] = db.labels_

    clustered = (train_0['clusters'] != -1)
    pid_mapping = train_0.loc[clustered].groupby('clusters')['posting_id']\
            .agg('unique').to_dict()
    pid_mapping[-1] = []
    for key in pid_mapping:
        if len(pid_mapping[key]) > max_n:
            pid_mapping[key] = pid_mapping[key][:max_n]
    train_0['matches_dbscan'] = train_0['clusters'].map(pid_mapping)\
                                    .apply(match_self, axis=1)
    
    train_0['f1_dbscan'] = train_0.apply(f1_score('matches_dbscan'), axis=1)
    return db, train_0['clusters'], train_0['matches_dbscan'], train_0['f1_dbscan']

### Optimize eps param on euclidean distance

In [8]:
sample_labels = pd.Series(train['label_group'].unique())\
                    .sample(frac=.1, random_state=10)
train_sample = train[train['label_group'].isin(sample_labels)]
image_embeddings_sample = image_embeddings[train_sample.index]

In [9]:
from tqdm.notebook import tqdm

n_dims = 1280
eps_range = np.arange(1, 11, 1)
min_samp = 2

opt_matrix = []
for eps in tqdm(eps_range, total=len(eps_range)):
    print('fitting dbscan w/ eps={}...'.format(round(eps, 3)))
    _, clusters, train_sample['matches_dbscan'], train_sample['f1_dbscan'] = dbscan_fit_match(train_sample, image_embeddings_sample, n_dims=n_dims, eps=eps, min_samp=min_samp, show_vc=False, verbose=False, metric='euclidean')

    train_sample['matches'] = train_sample.apply(combine_for_cv, 
                                                 axis=1, 
                                                 match_cols=['matches_phash', 'matches_dbscan'])
    train_sample['f1_combined'] = train_sample.apply(f1_score('matches'),axis=1)
    opt_row = [n_dims,
               eps,
               clusters.value_counts(),
               train_sample['f1_phash'].mean(), 
               train_sample['f1_dbscan'].mean(), 
               train_sample['f1_combined'].mean()]
    opt_matrix.append(opt_row)
    
opt_df = pd.DataFrame(opt_matrix, columns=['n_dims', 'eps', 'counts', 'f1_phash', 'f1_dbscan', 'f1_combined'])
display(opt_df[['n_dims', 'eps', 'f1_phash', 'f1_dbscan', 'f1_combined']])

print('best dbscan f1 score = {}'.format(opt_df['f1_dbscan'].max()))
display(opt_df.sort_values(by='f1_dbscan', ascending=False)['counts'].iloc[0])

  0%|          | 0/10 [00:00<?, ?it/s]

fitting dbscan w/ eps=1...
fitting dbscan w/ eps=2...
fitting dbscan w/ eps=3...
fitting dbscan w/ eps=4...
fitting dbscan w/ eps=5...
fitting dbscan w/ eps=6...
fitting dbscan w/ eps=7...
fitting dbscan w/ eps=8...
fitting dbscan w/ eps=9...
fitting dbscan w/ eps=10...


Unnamed: 0,n_dims,eps,f1_phash,f1_dbscan,f1_combined
0,1280,1,0.539179,0.506883,0.543739
1,1280,2,0.539179,0.542535,0.553886
2,1280,3,0.539179,0.571889,0.573918
3,1280,4,0.539179,0.597058,0.596882
4,1280,5,0.539179,0.615709,0.614902
5,1280,6,0.539179,0.638603,0.637546
6,1280,7,0.539179,0.651855,0.650711
7,1280,8,0.539179,0.633975,0.634352
8,1280,9,0.539179,0.594504,0.598462
9,1280,10,0.539179,0.512375,0.522015


best dbscan f1 score = 0.651854747805969


-1      1762
 134      29
 288      23
 34       21
 137      18
        ... 
 89        2
 91        2
 103       2
 113       2
 0         2
Name: clusters, Length: 547, dtype: int64

In [11]:
n_dims=1280
n_clusters_lst = np.arange(20, 60, 5)

opt_matrix = []

for n_cluster in tqdm(n_clusters_lst, total=len(n_clusters_lst)):
    print('fitting kmeans w/ n_cluster={}...'.format(round(n_cluster, 3)))
    _, clusters, train_sample['matches_kmeans'], train_sample['f1_kmeans'] = kmeans_fit_match(train_sample,
                                                                                                image_embeddings_sample,
                                                                                                n_dims=n_dims,
                                                                                                n_clusters=n_cluster,
                                                                                                show_vc=False,
                                                                                                verbose=False)

    # combine dbscan matches and pHash matches
    train_sample['matches'] = train_sample.apply(combine_for_cv, 
                                                 axis=1, 
                                                 match_cols=['matches_phash', 'matches_kmeans'])
    train_sample['f1_combined'] = train_sample.apply(f1_score('matches'),axis=1)
    opt_row = [n_cluster,
                clusters.value_counts(),
                train_sample['f1_kmeans'].mean(), 
                train_sample['f1_combined'].mean()]
    opt_matrix.append(opt_row)

opt_df = pd.DataFrame(opt_matrix, columns=['n_cluster', 'counts', 'f1_kmeans', 'f1_combined'])
display(opt_df[['n_cluster', 'counts', 'f1_kmeans', 'f1_combined']])
print('best dbscan f1 score = {}'.format(opt_df['f1_kmeans'].max()))
display(opt_df.sort_values(by='f1_kmeans', ascending=False)['counts'].iloc[0])

  0%|          | 0/8 [00:00<?, ?it/s]

fitting kmeans w/ n_cluster=20...
fitting kmeans w/ n_cluster=25...
fitting kmeans w/ n_cluster=30...
fitting kmeans w/ n_cluster=35...
fitting kmeans w/ n_cluster=40...
fitting kmeans w/ n_cluster=45...
fitting kmeans w/ n_cluster=50...
fitting kmeans w/ n_cluster=55...


Unnamed: 0,n_cluster,counts,f1_kmeans,f1_combined
0,20,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
1,25,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
2,30,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
3,35,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
4,40,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
5,45,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
6,50,-1 1003  1 789  18 146  0 ...,0.512375,0.522015
7,55,-1 1003  1 789  18 146  0 ...,0.512375,0.522015


best dbscan f1 score = 0.5123750414148504


-1      1003
 1       789
 18      146
 0        42
 33       33
        ... 
 51        2
 53        2
 55        2
 57        2
 445       2
Name: clusters, Length: 447, dtype: int64