## Imports

In [16]:
import pandas as pd
import numpy as np
import pickle
from sklearn.neighbors import NearestNeighbors;
import sklearn
from tqdm import tqdm
import json

## ID Mapper

In [110]:
# Run configuration for the ensemble / kNN inference below.
cropped = True             # picks which id-mapping JSON file gets loaded
knn = 100                  # number of neighbours requested from the kNN index
ensemble_mode = 'concat'   # 'concat' uses concatenated embeddings; any other value uses the mean
normalize = True           # whether each embedding matrix is passed through normalized()
model = 'effnetv1_b7_backfin_train_'   # file prefix of the model whose dumps also supply the targets
# File prefixes of the additional models to ensemble; may be None to use only `model`.
model_2 = [
    'effnetv1_b7_backfin_s20m2_',
    'effnetv1_b5_subc_backfin_',
    'effnetv1_b5_768fullbody_s20m2_',
    'effnetv1_b7_600fullbody_s20m2_',
]
# Total number of model prefixes. Derived rather than hard-coded so that it
# cannot drift out of sync with the lists above (a mismatch would make the
# later `embeddings_{i}` column lookups fail or silently skip models).
models = 1 + (len(model_2) if model_2 is not None else 0)

# 'effnetv1_b7_backfin_train_', effnetv1_b7_backfin_s20m2_ , effnetv1_b5_subc_backfin_
# effnetv1_b5_768crop_s20m2_ , effnetv1_b7_cropped_
#  effnetv1_b5_768fullbody_s20m2_ , effnetv1_b7_600fullbody_s20m2_

# Load the lookup used later to turn predicted target values into id strings.
# The "cropped" file is inverted after loading (its values become the keys),
# so it is presumably stored as {name: integer id} -- verify against the file.
# The other file is used exactly as loaded.
mapping_path = './individual_ids.json' if cropped else './int2str.json'
with open(mapping_path, "r") as f:
    int2str = json.load(f)
if cropped:
    int2str = {value: key for key, value in int2str.items()}

def normalized(a, axis=-1, order=2):
    """Scale `a` so that its `order`-norm along `axis` is 1.

    Slices whose norm is exactly zero are left unchanged (they are divided
    by 1 instead of 0). The norm is promoted to at least 1-D before being
    re-expanded along `axis`, so a 1-D input comes back with an extra
    leading dimension.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    norms = np.atleast_1d(norms)
    # Swap zero norms for 1 to avoid dividing by zero.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

## Merging Embeddings

In [111]:
N_FOLDS = 5  # number of per-fold dump files read for every model prefix


def _load_fold_frames(prefix, fold, column_idx, with_targets=False):
    """Read one embedding dump and wrap it in two DataFrames indexed by id.

    The file `./embeddings/{prefix}{fold}.npy` is read as five consecutive
    arrays: train ids, train embeddings, train targets, test ids and test
    embeddings.

    Args:
        prefix: model file prefix.
        fold: fold number appended to the prefix.
        column_idx: suffix used for the `embeddings_{...}` (and optional
            `targets_{...}`) column names, so frames can be joined later.
        with_targets: also keep the train targets as a column.

    Returns:
        (train_frame, test_frame); each embedding cell holds one row of the
        embedding matrix as its own array.
    """
    with open(f'./embeddings/{prefix}{fold}.npy', 'rb') as fh:
        # The arrays are read back-to-back from the same handle, in this order.
        train_ids = np.load(fh)
        train_embeddings = np.load(fh)
        train_targets = np.load(fh)
        test_ids = np.load(fh)
        test_embeddings = np.load(fh)

    train_columns = {'ids': train_ids}
    if with_targets:
        train_columns[f'targets_{column_idx}'] = train_targets
    # array_split into shape[0] pieces yields one piece per row.
    train_columns[f'embeddings_{column_idx}'] = np.array_split(train_embeddings, train_embeddings.shape[0])
    train_frame = pd.DataFrame(train_columns)
    train_frame.set_index('ids', inplace=True)

    test_frame = pd.DataFrame({'ids': test_ids, f'embeddings_{column_idx}': np.array_split(test_embeddings, test_embeddings.shape[0])})
    test_frame.set_index('ids', inplace=True)
    return train_frame, test_frame


df_list_train = []
df_list_test = []

# The primary model comes first and is the only one whose targets are kept;
# the extra models follow in list order. Columns are numbered
# fold + N_FOLDS * (position of the model in this list).
all_prefixes = [model] + (list(model_2) if model_2 is not None else [])
for model_pos, prefix in enumerate(all_prefixes):
    for fold in range(N_FOLDS):
        fold_train, fold_test = _load_fold_frames(
            prefix, fold, fold + N_FOLDS * model_pos, with_targets=(model_pos == 0)
        )
        df_list_train.append(fold_train)
        df_list_test.append(fold_test)

# Align every fold/model on the id index.
df_train = df_list_train[0].join(df_list_train[1:])
# Only targets_0 is used downstream; drop the other folds' target columns.
df_train.drop([f'targets_{fold}' for fold in range(1, N_FOLDS)], axis=1, inplace=True)
df_test = df_list_test[0].join(df_list_test[1:])

In [112]:
# Pull ids and targets of the joined train frame back out as flat arrays.
train_ids = np.squeeze(np.vstack(df_train.index.to_numpy()))
train_targets = np.squeeze(np.vstack(df_train['targets_0'].to_numpy()))

# Rebuild one matrix per embeddings_{k} column by stacking its per-row arrays.
# NOTE(review): normalized(..., 0) takes the norm along axis 0 of the stacked
# matrix, i.e. across rows rather than per row -- confirm this is intended.
train_embeddings = []
for column_no in range(5*models):
    stacked = np.vstack(df_train[f'embeddings_{column_no}'].to_numpy())
    train_embeddings.append(normalized(stacked, 0) if normalize else stacked)

# Two ensemble variants: matrices joined along axis 1, or their element-wise mean.
train_embeddings_concat = np.concatenate(train_embeddings, axis=1)
train_embeddings_mean = np.stack(train_embeddings).mean(axis=0)

In [113]:
# Pull the ids of the joined test frame back out as a flat array.
test_ids = np.squeeze(np.vstack(df_test.index.to_numpy()))

# Rebuild one matrix per embeddings_{i} column by stacking its per-row arrays.
# NOTE(review): normalized(..., 0) takes the norm along axis 0 of the stacked
# matrix, i.e. across rows rather than per row -- confirm this is intended.
test_embeddings = []
for i in range(5*models):
    embeds = np.vstack(df_test[f'embeddings_{i}'].to_numpy())
    if normalize:
        embeds = normalized(embeds, 0)
    test_embeddings.append(embeds)

# Two ensemble variants, named the same way as their train counterparts.
test_embeddings_concat = np.concatenate(test_embeddings, axis=1)
test_embeddings_mean = np.mean(np.stack(test_embeddings), axis=0)
# Existing code reads the mean through `test_embeddings`, so keep that binding.
test_embeddings = test_embeddings_mean

## Inference

In [114]:
# Choose which ensemble variant feeds the nearest-neighbour search.
if ensemble_mode == 'concat':
    train_embeddings = train_embeddings_concat
    test_embeddings = test_embeddings_concat
else:
    train_embeddings = train_embeddings_mean
    # `test_embeddings` was already bound to the mean of the test matrices
    # when they were aggregated, so there is nothing to rebind here. (The
    # previous `train_targets = train_targets` self-assignment was a no-op.)

In [None]:
# Sanity check: show the shapes going into the neighbour search
# (train embeddings + targets on one line, test embeddings on the next).
for shapes in ((train_embeddings.shape, train_targets.shape), (test_embeddings.shape,)):
    print(*shapes)

In [None]:
# Index the train embeddings for cosine-distance neighbour lookups;
# `knn` is the default number of neighbours returned per query.
nn_params = {'n_neighbors': knn, 'metric': 'cosine'}
neigh = NearestNeighbors(**nn_params)
neigh.fit(train_embeddings)

In [None]:
# Look up the `knn` nearest train embeddings for every test embedding.
# kneighbors accepts the whole query matrix at once and returns two arrays
# with one row per query, so the per-row Python loop (one call and one
# (1, knn) array per test image, concatenated afterwards) is unnecessary.
test_nn_distances, test_nn_idxs = neigh.kneighbors(
    test_embeddings, n_neighbors=knn, return_distance=True
)

In [None]:
# Build the long-format candidate table: one row per (test image, neighbour).
# Constructed in a single vectorised step instead of one small DataFrame per
# image; row order (image by image, neighbours in returned order) is the same.
neighbours_per_image = test_nn_idxs.shape[1]
test_df = pd.DataFrame({
    'image': np.repeat(test_ids, neighbours_per_image),
    # Keep the targets in their own dtype rather than coercing them to float
    # by stacking them together with the distances.
    'target': train_targets[test_nn_idxs].ravel(),
    # float64 keeps the confidence values identical to the previous
    # stack-based construction, which upcast the distances.
    'distances': test_nn_distances.astype(np.float64).ravel(),
})
test_df['confidence'] = 1-test_df['distances']
# Several neighbours can share a target; keep the best confidence per pair.
test_df = test_df.groupby(['image','target']).confidence.max().reset_index()
# Highest confidence first: the prediction step relies on this order.
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True)
# Translate target values into id strings via the loaded mapping.
test_df['target'] = test_df['target'].map(int2str)
# Notebook display: distribution of the number of candidates per image.
test_df.image.value_counts().value_counts()

In [119]:
# Fallback ids used to pad a prediction list that ends up with fewer than five entries.
sample_list = [
    '938b7e931166',
    '5bf17305f073',
    '7593d2aee842',
    '7362d7a01d00',
    '956562ff2888',
]

In [None]:
# Confidence at or below this value ranks 'new_individual' ahead of the match.
threshold = 0.55

# Collect up to five ranked guesses per image. Rows are consumed in the
# existing order of test_df, so the first row seen for an image decides
# whether 'new_individual' is placed first or second.
predictions = {}
for row in tqdm(test_df.itertuples(index=False), total=len(test_df)):
    ranked = predictions.get(row.image)
    if ranked is None:
        # First candidate for this image: seed the list with 'new_individual'
        # on the side of the threshold the candidate falls on.
        if row.confidence > threshold:
            predictions[row.image] = [row.target, 'new_individual']
        else:
            predictions[row.image] = ['new_individual', row.target]
        continue
    if len(ranked) == 5:
        continue
    if row.confidence <= threshold and 'new_individual' not in ranked:
        ranked.append('new_individual')
        if len(ranked) == 5:
            continue
    ranked.append(row.target)

# Pad short lists with fallback ids and serialise each list as one string.
for x in tqdm(predictions):
    ranked = predictions[x]
    if len(ranked) < 5:
        # Skip fallback ids this image already predicts. The membership test
        # must be against this image's list; testing against the dict of
        # image ids never excluded anything and allowed duplicate guesses.
        remaining = [y for y in sample_list if y not in ranked]
        ranked = (ranked + remaining)[:5]
    predictions[x] = ' '.join(ranked)

# Convert to the two-column submission layout and write it out.
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions.to_csv('submission.csv',index=False)
predictions.head()

# Merge Predictions for Backfin

In [None]:
# Previous submission, used as a fallback source of predictions.
df2 = pd.read_csv("submission_55.csv")
# Ids of whales without backfins, determined with a simple classifier.
# NOTE(review): allow_pickle=True unpickles the file contents -- only load
# this file from a trusted source.
ids = np.load("ids_without_backfin.npy", allow_pickle = True)
# Images present in the previous submission but missing from the new
# predictions (images without a bounding box).
ids2 = df2["image"][~df2["image"].isin(predictions["image"])]

flagged_in_new = predictions["image"].isin(ids)
merged_parts = [
    predictions[~flagged_in_new],      # new predictions, except flagged ids
    df2[df2["image"].isin(ids)],       # flagged ids: take the previous rows
    df2[df2["image"].isin(ids2)],      # images only the previous run covered
]
# The last two parts can overlap, so identical rows are removed.
submission = pd.concat(merged_parts).drop_duplicates()
submission.to_csv('submission.csv',index=False)
submission.head()