In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))
display(HTML("<style>.output_result { max-width:75% !important; }</style>"))
display(HTML("<style>.prompt { display:none !important; }</style>"))

In [7]:
import numpy as np
import pandas as pd

from sklearn.metrics import top_k_accuracy_score
from tqdm.notebook import tqdm
from scipy import spatial

import math

In [6]:
train_embeddings = np.load('MFF-pytorch/train_embeddings.npy')
train_embeddings_lbls = np.load('MFF-pytorch/train_embeddings_lbls.npy')

val_embeddings = np.load('MFF-pytorch/val_embeddings.npy')
val_embeddings_lbls = np.load('MFF-pytorch/val_embeddings_lbls.npy')

print(f'train_embeddings shape: {train_embeddings.shape}, val_embeddings shape: {val_embeddings.shape}')
print(f'train_embeddings dtype: {train_embeddings.dtype}, val_embeddings dtype: {val_embeddings.dtype}')
print(f'train_embeddings_lbls shape: {train_embeddings_lbls.shape}, val_embeddings_lbls shape: {val_embeddings_lbls.shape}')
print(f'train_embeddings_lbls dtype: {train_embeddings_lbls.dtype}, val_embeddings_lbls dtype: {val_embeddings_lbls.dtype}')

train_embeddings shape: (300, 512), val_embeddings shape: (154, 512)
train_embeddings dtype: float32, val_embeddings dtype: float32
train_embeddings_lbls shape: (300,), val_embeddings_lbls shape: (154,)
train_embeddings_lbls dtype: int64, val_embeddings_lbls dtype: int64


In [13]:
print('### TRAIN EMBEDDINGS STATISTICS ###')
pd.Series(train_embeddings.flatten()).describe()

### TRAIN EMBEDDINGS STATISTICS ###


count    153600.000000
mean          0.097199
std           0.460197
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           8.923176
dtype: float64

In [15]:
print('### VALIDATION EMBEDDINGS STATISTICS ###')
display(pd.Series(train_embeddings.flatten()).describe())

### VALIDATION EMBEDDINGS STATISTICS ###


count    153600.000000
mean          0.097199
std           0.460197
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           8.923176
dtype: float64

In [19]:
def predict(query):
    predictions = []
    for embedding, lbl in zip(train_embeddings, train_embeddings_lbls):
        similarity = 1 - spatial.distance.cosine(embedding, query)
        predictions.append((similarity, lbl))
    predictions_sorted = sorted(predictions, key=lambda elem: elem[1])
    predictions_sorted = [sim for sim, _ in predictions_sorted]
    
    return predictions_sorted

def get_predictions():
    res = []
    res_lbls = []
    for query, query_lbl in zip(tqdm(val_embeddings), val_embeddings_lbls):
        prediction = predict(query)
        res.append(prediction)
        res_lbls.append(query_lbl)
        
    return np.array(res), np.array(res_lbls)
        

# y_score, y_true = get_predictions()

print(f'y_score shape: {y_score.shape}, y_true shape: {y_true.shape}')

top_k_accuracy_score(y_true, y_score, k=20, labels=np.arange(300))

y_score shape: (154, 300), y_true shape: (154,)


0.15584415584415584

In [66]:
_y_true = np.array([0, 1, 2, 2])
_y_score = np.array([[0.5, 0.2, 0.2],  # 0 is in top 2
                     [0.3, 0.4, 0.2],  # 1 is in top 2
                     [0.2, 0.4, 0.3],  # 2 is in top 2
                     [0.7, 0.2, 0.1]]) # 2 isn't in top 2
print(f'_y_score shape: {_y_score.shape}, _y_true shape: {_y_true.shape}')
top_k_accuracy_score(_y_true, _y_score, k=2)

_y_score shape: (4, 3), _y_true shape: (4,)


0.75

In [102]:
1 - spatial.distance.cosine([1,0], [1,0])

1.0

In [27]:
from sklearn.cluster import KMeans
import numpy as np
X = np.array([
    tuple([1,2]),
    tuple([5,2]),
    tuple([1,3]),
    tuple([7,2]),
])

kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_

array([0, 1, 0, 1])