In [1]:
pip install -q faiss-gpu

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os, gc
import faiss
from tqdm.auto import tqdm
import time

In [3]:
# Create the FAISS GPU resources object; it is passed to
# faiss.index_cpu_to_gpu when the index is moved onto GPU 0.
res = faiss.StandardGpuResources()

In [4]:
# Load the F0 training feature matrix and display its shape.
feats0 = np.load("/kaggle/input/sentencetf-new/train_mpnet_basev2_F0_feats.npy")
feats0.shape

(351208, 768)

In [5]:
# Load the F1 training feature matrix and display its shape.
feats1 = np.load("/kaggle/input/sentencetf-new/train_mpnet_basev2_F1_feats.npy")
feats1.shape

(351209, 768)

In [6]:
# Stack the two feature matrices row-wise (F0 rows first, then F1 rows).
feats = np.vstack((feats0, feats1))
feats.shape

(702417, 768)

In [7]:
# The per-file arrays are now duplicated inside `feats`; drop them to free memory.
del feats0, feats1
gc.collect()

121

In [8]:
# Feature dimensionality (number of columns of `feats`); used to size the FAISS index.
d = feats.shape[1]

In [9]:
# Read only the two test columns used in this notebook, then count missing values per column.
test_df = pd.read_parquet('/kaggle/input/xlm-roberta-embeddings-new/test.parquet', columns=['PRODUCT_ID', 'TITLE'])
test_df.isna().sum()

PRODUCT_ID    0
TITLE         5
dtype: int64

In [10]:
# Load the test feature matrix as float32 and display its shape.
test_feats = np.load('/kaggle/input/sentencetf-new/test_mpnet_basev2_feats.npy').astype(np.float32)
test_feats.shape

(734736, 768)

In [11]:
# Sanity check: the row count should equal that of test_feats, because test_feats
# is later indexed by test_df row position.
test_df.shape

(734736, 2)

In [12]:
# Load both training fold files with the same column subset and combine them.
fold_columns = ['index', 'PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE']
df0 = pd.read_parquet('/kaggle/input/amzz-best-folds/train0.parquet', columns=fold_columns)
df1 = pd.read_parquet('/kaggle/input/amzz-best-folds/train1.parquet', columns=fold_columns)

# reset_index() keeps the old row labels as an extra column (named `level_0`,
# since a column called `index` already exists).
tr_df = pd.concat([df0, df1]).reset_index()

# The per-fold frames are no longer needed once concatenated.
del df0, df1
gc.collect()
tr_df.shape

(702420, 5)

In [13]:
# Keep only the training rows that have a TITLE.
tr_df = tr_df[tr_df['TITLE'].notna()]

In [14]:
# Confirm no missing values remain in any training column.
tr_df.isna().sum()

level_0           0
index             0
PRODUCT_ID        0
PRODUCT_LENGTH    0
TITLE             0
dtype: int64

In [15]:
# Keep only rows whose PRODUCT_LENGTH is below 1000. The same positional mask is
# applied to the feature matrix and the frame so they stay row-aligned.
keep_rows = (tr_df['PRODUCT_LENGTH'] < 1000).to_numpy()
feats = feats[keep_rows]
tr_df = tr_df[keep_rows]

For cosine similarity with FAISS, refer to this link: https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

https://www.kaggle.com/code/dailysergey/howtodata-use-faiss-for-similarity-search-cpu/notebook

https://github.com/facebookresearch/faiss/wiki/Running-on-GPUs

In [16]:
# Number of inverted lists (coarse clusters) the indexed vectors are partitioned into.
nlist = 1000
# Coarse quantizer compares vectors by inner product. The features are L2-normalised
# before train/add/search, so inner product equals cosine similarity.
quantizer = faiss.IndexFlatIP(d)
# IndexIVFFlat defaults to METRIC_L2; pass the inner-product metric explicitly so the
# inverted-list search uses the same similarity as the quantizer.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
# Move the (still untrained) index onto GPU 0.
index = faiss.index_cpu_to_gpu(res, 0, index)

In [17]:
# Scale every training feature row to unit L2 norm.
from sklearn.preprocessing import normalize
normed_feat = normalize(feats, axis=1, norm='l2')

In [18]:
# Rebind `feats` to the normalised matrix and drop the extra name so the
# un-normalised array can be garbage-collected.
feats = normed_feat
del normed_feat
gc.collect()

21

In [19]:
# Scale every test feature row to unit L2 norm, matching the training features.
test_feats = normalize(test_feats, axis=1, norm='l2')

In [20]:
# Train the index on the training features. copy=False avoids duplicating the
# whole matrix when it is already float32.
index.train(feats.astype(np.float32, copy=False))

In [21]:
# Add the training features to the index. copy=False avoids duplicating the whole
# matrix when it is already float32.
index.add(feats.astype(np.float32, copy=False))
# Number of vectors now stored in the index.
index.ntotal

546862

In [22]:
# The two row counts must match: neighbour ids returned by the index are used as
# row positions into tr_df.
print(feats.shape, tr_df.shape)

(546862, 768) (546862, 5)


In [23]:
# Set the index's nprobe search parameter (of the nlist = 1000 lists configured above).
index.nprobe =10

In [24]:
# Number of neighbours retrieved per test row.
k = 10
# Frame keyed by test PRODUCT_ID; a `values` column is attached in a later cell.
results = pd.DataFrame({'PRODUCT_ID': test_df['PRODUCT_ID']})
# Collects one array of neighbour PRODUCT_LENGTH values per test row.
result_values = []

In [25]:
# Output frame for the mean-of-neighbours prediction, initialised to zeros.
results_mean = pd.DataFrame({
    'PRODUCT_ID': test_df['PRODUCT_ID'],
    'PRODUCT_LENGTH': np.zeros(test_df.shape[0]),
})

In [26]:
# Output frame for the min-of-neighbours prediction, initialised to zeros.
results_min = pd.DataFrame({
    'PRODUCT_ID': test_df['PRODUCT_ID'],
    'PRODUCT_LENGTH': np.zeros(test_df.shape[0]),
})

In [27]:
# Output frame for the median-of-neighbours prediction, initialised to zeros.
results_median = pd.DataFrame({
    'PRODUCT_ID': test_df['PRODUCT_ID'],
    'PRODUCT_LENGTH': np.zeros(test_df.shape[0]),
})

In [28]:
# Output frame for the value-weighted prediction, initialised to zeros.
results_wt = pd.DataFrame({
    'PRODUCT_ID': test_df['PRODUCT_ID'],
    'PRODUCT_LENGTH': np.zeros(test_df.shape[0]),
})

In [29]:
# Output frame for the priority-weighted prediction, initialised to zeros.
results_pr = pd.DataFrame({
    'PRODUCT_ID': test_df['PRODUCT_ID'],
    'PRODUCT_LENGTH': np.zeros(test_df.shape[0]),
})

In [30]:
def get_value_weighted_score(arr, k):
    """Rank-weighted average of ``arr`` that favours its smallest values.

    The values are sorted in ascending order; the smallest gets weight ``k + 1``
    and each following value a weight one lower. The weighted values are divided
    by the total of the weights used and summed.

    Args:
        arr: NumPy array of values; written for 1-D input.
        k: Integer; the smallest value receives weight ``k + 1``.

    Returns:
        The weighted average as a NumPy scalar.
    """
    weighted = np.zeros(arr.shape)
    ordered = np.sort(arr)
    # Weights count down from k + 1, one per element.
    weights = range(k + 1, k + 1 - ordered.shape[0], -1)
    for pos, weight in enumerate(weights):
        weighted[pos] = ordered[pos] * weight
    # Dividing by the weight total turns the weighted sum into a weighted mean.
    return np.sum(weighted / sum(weights))

In [31]:
def get_priority_weighted_score(arr, k):
    """Position-weighted average of ``arr`` that favours its leading entries.

    The first element gets weight ``k + 1`` and each following element a weight
    one lower, in the order given (no sorting). The weighted values are divided
    by the total of the weights used and summed.

    Args:
        arr: NumPy array of values; written for 1-D input.
        k: Integer; the first element receives weight ``k + 1``.

    Returns:
        The weighted average as a NumPy scalar.
    """
    weighted = np.zeros(arr.shape)
    # Weights count down from k + 1, one per element.
    weights = range(k + 1, k + 1 - arr.shape[0], -1)
    for pos, weight in enumerate(weights):
        weighted[pos] = arr[pos] * weight
    # Dividing by the weight total turns the weighted sum into a weighted mean.
    return np.sum(weighted / sum(weights))

In [32]:
# Re-check missing values in the test frame; rows with a missing TITLE get a
# fallback prediction in the search loop.
test_df.isna().sum()

PRODUCT_ID    0
TITLE         5
dtype: int64

In [33]:
%%time
from tqdm.auto import tqdm
k = 10
verbose = False
log_step = 10000
rows = test_df.shape[0]
result_values = []
for row in tqdm(range(rows)):
    start = time.time()
    if row % log_step == 0:
        print(f'row number: {row}')
    if test_df.iloc[row]['TITLE'] is not np.NaN:
        if verbose:
            print(f'------------------{row}--------')
        xq = np.array(test_feats[row]).astype(np.float32)
        D, I = index.search(np.array([xq]), k)
        if verbose:
            print(f"Test title: {test_df.iloc[row]['TITLE']}")
            print(f'Train titles:-')
            print([f"{i}: {tr_df.iloc[i]['TITLE']}" for i in I[0]])
        values = np.array([tr_df.iloc[i]['PRODUCT_LENGTH'] for i in I[0]])
        mean = np.mean(values, axis=0)
#         median = np.median(values, axis=0)
#         minm = np.min(values, axis=0)
#         wt = get_value_weighted_score(values, k)
#         pr = get_priority_weighted_score(values, k)
        if verbose:
            print('--------------------------------')
    else:
        mean = 600
#         median = 600
#         minm = 600
#         wt = 600
    results_mean.loc[row, 'PRODUCT_LENGTH'] = mean
#     results_min.loc[row, 'PRODUCT_LENGTH'] = minm
#     results_median.loc[row, 'PRODUCT_LENGTH'] = median
#     results_wt.loc[row, 'PRODUCT_LENGTH'] = wt
#     results_pr.loc[row, 'PRODUCT_LENGTH'] = pr
    result_values.append(values)
    end = time.time()
#     print(f'Time needed = {(end-start)*test_feats.shape[0]}')

  0%|          | 0/734736 [00:00<?, ?it/s]

row number: 0
row number: 10000
row number: 20000
row number: 30000
row number: 40000
row number: 50000
row number: 60000
row number: 70000
row number: 80000
row number: 90000
row number: 100000
row number: 110000
row number: 120000
row number: 130000
row number: 140000
row number: 150000
row number: 160000
row number: 170000
row number: 180000
row number: 190000
row number: 200000
row number: 210000
row number: 220000
row number: 230000
row number: 240000
row number: 250000
row number: 260000
row number: 270000
row number: 280000
row number: 290000
row number: 300000
row number: 310000
row number: 320000
row number: 330000
row number: 340000
row number: 350000
row number: 360000
row number: 370000
row number: 380000
row number: 390000
row number: 400000
row number: 410000
row number: 420000
row number: 430000
row number: 440000
row number: 450000
row number: 460000
row number: 470000
row number: 480000
row number: 490000
row number: 500000
row number: 510000
row number: 520000
row num

In [34]:
# Write the mean-of-neighbours predictions to CSV without the frame index.
results_mean.to_csv('results_mean.csv', index=False)

In [35]:
# Stack the per-row neighbour values into one array (one row per test row),
# print its shape, and save it to disk.
result_values_np = np.array(result_values)
print(result_values_np.shape)
np.save('result_values.npy', result_values_np)

(734736, 10)


In [36]:
# Attach each test row's neighbour values as a single column.
results['values'] = result_values

In [37]:
# Write PRODUCT_ID together with the neighbour values to CSV without the frame index.
results.to_csv('results.csv', index=False)

In [38]:
# results_mean.to_csv("submission_mean.csv", index=False)
# results_min.to_csv("submission_min.csv", index=False)
# results_median.to_csv("submission_median.csv", index=False)
# results_wt.to_csv("submission_wt.csv", index=False)
# results_pr.to_csv("submission_pr.csv", index=False)