In [1]:
pip install -q faiss-gpu

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os, gc
import faiss
from tqdm.auto import tqdm
import time

In [3]:
# declare GPU resource to encapsulate chunk of GPU memory
res = faiss.StandardGpuResources()

In [4]:
feats0 = np.load("/kaggle/input/gpt2-embd-extraction/gpt2_features_0.npy")
feats1 = np.load("/kaggle/input/gpt2-embd-extraction/gpt2_features_1.npy")
feats2 = np.load("/kaggle/input/gpt2-embd-extraction/gpt2_features_2.npy")
feats3 = np.load("/kaggle/input/gpt2-embd-extraction/gpt2_features_3.npy")
feats4 = np.load("/kaggle/input/gpt2-embd-extraction/gpt2_features_4.npy")
feats = np.concatenate([feats0, feats1, feats2, feats3 ,feats4], axis=0)
del feats0,feats1,feats2,feats3 ,feats4
gc.collect()
feats.shape

(1756041, 768)

In [5]:
d = feats.shape[1]

In [6]:
test_df = pd.read_csv('/kaggle/input/amazon-ml/dataset/test.csv')
test_df = test_df[['PRODUCT_ID', 'TITLE']]
test_df.isna().sum()

PRODUCT_ID    0
TITLE         5
dtype: int64

In [7]:
test_feats = np.load('/kaggle/input/gpt2-embd-extraction/gpt2_test_features.npy').astype(np.float32)
test_feats.shape

(734731, 768)

In [8]:
test_df.shape

(734736, 2)

In [9]:
df0 = pd.read_parquet('/kaggle/input/amzz-best-folds/train0.parquet', columns=['index','PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE'])
df1 = pd.read_parquet('/kaggle/input/amzz-best-folds/train1.parquet', columns=['index','PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE'])
df2 = pd.read_parquet('/kaggle/input/amzz-best-folds/train2.parquet', columns=['index','PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE'])
df3 = pd.read_parquet('/kaggle/input/amzz-best-folds/train2.parquet', columns=['index','PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE'])
df4 = pd.read_parquet('/kaggle/input/amzz-best-folds/train2.parquet', columns=['index','PRODUCT_ID', 'PRODUCT_LENGTH', 'TITLE'])

tr_df = pd.concat([df0, df1, df2, df3, df4])
del df0, df1, df2, df3, df4
gc.collect()
tr_df = tr_df.reset_index()
tr_df.shape

(1756050, 5)

In [10]:
tr_df = tr_df.dropna(subset=['TITLE'])

In [11]:
print(tr_df.shape)
tr_df.isna().sum()

(1756041, 5)


level_0           0
index             0
PRODUCT_ID        0
PRODUCT_LENGTH    0
TITLE             0
dtype: int64

In [12]:
feats = feats[tr_df['PRODUCT_LENGTH'] < 1000]
tr_df = tr_df[tr_df['PRODUCT_LENGTH'] < 1000]

For Cosine Similarity, refer this link: https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

https://www.kaggle.com/code/dailysergey/howtodata-use-faiss-for-similarity-search-cpu/notebook

https://github.com/facebookresearch/faiss/wiki/Running-on-GPUs

In [13]:
nlist = 1000
# quantizer = faiss.IndexFlatL2(d)
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index = faiss.index_cpu_to_gpu(res, 0, index)

In [14]:
from sklearn.preprocessing import normalize
feats = normalize(feats, axis=1, norm='l2')

In [15]:
gc.collect()

24

In [16]:
test_feats = normalize(test_feats, axis=1, norm='l2')

In [17]:
index.train(feats)

In [18]:
index.add(feats)
index.ntotal

1367059

In [19]:
print(feats.shape, tr_df.shape)

(1367059, 768) (1367059, 5)


In [20]:
index.nprobe =10

In [21]:
k = 10
results = pd.DataFrame(columns=['PRODUCT_ID'])
results['PRODUCT_ID'] = test_df['PRODUCT_ID']
# results['values'] = np.zeros((test_df.shape[0]))
result_values = []

In [22]:
results_mean = pd.DataFrame(columns=['PRODUCT_ID'])
results_mean['PRODUCT_ID'] = test_df['PRODUCT_ID']
results_mean['PRODUCT_LENGTH'] = np.zeros((test_df.shape[0]))

In [23]:
def get_value_weighted_score(arr, k):
#     print(arr.shape)
    new_arr = np.zeros(arr.shape)
    tmp_k = k+1
    s = 0
    arr = np.sort(arr)
    for i in range(arr.shape[0]):
        val = arr[i]*tmp_k
        s += tmp_k
        new_arr[i] = val
        tmp_k -= 1
#         print(i, val, tmp_k, new_arr)
    return np.sum(new_arr/s)

In [24]:
def get_priority_weighted_score(arr, k):
#     print(arr.shape)
    new_arr = np.zeros(arr.shape)
    tmp_k = k+1
    s = 0
    for i in range(arr.shape[0]):
        val = arr[i]*tmp_k
        s += tmp_k
        new_arr[i] = val
        tmp_k -= 1
#         print(i, val, tmp_k, new_arr)
    return np.sum(new_arr/s)

In [25]:
test_df.isna().sum()

PRODUCT_ID    0
TITLE         5
dtype: int64

In [None]:
%%time
from tqdm.auto import tqdm
k = 10
verbose = False
log_step = 10000
rows = test_df.shape[0]
# tr_df = df0.copy()
result_values = []
for row in tqdm(range(rows)):
    start = time.time()
    if row % log_step == 0:
        print(f'row number: {row}')
    if test_df.iloc[row]['TITLE'] is not np.NaN:
        if verbose:
            print(f'------------------{row}--------')
        xq = np.array(test_feats[row]).astype(np.float32)
        D, I = index.search(np.array([xq]), k)
        if verbose:
            print(f"Test title: {test_df.iloc[row]['TITLE']}")
            print(f'Train titles:-')
            print([f"{i}: {tr_df.iloc[i]['TITLE']}" for i in I[0]])
        values = np.array([tr_df.iloc[i]['PRODUCT_LENGTH'] for i in I[0]])
        mean = np.mean(values, axis=0)
#         median = np.median(values, axis=0)
#         minm = np.min(values, axis=0)
#         wt = get_value_weighted_score(values, k)
#         pr = get_priority_weighted_score(values, k)
        if verbose:
            print('--------------------------------')
    else:
        mean = 600
        median = 600
        minm = 600
        wt = 600
    results_mean.loc[row, 'PRODUCT_LENGTH'] = mean
#     results_min.loc[row, 'PRODUCT_LENGTH'] = minm
#     results_median.loc[row, 'PRODUCT_LENGTH'] = median
#     results_wt.loc[row, 'PRODUCT_LENGTH'] = wt
#     results_pr.loc[row, 'PRODUCT_LENGTH'] = pr
    result_values.append(values)
    end = time.time()
#     print(f'Time needed = {(end-start)*test_feats.shape[0]}')

  0%|          | 0/734736 [00:00<?, ?it/s]

row number: 0
row number: 10000
row number: 20000
row number: 30000
row number: 40000
row number: 50000
row number: 60000
row number: 70000
row number: 80000


In [34]:
results_mean.to_csv('results_mean.csv', index=False)

In [35]:
result_values_np = np.array(result_values)
print(result_values_np.shape)
np.save('result_values.npy', result_values_np)

(734736, 10)


In [36]:
results['values'] = result_values

In [37]:
results.to_csv('results.csv', index=False)

In [38]:
# results_mean.to_csv("submission_mean.csv", index=False)
# results_min.to_csv("submission_mean.csv", index=False)
# results_median.to_csv("submission_mean.csv", index=False)
# results_wt.to_csv("submission_mean.csv", index=False)
# results_pr.to_csv("submission_mean.csv", index=False)