In [1]:
import os
import numpy as np
import pandas as pd
import json
import itertools
import pickle
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer, util

## Load data

In [30]:
# Directory the ad-level text CSV is read from. Defaults to the original
# location; set the G2022_SOURCE_PATH environment variable to point the
# notebook at another copy of the data without editing this cell.
source_path = os.environ.get(
    'G2022_SOURCE_PATH',
    '/home/mzhang01/google_2022/produce_complete_data/05-assemble-final-results',
)

In [31]:
# Text table, read from the gzip-compressed CSV under `source_path`.
# (g2022_adid_var.csv.gz from the same directory is not loaded here.)
text = pd.read_csv(
    os.path.join(source_path, 'g2022_adid_01062021_11082022_text_v20240115.csv.gz')
)

# Table that is later left-joined with `text` on `ad_id`.
# NOTE(review): presumably this carries the ad_id -> wmp_creative_id mapping — confirm.
cid = pd.read_csv(
    '~/google_2022/produce_complete_data/04-dedup-2nd-time/final_output/cid_google2022_v20231203.csv'
)

  text = pd.read_csv(os.path.join(source_path, 'g2022_adid_01062021_11082022_text_v20240115.csv.gz'))


In [32]:
# Left join keeps every row of `cid`; rows that end up identical across all
# columns after the join are then dropped.
df = pd.merge(cid, text, how='left', on='ad_id').drop_duplicates()

In [33]:
# (rows, columns) of the merged, de-duplicated table.
df.shape

(233423, 14)

In [5]:
# One row per distinct creative ID. For repeated IDs the last occurrence in
# `df` is the one kept, which determines the row order of `unique`.
unique = (
    df.loc[:, ['wmp_creative_id']]
    .drop_duplicates(keep='last')
    .reset_index(drop=True)
)
unique.shape

(64819, 1)

In [None]:
# Persist the distinct creative IDs (single column, no index column written).
# NOTE(review): this writes under 'input_data/' while the mapping is later read
# back from '../input_data/' — confirm both resolve to the same file.
unique.to_csv('input_data/unique_creative_id_index_mapping.csv', index=False)

In [26]:
'''
Mapping between corpus embedding index and creative ID 
'''
# Reload the mapping from disk: the row position of a creative ID in this frame
# is the row of its embedding in the original (un-reordered) embedding tensor.
unique = pd.read_csv('../input_data/unique_creative_id_index_mapping.csv')

In [19]:
# Inspect the mapping (pandas truncates the display for long frames).
unique

Unnamed: 0,index,wmp_creative_id
0,0,cid_50901
1,1,cid_6292
2,2,cid_32185
3,3,cid_55443
4,4,cid_56643
...,...,...
64814,64814,cid_58672
64815,64815,cid_48088
64816,64816,cid_60619
64817,64817,cid_58673


## Load original embeddings (skip this section and the reordering section below if you only need to load the reordered embeddings)

In [28]:
# Embeddings in their original row order, forced onto the CPU regardless of
# the device they were saved from.
corpus_embeddings0 = torch.load(
    '../model/corpus_embedding_google2022_unique_lite.pt',
    map_location='cpu',
)

In [29]:
# (n_rows, embedding_dim); n_rows should equal the number of rows in `unique`.
corpus_embeddings0.shape

torch.Size([64819, 384])

## Reorder corpus embedding rows to match cid ordering

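**This makes the pairwise similarity computation much faster.**

Once row `k` of the tensor holds the embedding of `cid_k`, a creative ID can be used directly as a row index, so no (slow) pandas lookup is needed to find its embedding.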

In [78]:
# Parse the numeric part of each creative ID ('cid_123' -> 123).
# str.lstrip('cid_') removes any leading run of the characters c/i/d/_ rather
# than the literal prefix, so the prefix is matched explicitly instead. IDs that
# do not look like 'cid_<digits>' become NaN and make the integer cast raise,
# instead of being silently mis-parsed.
unique['cid_index'] = (
    unique['wmp_creative_id']
    .str.extract(r'^cid_(\d+)$', expand=False)
    .astype(int)
)

In [81]:
# Number of distinct target rows; it must equal the number of embeddings so
# that no two source rows are scattered into the same destination row.
unique['cid_index'].nunique(dropna=False)

64819

In [110]:
# Destination buffer with the same shape and dtype as the original embeddings.
# It is sized from `corpus_embeddings0` (the tensor loaded in this notebook's
# "original" section); `corpus_embeddings` is only assigned later, when the
# reordered file is loaded, so it must not be referenced here.
reordered = torch.zeros_like(corpus_embeddings0)

In [89]:
# The destination must match the original-order embeddings exactly; compare
# against `corpus_embeddings0`, the only embedding tensor defined at this point.
assert reordered.shape == corpus_embeddings0.shape

In [104]:
# Target row for each source row: source row i is written to row cid_index[i].
indices = torch.tensor(unique.cid_index.tolist())

# scatter_ needs an index tensor shaped like the source, so repeat the row of
# targets once per embedding dimension -> shape (embedding_dim, n_rows); it is
# transposed to (n_rows, embedding_dim) in the next cell. Tensor.repeat avoids
# materialising n_rows * embedding_dim Python ints in a nested list.
indices = indices.repeat(corpus_embeddings0.shape[1], 1)

In [107]:
# Flip to (n_rows, embedding_dim) so the index lines up with the embeddings.
# Caution: this rebinds `indices` to its own transpose, so running the cell a
# second time flips it back.
indices = indices.t()

In [108]:
# Display the transposed index tensor: each row repeats a single cid number
# across all columns.
indices

tensor([[50901, 50901, 50901,  ..., 50901, 50901, 50901],
        [ 6292,  6292,  6292,  ...,  6292,  6292,  6292],
        [32185, 32185, 32185,  ..., 32185, 32185, 32185],
        ...,
        [60619, 60619, 60619,  ..., 60619, 60619, 60619],
        [58673, 58673, 58673,  ..., 58673, 58673, 58673],
        [60617, 60617, 60617,  ..., 60617, 60617, 60617]])

In [109]:
# Display the destination buffer; in the recorded run this was executed before
# the scatter, so it shows all zeros.
reordered

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [111]:
# scatter_ along dim 0 writes source element [i, j] to reordered[indices[i, j], j].
# Every column of `indices` holds the same cid number for a given row, so whole
# rows move: reordered[cid_index[i]] = corpus_embeddings0[i].
# The source must be the ORIGINAL-order tensor `corpus_embeddings0`;
# `corpus_embeddings` is only defined later, when the reordered file is loaded.
reordered.scatter_(0, indices, corpus_embeddings0)

tensor([[ 0.0367,  0.0925, -0.1194,  ..., -0.0847, -0.0159,  0.0512],
        [ 0.0548,  0.0080,  0.0620,  ..., -0.1069, -0.0548, -0.0227],
        [ 0.0307, -0.0286, -0.0491,  ..., -0.0400, -0.0628, -0.0395],
        ...,
        [-0.0643, -0.0282,  0.0453,  ...,  0.0062,  0.0168, -0.0307],
        [ 0.0543, -0.0491,  0.0306,  ..., -0.0487, -0.1021,  0.0415],
        [ 0.0526, -0.0733, -0.0400,  ..., -0.0593, -0.0263,  0.0241]])

In [113]:
'''
Should load this model for google data analysis
'''
# Write the reordered embedding tensor to disk.
# NOTE(review): the original embeddings were loaded from '../model/' but this
# saves under 'model/' — confirm the intended directory.
torch.save(reordered, 'model/corpus_embedding_google2022_unique_lite_reordered.pt') 

## Import the reordered embeddings

In [4]:
# using the reordered tensors is much, much faster
corpus_embeddings = torch.load('model/corpus_embedding_google2022_unique_lite_reordered.pt')

#### test with cosine similarity calculations

In [27]:
# Spot check on the reordered tensor: similarity between cid_0 and cid_62750,
# addressed directly by cid number.
util.cos_sim(corpus_embeddings[0], corpus_embeddings[62750])

tensor([[0.1708]])

In [33]:
# Find where the same two creatives sit in the original row order.
unique.loc[unique['wmp_creative_id'].isin(['cid_0', 'cid_62750'])]

Unnamed: 0,index,wmp_creative_id
44040,44040,cid_0
49492,49492,cid_62750


In [34]:
# Same pair looked up in the ORIGINAL tensor via the row positions found in
# `unique` (44040 -> cid_0, 49492 -> cid_62750); the similarity should match
# the value computed from the reordered tensor.
util.cos_sim(corpus_embeddings0[44040], corpus_embeddings0[49492])

tensor([[0.1708]])

In [35]:
# Ten most similar creatives to cid_0 (the first hit is cid_0 itself); the
# returned indices are cid numbers because the tensor is in cid order.
util.cos_sim(corpus_embeddings[0], corpus_embeddings)[0].topk(10)

torch.return_types.topk(
values=tensor([1.0000, 0.6093, 0.5966, 0.5905, 0.5821, 0.5798, 0.5798, 0.5798, 0.5734,
        0.5734]),
indices=tensor([    0, 11081, 11093, 11096, 11094, 11018, 11079, 11034, 11089, 11047]))