In [None]:
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import torch
from pathlib import Path
import pickle
from exported import mean_pooling

In [None]:
# Pipeline parameters for this notebook.
# NOTE(review): these look like placeholders that a pipeline runner (e.g. a
# Ploomber-style "parameters" cell) overwrites before execution — confirm.
# Later cells index `upstream` with string keys (upstream['load'][...]) and
# `product` with string keys (product['rev_df']), which the list / None values
# assigned here would not support as-is.
upstream = ['load', 'validation']
product = None

In [None]:
# Restore the pickled objects referenced by the `load` and `validation`
# upstream entries.
# NOTE(review): pickle.loads can execute arbitrary code embedded in the
# payload — only acceptable while these files come from trusted steps.
model_name_file = Path(upstream['load']['MODEL_NAME'])
MODEL_NAME = pickle.loads(model_name_file.read_bytes())

rev_df_file = Path(upstream['validation']['rev_df'])
rev_df = pickle.loads(rev_df_file.read_bytes())

## Embedding

In [None]:
# Embedding model


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighting each token by its mask value.

    NOTE(review): a `mean_pooling` is also imported from `exported` at the top
    of the notebook; this definition replaces that name from here on —
    confirm which implementation is intended.

    Args:
        model_output: Indexable object whose element 0 holds all token
            embeddings (assumed (batch, seq_len, hidden) — TODO confirm).
        attention_mask: Tensor that becomes broadcast-compatible with the
            token embeddings once a trailing dimension is added.

    Returns:
        Tensor of mask-weighted sums over dim 1 divided by the per-position
        mask totals, with the divisor floored at 1e-9.
    """
    hidden_states = model_output[0]

    # Stretch the mask over the embedding dimension so it can be multiplied
    # element-wise with the token embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = (hidden_states * mask).sum(1)
    # The floor keeps an all-zero mask row from dividing by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / mask_total


# Load the tokenizer and the model from the same checkpoint identifier.
model_id = f'{MODEL_NAME}'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

In [8]:
# Embed each description column separately, concatenate the per-column
# vectors side by side, then reduce the result with PCA.
description_columns = ['desc_1', 'desc_2', 'desc_3']
embeddings = []
for desc_col in description_columns:
    # Missing values are replaced with empty strings directly on rev_df;
    # the same (modified) frame is written out as a product later on.
    rev_df[desc_col] = rev_df[desc_col].fillna('')
    column_texts = rev_df[desc_col].to_list()

    # NOTE(review): the whole column goes through the model in one forward
    # pass (no mini-batching) — may be memory-heavy for large frames.
    encoded_input = tokenizer(
        column_texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        model_output = model(**encoded_input)

    column_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    embeddings.append(column_embedding)

stacked_embeddings = torch.hstack(embeddings).numpy()

# A fractional n_components asks scikit-learn's PCA to keep as many components
# as needed to reach that share of explained variance (see the PCA docs).
# NOTE(review): 0.67 is an unexplained threshold — consider a named constant.
pca = PCA(n_components=0.67)
embeddings_reduced = pca.fit_transform(stacked_embeddings)

In [None]:
# Persist both products as pickles, creating each parent directory first.
for product_key, payload in (
    ('embeddings_reduced', embeddings_reduced),
    ('rev_df', rev_df),
):
    out_path = Path(product[product_key])
    out_path.parent.mkdir(exist_ok=True, parents=True)
    out_path.write_bytes(pickle.dumps(payload))