In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
from transformers import AutoModel,AutoTokenizer, DataCollatorWithPadding
import torch
import torch.nn.functional as F
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
# set seed
def set_seed(seed):
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices), then puts cuDNN into deterministic
    mode with benchmarking turned off.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Function-scope import: `random` is not among the imports in the
    # notebook's first cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Prefer repeatable cuDNN kernel selection over autotuned speed.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# dftr = pd.read_csv("/kaggle/input/amazon-ml/dataset/train.csv")
# dftr["src"]="train"
# dfte = pd.read_csv("/kaggle/input/amazon-ml/dataset/test.csv")
# dfte["src"]="test"
# print('Train shape:',dftr.shape,'Test shape:',dfte.shape,'Test columns:',dfte.columns)
# # df = pd.concat([dftr,dfte],ignore_index=True)

# dftr.head()

In [None]:
# print(dftr.shape)
# dftr = dftr.dropna(axis=0, subset=['TITLE'])
# print(dftr.shape)

In [None]:
# dftr.fillna("", inplace=True)
# dfte.fillna("", inplace=True)

In [None]:
# dftr.isna().sum()

In [None]:
# target_cols = ['PRODUCT_LENGTH']

In [None]:
# import sys
# sys.path.append('../input/iterativestratification')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# FOLDS = 25
# skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
# for i,(train_index, val_index) in enumerate(skf.split(dftr,dftr[target_cols])):
#     dftr.loc[val_index,'FOLD'] = i
# print('Train samples per fold:')
# dftr.FOLD.value_counts()

In [None]:
# from sklearn.model_selection import StratifiedKFold
# def create_folds(data, num_splits):
#     data["kfold"] = -1
#     data = data.sample(frac=1).reset_index(drop=True)
#     y=data["PRODUCT_TYPE_ID"]
#     kf = StratifiedKFold(n_splits=num_splits)
#     for f, (t_, v_) in enumerate(kf.split(X=data, y=y)):
#         data.loc[v_, 'kfold'] = f
#     return data

In [None]:
# train=create_folds(dftr, 5)

In [None]:
# dftr=train.loc[train.kfold.isin([0,1])]
# dftr=dftr.reset_index(drop=True)
# display(dftr.head())
# dftr.shape

In [None]:
# dftr = dftr[['PRODUCT_ID', 'TITLE', 'PRODUCT_TYPE_ID', 'PRODUCT_LENGTH', 'src', 'kfold']]
# dfte = dfte[['PRODUCT_ID', 'TITLE', 'PRODUCT_TYPE_ID', 'src']]

In [None]:
# dfte.head()

In [None]:
# Only these columns are read from the training parquet files.
rel_columns = [
    'PRODUCT_ID',
    'TITLE',
    'PRODUCT_LENGTH',
    'PRODUCT_TYPE_ID',
    'Fold',
    'grp',
]

# Load the two training parquet files and report their raw shapes.
dftr0 = pd.read_parquet(
    "/kaggle/input/amzz-best-folds/train0.parquet", columns=rel_columns
)
dftr1 = pd.read_parquet(
    "/kaggle/input/amzz-best-folds/train1.parquet", columns=rel_columns
)
print(dftr0.shape, dftr1.shape)

# Drop every row whose TITLE is missing (row-wise is dropna's default axis).
dftr0 = dftr0.dropna(subset=['TITLE'])
dftr1 = dftr1.dropna(subset=['TITLE'])

In [None]:
# Shapes again, now that rows with a missing TITLE have been dropped.
print(dftr0.shape, dftr1.shape)

In [None]:
# Convert the test CSV to parquet so the following cell can re-read only the
# columns it needs. The file is written to the same absolute path that cell
# reads from, rather than a CWD-relative one, so the round trip does not
# depend on the kernel's working directory.
dfte = pd.read_csv("/kaggle/input/amazon-ml/dataset/test.csv")
dfte.to_parquet("/kaggle/working/test.parquet")
# Release the full-width frame before the column-pruned reload.
del dfte
gc.collect()

In [None]:
test_cols = ['PRODUCT_ID', 'TITLE', 'PRODUCT_TYPE_ID']
# Re-read just the selected columns and replace missing values with "".
dfte = pd.read_parquet(
    "/kaggle/working/test.parquet", columns=test_cols
).fillna("")

# Generate Embeddings

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Object exposing ``last_hidden_state``. It is treated as
            (batch, seq_len, hidden): the mask is expanded to its size and
            the reduction runs over dim 1.
        attention_mask: (batch, seq_len) tensor that is non-zero where a
            token should count. It may live on any device; it is moved next
            to the embeddings before use.

    Returns:
        A detached CPU tensor of shape (batch, hidden). A row whose mask is
        all zeros yields zeros, because the divisor is clamped to 1e-9.
    """
    # Pooling runs on a detached CPU copy of the hidden states.
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    # The embeddings were just moved to CPU; a mask left on another device
    # would raise a device-mismatch error in the multiply below.
    mask = attention_mask.detach().to(token_embeddings.device)
    input_mask_expanded = (
        mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp so an all-padding row divides by 1e-9 instead of zero.
    counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    return summed / counts

In [None]:
BATCH_SIZE = 32
# `tokenizer` and `MAX_LEN` are module-level state: get_embeddings() assigns
# both before it builds any EmbedDataset.
tokenizer = None
MAX_LEN = 32

class EmbedDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``TITLE`` column one row at a time.

    Items are left unpadded (``padding=False``); the DataLoader's collate
    function is expected to pad each batch.
    """

    def __init__(self,df):
        # Re-index to 0..n-1 so positional DataLoader indices work with .loc.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        """Return the tokenizer output for row ``idx`` as a dict of 1-D tensors."""
        text = self.df.loc[idx,"TITLE"]
        tokens = tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding=False,
                truncation=True,
                # Honour the configurable module-level limit instead of a
                # hard-coded 32, so a different MAX_LEN actually takes effect.
                max_length=MAX_LEN,
                return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis of size 1; drop it so
        # the collate function receives per-sample tensors.
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens



In [None]:
def _extract_text_feats(model, tok, df, batch_size, device):
    """Embed every ``TITLE`` in ``df`` and return the features as float16.

    Each batch is run through ``model`` without gradients, mean-pooled with
    ``mean_pooling`` and L2-normalised along dim 1.
    """
    loader = torch.utils.data.DataLoader(
        EmbedDataset(df),
        batch_size=batch_size,
        shuffle=False,  # keep feature rows in dataframe order
        collate_fn=DataCollatorWithPadding(tok),
    )
    feats = []
    for batch in tqdm(loader, total=len(loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        pooled = F.normalize(pooled, p=2, dim=1)
        # Keep the batch axis even for a one-row batch: squeezing it would
        # turn that row into a 1-D vector and break the stacked matrix.
        feats.append(pooled.detach().cpu().numpy().astype(np.float16))
    if not feats:
        # Empty input: same empty float16 array the list-based build produced.
        return np.array(feats, dtype=np.float16)
    return np.concatenate(feats, axis=0)


def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True,train_df=None, test_df=None, train_feat_path='', test_feat_path=''):
    """Compute mean-pooled, L2-normalised title embeddings and save them with ``np.save``.

    Args:
        MODEL_NM: Name or path handed to ``AutoModel`` / ``AutoTokenizer``.
        MAX: Stored in the module-level ``MAX_LEN`` global (intended as the
            truncation length; it only takes effect if ``EmbedDataset`` reads
            ``MAX_LEN``).
        BATCH_SIZE: DataLoader batch size.
        verbose: When true, print the shape of each saved feature matrix.
        train_df: DataFrame with a ``TITLE`` column, or ``None`` to skip.
        test_df: DataFrame with a ``TITLE`` column, or ``None`` to skip.
        train_feat_path: Destination passed to ``np.save`` for train features.
        test_feat_path: Destination passed to ``np.save`` for test features.

    Returns:
        None. Features are written to disk as float16 arrays, not returned.
    """
    # EmbedDataset reads the tokenizer from module scope, so publish it there.
    global tokenizer, MAX_LEN
    # Fall back to CPU so the function still runs on a machine without a GPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained( MODEL_NM )
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    MAX_LEN = MAX

    model = model.to(DEVICE)
    model.eval()

    if train_df is not None:
        print('Extracting for train data....')
        all_train_text_feats = _extract_text_feats(model, tokenizer, train_df, BATCH_SIZE, DEVICE)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)
        np.save(train_feat_path, all_train_text_feats)
        # Free the matrix before the next extraction allocates its own.
        del all_train_text_feats
        gc.collect()

    if test_df is not None:
        print("Extracting for test data...")
        te_text_feats = _extract_text_feats(model, tokenizer, test_df, BATCH_SIZE, DEVICE)
        if verbose:
            print('Test embeddings shape',te_text_feats.shape)
        np.save(test_feat_path, te_text_feats)
        del te_text_feats
        gc.collect()

In [None]:
# Embed the first training frame together with the test frame.
MODEL_NM = 'sentence-transformers/all-mpnet-base-v2'
get_embeddings(
    MODEL_NM,
    MAX=32,
    BATCH_SIZE=64,
    train_df=dftr0,
    test_df=dfte,
    train_feat_path='train_mpnet_basev2_F0_feats.npy',
    test_feat_path='test_mpnet_basev2_feats.npy',
)

In [None]:
# Clear cached GPU blocks and collect garbage before loading the model again.
torch.cuda.empty_cache()
gc.collect()

# Embed the second training frame only; no test frame is passed this time.
MODEL_NM = 'sentence-transformers/all-mpnet-base-v2'
get_embeddings(
    MODEL_NM,
    MAX=32,
    BATCH_SIZE=64,
    train_df=dftr1,
    test_df=None,
    train_feat_path='train_mpnet_basev2_F1_feats.npy',
)

In [None]:
# MODEL_NM = 'albert-base-v2'
# get_embeddings(MODEL_NM, MAX=32, BATCH_SIZE=64, train_df=dftr0, test_df=dfte, train_feat_path='train_albert_basev2_F0_feats.npy', test_feat_path='test_albert_basev2_feats.npy')
# get_embeddings(MODEL_NM, MAX=32, BATCH_SIZE=64, train_df=dftr1, test_df=None, train_feat_path='train_albert_basev2_F1_feats.npy')