In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the competition train/test CSVs and tag every row with the split it came from.
dftr = pd.read_csv("/kaggle/input/amazon-ml/dataset/train.csv")
dfte = pd.read_csv("/kaggle/input/amazon-ml/dataset/test.csv")
dftr["src"] = "train"
dfte["src"] = "test"
print(
    'Train shape:', dftr.shape,
    'Test shape:', dfte.shape,
    'Test columns:', dfte.columns,
)

# Stacked copy of both splits with a fresh 0..n-1 index.
df = pd.concat([dftr, dfte], ignore_index=True)

dftr.head()

In [None]:
# Drop training rows whose TITLE is missing; the shape is printed before and
# after so the number of removed rows can be read off the output.
print(dftr.shape)
dftr = dftr.dropna(axis=0, subset=['TITLE'])
print(dftr.shape)

In [None]:
# Replace every remaining missing value, in any column, with an empty string.
dftr = dftr.fillna("")
dfte = dfte.fillna("")

In [None]:
# Per-column count of missing values left in the training frame.
dftr.isna().sum()

In [None]:
# Per-column count of missing values left in the test frame.
dfte.isna().sum()

In [None]:
# Column name(s) kept as the target list — presumably the value to predict;
# NOTE(review): not referenced by any of the cells visible here.
target_cols = ['PRODUCT_LENGTH']

In [None]:
from transformers import AutoModel,AutoTokenizer, DataCollatorWithPadding
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: object exposing ``last_hidden_state``; the tensor is
            detached and copied to the CPU before pooling
            (expected layout: batch, seq_len, hidden).
        attention_mask: mask with one entry per token (batch, seq_len); it must
            live on the CPU because the hidden states are moved there.

    Returns:
        A CPU tensor holding one pooled vector per sequence. A sequence whose
        mask is all zeros yields a zero vector, because the divisor is clamped
        to 1e-9 instead of being allowed to reach zero.
    """
    hidden = model_output.last_hidden_state.detach().cpu()
    # Broadcast the mask across the hidden dimension so it can weight each token.
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
BATCH_SIZE = 4
# Module-level defaults; get_embeddings() overwrites `tokenizer` and `MAX_LEN`
# before it builds any dataset.
tokenizer = None
MAX_LEN = 128


class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenises the ``TITLE`` column of a DataFrame one row at a time.

    Args:
        df: frame with a ``TITLE`` column; its index is reset so rows are
            addressed by position 0..len-1.
        tokenizer: optional callable used to tokenise each title. When omitted,
            the module-level ``tokenizer`` is looked up at access time.
        max_len: optional truncation length. When omitted, the module-level
            ``MAX_LEN`` is looked up at access time.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df.reset_index(drop=True)
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` with the batch axis removed.

        Raises:
            RuntimeError: if neither the instance nor the module provides a tokenizer.
        """
        # Resolve lazily so values set after construction are still honoured.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        if tok is None:
            raise RuntimeError(
                "EmbedDataset has no tokenizer: pass tokenizer=... or set the "
                "module-level `tokenizer` before indexing"
            )
        # Previously hard-coded to 32, which silently ignored MAX_LEN.
        max_len = self._max_len if self._max_len is not None else MAX_LEN

        text = self.df.loc[idx, "TITLE"]
        tokens = tok(
            text,
            None,
            add_special_tokens=True,
            padding=False,  # padding is left to the DataLoader's collate_fn
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        return {k: v.squeeze(0) for k, v in tokens.items()}

In [None]:
def _embed_batches(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return one pooled, L2-normalised vector per row."""
    feats = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)
        # mean_pooling moves the hidden states to the CPU, so hand it a CPU mask.
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Keep the batch axis: squeezing it turned a final batch of size 1 into
        # a 1-D vector, and extend() then appended its scalars as separate rows.
        feats.extend(sentence_embeddings.detach().cpu().numpy())
    return np.array(feats)


def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """Embed the ``TITLE`` text of the global ``dftr`` and ``dfte`` frames.

    Loads the model and tokenizer from ``MODEL_NM``, publishes the tokenizer and
    ``MAX`` through the module-level ``tokenizer`` / ``MAX_LEN`` globals, then
    mean-pools and L2-normalises the model's last hidden state for every row.

    Args:
        MODEL_NM: name or path passed to ``AutoModel`` / ``AutoTokenizer``.
        MAX: value written to the module-level ``MAX_LEN``; tokenisation itself
            happens inside ``EmbedDataset``.
        BATCH_SIZE: number of rows per forward pass.
        verbose: when true, print the shape of each embedding matrix.

    Returns:
        ``(train_embeddings, test_embeddings)`` as NumPy arrays, rows in the
        same order as ``dftr`` / ``dfte``.
    """
    global tokenizer, MAX_LEN
    # Use the GPU when present instead of failing on CPU-only hosts.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(MODEL_NM)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NM)
    MAX_LEN = MAX

    # Rows are tokenised unpadded; the collator pads each batch to its longest row.
    collate = DataCollatorWithPadding(tokenizer)
    embed_dataloader_tr = torch.utils.data.DataLoader(
        EmbedDataset(dftr),
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep row order aligned with dftr
        collate_fn=collate,
    )
    embed_dataloader_te = torch.utils.data.DataLoader(
        EmbedDataset(dfte),
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep row order aligned with dfte
        collate_fn=collate,
    )

    model = model.to(DEVICE)
    model.eval()

    all_train_text_feats = _embed_batches(model, embed_dataloader_tr, DEVICE)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _embed_batches(model, embed_dataloader_te, DEVICE)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return all_train_text_feats, te_text_feats

In [None]:
# Embed the train/test titles with the local DeBERTa-large checkpoint.
MODEL_NM = '../input/huggingface-deberta-variants/deberta-large/deberta-large'
all_train_text_feats3, te_text_feats3 = get_embeddings(MODEL_NM, MAX=32, BATCH_SIZE=64)

In [None]:
# Persist the DeBERTa-large embeddings computed by the preceding get_embeddings
# call (the `...3` arrays; the `...2` names are never defined in this notebook).
np.save('train_deberta_large_feats.npy', all_train_text_feats3)
np.save('test_deberta_large_feats.npy', te_text_feats3)

In [None]:
# Embed the train/test titles with the local DeBERTa-large-MNLI checkpoint.
MODEL_NM = '../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli'
all_train_text_feats4, te_text_feats4 = get_embeddings(MODEL_NM, MAX=32, BATCH_SIZE=64)

In [None]:
# Persist the DeBERTa-large-MNLI embeddings computed by the preceding
# get_embeddings call (the `...4` arrays; the `...2` names are never defined
# in this notebook).
np.save('train_deberta_large_mnli_feats.npy', all_train_text_feats4)
np.save('test_deberta_large_mnli_feats.npy', te_text_feats4)