In [1]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
from transformers import AutoModel,AutoTokenizer, DataCollatorWithPadding
import torch
import torch.nn.functional as F
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
# set seed
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global generator and PyTorch (CPU and all CUDA devices).

    Besides seeding, cuDNN is switched to deterministic mode and its
    auto-tuner (``benchmark``) is turned off.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [3]:
# dftr = pd.read_csv("/kaggle/input/amazon-ml/dataset/train.csv")
# dftr["src"]="train"
# dfte = pd.read_csv("/kaggle/input/amazon-ml/dataset/test.csv")
# dfte["src"]="test"
# print('Train shape:',dftr.shape,'Test shape:',dfte.shape,'Test columns:',dfte.columns)
# # df = pd.concat([dftr,dfte],ignore_index=True)

# dftr.head()

In [4]:
# print(dftr.shape)
# dftr = dftr.dropna(axis=0, subset=['TITLE'])
# print(dftr.shape)

In [5]:
# dftr.fillna("", inplace=True)
# dfte.fillna("", inplace=True)

In [6]:
# dftr.isna().sum()

In [7]:
# target_cols = ['PRODUCT_LENGTH']

In [8]:
# import sys
# sys.path.append('../input/iterativestratification')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# FOLDS = 25
# skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
# for i,(train_index, val_index) in enumerate(skf.split(dftr,dftr[target_cols])):
#     dftr.loc[val_index,'FOLD'] = i
# print('Train samples per fold:')
# dftr.FOLD.value_counts()

In [9]:
# from sklearn.model_selection import StratifiedKFold
# def create_folds(data, num_splits):
#     data["kfold"] = -1
#     data = data.sample(frac=1).reset_index(drop=True)
#     y=data["PRODUCT_TYPE_ID"]
#     kf = StratifiedKFold(n_splits=num_splits)
#     for f, (t_, v_) in enumerate(kf.split(X=data, y=y)):
#         data.loc[v_, 'kfold'] = f
#     return data

In [10]:
# train=create_folds(dftr, 5)

In [11]:
# dftr=train.loc[train.kfold.isin([0,1])]
# dftr=dftr.reset_index(drop=True)
# display(dftr.head())
# dftr.shape

In [12]:
# dftr = dftr[['PRODUCT_ID', 'TITLE', 'PRODUCT_TYPE_ID', 'PRODUCT_LENGTH', 'src', 'kfold']]
# dfte = dfte[['PRODUCT_ID', 'TITLE', 'PRODUCT_TYPE_ID', 'src']]

In [13]:
# dfte.head()

In [14]:
# The training data comes as two fold files; tag each one with its fold id
# before stacking them into a single frame (row labels are kept as read).
dftr0 = pd.read_csv("/kaggle/input/amazon-fold/train_fold0.csv").assign(fold=0)
dftr1 = pd.read_csv("/kaggle/input/amazon-fold/train_fold1.csv").assign(fold=1)
dftr = pd.concat([dftr0, dftr1])
print(dftr0.shape, dftr1.shape, dftr.shape)

# Test data: replace every missing value with an empty string.
dfte = pd.read_csv("/kaggle/input/amazon-ml/dataset/test.csv").fillna("")

(449938, 9) (449937, 9) (899875, 9)


In [15]:
# The per-fold frames were already combined into `dftr`; drop the extra
# references and run a collection pass (its return value is the cell output).
del dftr0
del dftr1
gc.collect()

21

# Generate Embeddings

In [16]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state``; expected to be
            laid out as (batch, tokens, hidden).
        attention_mask: Mask with one entry per token (non-zero = keep). It may
            live on any device; it is moved next to the hidden states so the
            reduction never mixes devices.

    Returns:
        A CPU tensor holding one pooled vector per sequence. A sequence whose
        mask is entirely zero yields a zero vector (the divisor is clamped to
        1e-9 instead of dividing by zero).
    """
    token_embeddings = model_output.last_hidden_state.detach()
    mask = (
        attention_mask.to(token_embeddings.device)
        .unsqueeze(-1)
        .expand(token_embeddings.size())
        .float()
    )
    # Reduce on the device that already holds the hidden states, so only the
    # small pooled result is copied to the CPU instead of every token vector.
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return (summed / counts).cpu()

In [17]:
BATCH_SIZE = 32   # module-level default; get_embeddings() takes its own BATCH_SIZE argument
tokenizer = None  # placeholder; get_embeddings() assigns the real tokenizer through `global`
MAX_LEN = 32      # truncation length in tokens; get_embeddings() overwrites it with its MAX argument


class EmbedDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises the ``TITLE`` column of a DataFrame.

    Each item is the tokenizer output for one row, without padding (padding is
    left to the DataLoader's collate function) and truncated to ``MAX_LEN``
    tokens. The module-level ``tokenizer`` must be set before items are read.
    """

    def __init__(self, df):
        # Positional labels 0..n-1 so that `.loc[idx]` matches DataLoader indices.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, "TITLE"]
        if not isinstance(text, str):
            # pandas reports a missing title as NaN/None, which is not valid
            # text input for the tokenizer; treat it as an empty title.
            text = "" if pd.isna(text) else str(text)
        tokens = tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding=False,
            truncation=True,
            # Read the module-level limit instead of a hard-coded 32, so the
            # MAX value handed to get_embeddings() actually takes effect.
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        return {k: v.squeeze(0) for k, v in tokens.items()}



In [18]:
def _embed_frame(df, model, device, batch_size):
    """Embed the ``TITLE`` of every row of ``df``; returns a float16 matrix, one row per title."""
    loader = torch.utils.data.DataLoader(
        EmbedDataset(df),
        batch_size=batch_size,
        shuffle=False,  # output rows must stay aligned with the frame's rows
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    chunks = []
    for batch in tqdm(loader, total=len(loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        pooled = F.normalize(pooled, p=2, dim=1)
        # Keep the batch axis even when the batch holds a single row: squeezing
        # it would turn one embedding into a flat run of scalars.
        # Casting per batch keeps the accumulated features in half precision.
        chunks.append(pooled.detach().cpu().numpy().astype(np.float16))
    if not chunks:
        # Empty frame: return a well-formed (0, hidden) matrix.
        return np.empty((0, getattr(model.config, "hidden_size", 0)), dtype=np.float16)
    return np.concatenate(chunks, axis=0)


def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True,train_feat='', test_feat=''):
    """Compute mean-pooled, L2-normalised title embeddings and save them as ``.npy``.

    The module-level frames ``dftr`` and ``dfte`` are embedded with the
    pretrained model ``MODEL_NM`` and written to ``train_feat`` and
    ``test_feat`` respectively, as float16 arrays with one row per title.

    Args:
        MODEL_NM: Model identifier passed to ``AutoModel`` / ``AutoTokenizer``.
        MAX: Value assigned to the module-level ``MAX_LEN``.
        BATCH_SIZE: Number of titles per forward pass.
        verbose: When true, print the shape of each saved array.
        train_feat: Output path for the training embeddings.
        test_feat: Output path for the test embeddings.

    Returns:
        None. The embeddings are only written to disk.

    Side effects:
        Rebinds the module-level ``tokenizer`` and ``MAX_LEN``.
    """
    global tokenizer, MAX_LEN
    # Fall back to the CPU so the function still runs without a GPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(MODEL_NM)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NM)
    MAX_LEN = MAX

    model = model.to(DEVICE)
    model.eval()

    jobs = (("Train", dftr, train_feat), ("Test", dfte, test_feat))
    for label, frame, out_path in jobs:
        feats = _embed_frame(frame, model, DEVICE, BATCH_SIZE)
        if verbose:
            print(f'{label} embeddings shape', feats.shape)
        np.save(out_path, feats)
        del feats
        gc.collect()

    # Release the model so the next call can load another one into the same memory.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [19]:
# Embed train and test titles with the sentence-transformers MPNet checkpoint.
MODEL_NM = 'sentence-transformers/all-mpnet-base-v2'
get_embeddings(
    MODEL_NM=MODEL_NM,
    MAX=32,
    BATCH_SIZE=64,
    train_feat='train_mpnet_basev2_feats.npy',
    test_feat='test_mpnet_basev2_feats.npy',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

  0%|          | 0/14061 [00:00<?, ?it/s]You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 14061/14061 [18:39<00:00, 12.56it/s]


Train embeddings shape (899875, 768)


100%|██████████| 11481/11481 [15:14<00:00, 12.56it/s]


Test embeddings shape (734736, 768)


In [20]:
# Embed train and test titles with the ALBERT base checkpoint.
MODEL_NM = 'albert-base-v2'
get_embeddings(
    MODEL_NM=MODEL_NM,
    MAX=32,
    BATCH_SIZE=64,
    train_feat='train_albert_basev2_feats.npy',
    test_feat='test_albert_basev2_feats.npy',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

  0%|          | 0/14061 [00:00<?, ?it/s]You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 14061/14061 [21:39<00:00, 10.82it/s]


Train embeddings shape (899875, 768)


100%|██████████| 11481/11481 [17:47<00:00, 10.75it/s]


Test embeddings shape (734736, 768)
