In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# List the contents of the current working directory.
%ls


[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [3]:
# Change into the MyDrive folder of the mounted drive (path is relative to
# the current working directory).
%cd drive/MyDrive/

/content/drive/MyDrive


In [4]:
# Change into the RecSys_Data folder; the relative paths used by later cells
# are resolved from the working directory.
%cd RecSys_Data

/content/drive/MyDrive/RecSys_Data


In [5]:
import pandas as pd
import os

In [6]:
# Source: https://www.geeksforgeeks.org/how-to-generate-word-embedding-using-bert/

In [7]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Seed Python's and PyTorch's random number generators with one shared value.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

# Seed every CUDA device as well when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
def get_encoding(text, tokenizer, max_length=None):
  """Tokenize a batch of texts into padded PyTorch tensors.

  Args:
    text: The texts to encode. A single string is treated as a batch of one;
      any other iterable (list, tuple, NumPy array, pandas Series, ...) is
      copied into a plain list before it is handed to the tokenizer, since
      some tokenizer implementations only accept lists or tuples.
    tokenizer: Object exposing ``batch_encode_plus`` (e.g. a Hugging Face
      tokenizer).
    max_length: Optional cap on the number of tokens per sequence, forwarded
      to the tokenizer. ``None`` (the default) leaves the limit to the
      tokenizer.

  Returns:
    The value returned by ``tokenizer.batch_encode_plus`` (for Hugging Face
    tokenizers this holds ``input_ids`` and ``attention_mask`` tensors).
  """
  # Normalise the input container; a bare string must not be iterated
  # character by character.
  batch = [text] if isinstance(text, str) else list(text)

  return tokenizer.batch_encode_plus(
      batch,                     # List of input texts
      padding=True,              # Pad to the longest sequence in the batch
      truncation=True,           # Truncate sequences that exceed the limit
      max_length=max_length,     # Optional explicit limit (None = tokenizer default)
      return_tensors='pt',       # Return PyTorch tensors
      add_special_tokens=True,   # Add special tokens CLS and SEP
  )

In [9]:
def generate_embeddings_and_save_to_pt(model, input_ids, attention_mask, batch_size=128, file_prefix="batch_"):
    """Run ``model`` over the inputs in batches and save each batch's hidden states.

    For every consecutive slice of ``batch_size`` rows, the model's
    ``last_hidden_state`` is moved to the CPU and written with ``torch.save``
    to ``f"{file_prefix}{batch_index}.pt"``, where ``batch_index`` counts up
    from 0.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``last_hidden_state``. It is moved to the GPU when
            one is available, otherwise to the CPU. It is switched to eval
            mode for the duration of the call and its previous mode is
            restored afterwards.
        input_ids: Tensor whose first dimension indexes the samples.
        attention_mask: Tensor sliced along its first dimension in step with
            ``input_ids``.
        batch_size: Number of samples per forward pass; must be positive.
        file_prefix: Path prefix of the output files.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # This is inference only: make sure dropout and similar layers are
    # disabled, and remember the caller's mode so it can be restored.
    was_training = model.training
    model.eval()

    num_samples = input_ids.size(0)

    try:
        for i in range(0, num_samples, batch_size):
            # The number printed is the offset of the batch's first sample.
            print(f'training batch {i}')

            # Only the current slice is copied to the device; the full
            # tensors stay where the caller put them.
            batch_input_ids = input_ids[i:i+batch_size].to(device)
            batch_attention_mask = attention_mask[i:i+batch_size].to(device)

            with torch.no_grad():
                outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
                batch_embeddings = outputs.last_hidden_state.cpu()  # Keep on CPU

            # Save each batch to a separate file
            torch.save(batch_embeddings, f"{file_prefix}{i // batch_size}.pt")

            # Drop every reference to this batch before releasing cached
            # GPU memory, otherwise the device tensors are still alive.
            del outputs, batch_embeddings, batch_input_ids, batch_attention_mask
            if device.type == 'cuda':
                torch.cuda.empty_cache()
    finally:
        model.train(was_training)

In [10]:
# Article bodies are cut to at most this many token positions before embedding.
_MAX_TEXT_TOKENS = 256


def _token_paths(target_folder, name):
  """Return the (input ids, attention mask) cache file paths for ``name``."""
  return (
      os.path.join(target_folder, name + '_inputs.pt'),
      os.path.join(target_folder, name + '_mask.pt'),
  )


def _load_cached_tokens(target_folder, name, expected_rows):
  """Return cached ``(input_ids, attention_mask)`` for ``name``, or ``None`` if unusable."""
  ids_path, mask_path = _token_paths(target_folder, name)
  if not (os.path.exists(ids_path) and os.path.exists(mask_path)):
    return None

  input_ids = torch.load(ids_path)
  attention_mask = torch.load(mask_path)

  # A cache written for a different dataframe would misalign embeddings and
  # rows without any error, so it is ignored and the tokens are rebuilt.
  if input_ids.size(0) != expected_rows or attention_mask.size(0) != expected_rows:
    return None
  return input_ids, attention_mask


def _save_tokens(target_folder, name, input_ids, attention_mask):
  """Write the token ids and attention mask for ``name`` into ``target_folder``."""
  ids_path, mask_path = _token_paths(target_folder, name)
  torch.save(input_ids, ids_path)
  torch.save(attention_mask, mask_path)


def generate_embeddings(df, model, tokenizer, target_folder, batch_size):
  """Tokenize the ``text`` and ``title`` columns of ``df`` and save their embeddings.

  Token ids and attention masks are cached in ``target_folder`` as
  ``text_inputs.pt`` / ``text_mask.pt`` and ``title_inputs.pt`` /
  ``title_mask.pt``. An existing cache is reused only when its row count
  matches ``df``. Embeddings are written batch by batch to
  ``text_embedding_<k>.pt`` and ``title_embedding_<k>.pt`` in the same folder.

  Args:
    df: Dataframe providing ``text`` and ``title`` columns.
    model: Encoder passed on to ``generate_embeddings_and_save_to_pt``.
    tokenizer: Tokenizer passed on to ``get_encoding``.
    target_folder: Output directory; created if it does not exist.
    batch_size: Number of rows per forward pass.
  """
  text = df.text.values
  titles = df.title.values

  # torch.save does not create missing directories.
  os.makedirs(target_folder, exist_ok=True)

  text_tokens = _load_cached_tokens(target_folder, 'text', len(text))
  if text_tokens is None:
    text_encoding = get_encoding(text, tokenizer)

    print('Text tokenized...')

    # Slicing clamps to the available length, so batches whose padded length
    # is already below the cap pass through unchanged. clone() detaches the
    # result from the full-length storage so only the kept part is saved.
    # NOTE(review): cutting after tokenization drops the closing special
    # token of texts longer than the cap - consider capping in the tokenizer.
    text_tokens = (
        text_encoding['input_ids'][:, :_MAX_TEXT_TOKENS].clone(),
        text_encoding['attention_mask'][:, :_MAX_TEXT_TOKENS].clone(),
    )
    _save_tokens(target_folder, 'text', *text_tokens)

    print('Tokens saved...')

  text_input_ids, text_attention_mask = text_tokens

  print('Generating text embeddings...')

  generate_embeddings_and_save_to_pt(
      model, text_input_ids, text_attention_mask, batch_size,
      file_prefix=os.path.join(target_folder, 'text_embedding_'))

  title_tokens = _load_cached_tokens(target_folder, 'title', len(titles))
  if title_tokens is None:
    title_encoding = get_encoding(titles, tokenizer)

    print('Titles tokenized...')

    # Titles are kept at the length the tokenizer produced.
    title_tokens = (title_encoding['input_ids'], title_encoding['attention_mask'])
    _save_tokens(target_folder, 'title', *title_tokens)

    print('Tokens saved')

  title_input_ids, title_attention_mask = title_tokens

  print('Generating title embeddings...')

  generate_embeddings_and_save_to_pt(
      model, title_input_ids, title_attention_mask, batch_size,
      file_prefix=os.path.join(target_folder, 'title_embedding_'))

In [11]:
# Read the three article splits (test, train, validation - in that order)
# from the BERT folder under the current working directory.
test_to_embed, train_to_embed, val_to_embed = map(
    pd.read_csv,
    (
        'BERT/bert_testing_set.csv',
        'BERT/bert_training_set.csv',
        'BERT/bert_validation_set.csv',
    ),
)

In [12]:
# Embed each split into its own folder, 1000 rows per forward pass.
embedding_jobs = [
    (train_to_embed, 'batched_article_embeddings/train'),
    (test_to_embed, 'batched_article_embeddings/test'),
    (val_to_embed, 'batched_article_embeddings/val'),
]

for job_index, (split_frame, split_folder) in enumerate(embedding_jobs):
    # Release cached GPU memory between consecutive splits.
    if job_index > 0:
        torch.cuda.empty_cache()
    generate_embeddings(split_frame, model, tokenizer, split_folder, 1000)

Text tokenized...
Tokens saved...
Generating text embeddings...
training batch 0
training batch 1000
training batch 2000
training batch 3000
training batch 4000
training batch 5000
training batch 6000
training batch 7000
training batch 8000
training batch 9000
training batch 10000
training batch 11000
training batch 12000
training batch 13000
training batch 14000
training batch 15000
training batch 16000
training batch 17000
training batch 18000
training batch 19000
training batch 20000
training batch 21000
training batch 22000
training batch 23000
training batch 24000
training batch 25000
training batch 26000
training batch 27000
training batch 28000
training batch 29000
training batch 30000
training batch 31000
Titles tokenized...
Tokens saved
Generating title embeddings...
training batch 0
training batch 1000
training batch 2000
training batch 3000
training batch 4000
training batch 5000
training batch 6000
training batch 7000
training batch 8000
training batch 9000
training batch 1

In [13]:
####################################################