In [1]:
import pandas as pd
import numpy as np

import os
from transformers import AutoTokenizer, AutoModel
import torch
import time

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Mount Google Drive into the Colab runtime so the data folder below is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Switch the working directory to the data folder on the mounted Drive (IPython `cd` automagic);
# every relative path used below is resolved against this folder.
cd '/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data'

/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data


# Load Data

In [4]:
# Read the raw dataset: lyrics are the model input, valence / energy are the two targets.
data_orig = pd.read_csv("tcc_ceds_music.csv")

X = data_orig['lyrics']
y_valence = data_orig['valence']
y_arousal = data_orig['energy']

# Split the lyrics and BOTH targets in a single call so that all three arrays are guaranteed to
# share the same row partition (two separate calls only agree because they use the same
# random_state and the same number of rows).
(X_train, X_test,
 y_train_valence, y_test_valence,
 y_train_arousal, y_test_arousal) = train_test_split(
    X, y_valence, y_arousal, test_size=0.2, random_state=18)

print(X_train.shape)
print(X_test.shape)

(22697,)
(5675,)


In [None]:
# Persist the train / test partitions as .npy files in the working directory.
_partition_files = {
    'X_train.npy': X_train,
    'X_test.npy': X_test,
    'y_train_valence.npy': y_train_valence,
    'y_train_arousal.npy': y_train_arousal,
    'y_test_valence.npy': y_test_valence,
    'y_test_arousal.npy': y_test_arousal,
}

for _file_name, _values in _partition_files.items():
    np.save(_file_name, _values)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configuration (English stop words removed, vocabulary capped at 5000 terms,
# document-frequency bounds given by min_df / max_df).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    min_df=0.01,
    max_df=0.85,
)

In [None]:
# Fit the vectorizer on the training lyrics only and reuse the fitted vectorizer for the test lyrics.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Last expression of the cell: shown as the cell output.
X_train_tfidf.shape

(22697, 659)

In [None]:
# TF-IDF score for each word in each of the TRAINING documents (one row per training document,
# one column per vocabulary term). Used later to weight word embeddings when building
# document embeddings. Note that the table is built from X_train_tfidf only.
feature_names = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)

df_tfidf.shape

(22697, 659)

In [None]:
# Write the sparse TF-IDF matrices into the 'TF-IDF' folder.
from scipy import sparse

for _npz_name, _matrix in (('X_train_tfidf.npz', X_train_tfidf),
                           ('X_test_tfidf.npz', X_test_tfidf)):
    sparse.save_npz(os.path.join('TF-IDF', _npz_name), _matrix)

# Word2Vec

In [None]:
#!pip install gensim
import gensim
from gensim.models import Word2Vec

In [None]:
def preprocess(text):
    """Turn one raw lyrics string into a list of tokens via gensim's `simple_preprocess`."""
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

# One token list per document, in the same order as X_train / X_test.
X_train_tokenized = list(map(preprocess, X_train))
X_test_tokenized = list(map(preprocess, X_test))

## Pre-trained Word2Vec


In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained Word2Vec vectors from a binary word2vec-format file
# (the path is relative to the current working directory).
pretrained_model_path = 'GoogleNews-vectors-negative300.bin'
pretrained_word2vec = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

#### Mean Pooling

* obtaining document embeddings using Mean Pooling

In [None]:
def get_document_vector(doc):
  """Mean-pool the pre-trained Word2Vec vectors of the tokens in `doc`.

  Tokens missing from `pretrained_word2vec` are skipped. If no token is known,
  a zero vector of length `pretrained_word2vec.vector_size` is returned.
  """
  known_tokens = [token for token in doc if token in pretrained_word2vec]
  if not known_tokens:
    return np.zeros(pretrained_word2vec.vector_size)

  stacked = [pretrained_word2vec[token] for token in known_tokens]
  return np.mean(stacked, axis=0)

In [None]:
# One mean-pooled document vector per tokenized song.
X_train_Word2Vec_pretrained = np.array(list(map(get_document_vector, X_train_tokenized)))
X_test_Word2Vec_pretrained = np.array(list(map(get_document_vector, X_test_tokenized)))

X_train_Word2Vec_pretrained.shape

(22697, 300)

In [None]:
# Store the mean-pooled pre-trained Word2Vec embeddings in the 'Word2Vec' folder.
for _npy_name, _embeddings in (('X_train_Word2Vec_pretrained.npy', X_train_Word2Vec_pretrained),
                               ('X_test_Word2Vec_pretrained.npy', X_test_Word2Vec_pretrained)):
    np.save(os.path.join('Word2Vec', _npy_name), _embeddings)

### Document embeddings with TF-IDF

* obtain document embeddings using TF-IDF scores

In [None]:
# For each document: look up the embedding of every token, multiply it with the TF-IDF score of
# that token in the document, and combine the weighted embeddings into one document vector.

def is_word_in_word2vec(word, word2vec_model_):
    """Return True when `word` is in the vocabulary of the given Word2Vec model.

    The model passed as `word2vec_model_` is the one that is queried (its `.wv` attribute),
    not a module-level global.
    """
    return word in word2vec_model_.wv


def get_document_embedding_tfidf(X_tokenized_, pretrained_Word2Vec=False, tfidf_df=None):
  """Build TF-IDF-weighted document embeddings from word embeddings.

  Parameters
  ----------
  X_tokenized_ : iterable of list of str
      Tokenized documents.
  pretrained_Word2Vec : bool, default False
      True  -> use the global `pretrained_word2vec` vectors.
      False -> use the vectors of the global custom `word2vec_model` (`.wv`).
  tfidf_df : pandas.DataFrame, optional
      Table with one row per document (same order as `X_tokenized_`) and one column per term.
      Defaults to the global `df_tfidf`. NOTE: the global table is built from the TRAINING
      documents, so pass a table built from the matching split when embedding other documents.

  Returns
  -------
  numpy.ndarray
      One row per document. A document whose total TF-IDF weight is 0 gets a zero vector.
  """
  if tfidf_df is None:
    tfidf_df = df_tfidf

  # Only the selected model is touched, so the pre-trained path works before the custom
  # model has been trained (and vice versa).
  keyed_vectors = pretrained_word2vec if pretrained_Word2Vec else word2vec_model.wv

  fallback_score = 0.1  # weight for tokens without a TF-IDF entry ("try different approaches")
  doc_embeddings = []

  for doc_index, doc in enumerate(X_tokenized_):
    if doc_index % 2500 == 0:
      print(f'Document # {doc_index}')

    weighted_embeddings = []
    total_tfidf_weight = 0

    for token in doc:
      # tokens unknown to the embedding model are skipped
      if token not in keyed_vectors:
        continue
      word_embedding = keyed_vectors[token]

      # TF-IDF score of the token in this document; tokens without an entry get the fallback
      try:
        tfidf_score = tfidf_df.at[doc_index, token]
      except KeyError:
        tfidf_score = fallback_score

      weighted_embeddings.append(word_embedding * tfidf_score)
      total_tfidf_weight += tfidf_score   # total TF-IDF weight of the document

    if total_tfidf_weight == 0:
      # nothing usable in this document: zero vector with the dimensionality of the model in use
      document_embedding = np.zeros(keyed_vectors.vector_size)
    else:
      # normalised by document length (alternative: divide by total_tfidf_weight)
      document_embedding = np.sum(weighted_embeddings, axis=0) / len(doc)

    doc_embeddings.append(document_embedding)

  return np.array(doc_embeddings)

In [None]:
# Training split: TF-IDF-weighted document embeddings from the pre-trained Word2Vec vectors.
X_train_Word2Vec_pretrained_tfidf = get_document_embedding_tfidf(
    X_tokenized_=X_train_tokenized,
    pretrained_Word2Vec=True,
)
print(X_train_Word2Vec_pretrained_tfidf.shape)

Document # 0
Document # 2500
Document # 5000
Document # 7500
Document # 10000
Document # 12500
Document # 15000
Document # 17500
Document # 20000
Document # 22500
(22697, 300)


In [None]:
# apply to test
# NOTE(review): get_document_embedding_tfidf looks up scores in df_tfidf by document position,
# and df_tfidf is built from X_train_tfidf. Test document i is therefore weighted with the
# TF-IDF row of training document i. Verify whether a table built from X_test_tfidf should be
# used here instead.
X_test_Word2Vec_pretrained_tfidf = get_document_embedding_tfidf(X_test_tokenized, pretrained_Word2Vec=True)
print(X_test_Word2Vec_pretrained_tfidf.shape)

Document # 0
Document # 2500
Document # 5000
(5675, 300)


In [None]:
# Store the TF-IDF-weighted pre-trained Word2Vec embeddings in the 'Word2Vec' folder.
for _npy_name, _embeddings in (
        ('X_train_Word2Vec_pretrained_tfidf.npy', X_train_Word2Vec_pretrained_tfidf),
        ('X_test_Word2Vec_pretrained_tfidf.npy', X_test_Word2Vec_pretrained_tfidf)):
    np.save(os.path.join('Word2Vec', _npy_name), _embeddings)

## Custom Word2Vec

* learn embeddings from training corpus

In [None]:
# Fit a custom Word2Vec model on the tokenized training lyrics only.
word2vec_model = Word2Vec(
    sentences=X_train_tokenized,
    vector_size=300,
    window=5,
    min_count=5,
    workers=4,
)

#### Mean Pooling

In [None]:
def get_document_vector(doc, model):
    """Mean-pool the vectors `model.wv` holds for the tokens in `doc`.

    Tokens not present in `model.wv` are skipped. When none is present, a zero vector
    of length `model.vector_size` is returned.
    """
    vocabulary = model.wv
    word_vectors = []
    for word in doc:
        if word in vocabulary:
            word_vectors.append(vocabulary[word])

    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Mean-pooled document vectors from the custom Word2Vec model, for both splits.
X_train_Word2Vec_custom = np.array(
    [get_document_vector(tokens, word2vec_model) for tokens in X_train_tokenized]
)
X_test_Word2Vec_custom = np.array(
    [get_document_vector(tokens, word2vec_model) for tokens in X_test_tokenized]
)

X_train_Word2Vec_custom.shape

(22697, 300)

In [None]:
# Store the mean-pooled custom Word2Vec embeddings in the 'Word2Vec' folder.
for _npy_name, _embeddings in (('X_train_Word2Vec_custom.npy', X_train_Word2Vec_custom),
                               ('X_test_Word2Vec_custom.npy', X_test_Word2Vec_custom)):
    np.save(os.path.join('Word2Vec', _npy_name), _embeddings)

#### Document embeddings with TF-IDF

In [None]:
# Training split: TF-IDF-weighted document embeddings from the custom Word2Vec model.
X_train_Word2Vec_custom_tfidf = get_document_embedding_tfidf(
    X_tokenized_=X_train_tokenized,
    pretrained_Word2Vec=False,
)
print(X_train_Word2Vec_custom_tfidf.shape)

Document # 0
Document # 2500
Document # 5000
Document # 7500
Document # 10000
Document # 12500
Document # 15000
Document # 17500
Document # 20000
Document # 22500
(22697, 300)


In [None]:
# Test split of the CUSTOM Word2Vec embeddings: the flag must be False, exactly as for the
# training split. With True, the "custom" test features would come from the pre-trained vectors
# and live in a different embedding space than the training features.
# NOTE(review): the TF-IDF scores are still looked up in df_tfidf, which is built from the
# training documents. Verify whether a table built from X_test_tfidf should be used here.
X_test_Word2Vec_custom_tfidf = get_document_embedding_tfidf(X_test_tokenized, pretrained_Word2Vec=False)
print(X_test_Word2Vec_custom_tfidf.shape)

Document # 0
Document # 2500
Document # 5000
(5675, 300)


In [None]:
# Store the TF-IDF-weighted custom Word2Vec embeddings in the 'Word2Vec' folder.
for _npy_name, _embeddings in (
        ('X_train_Word2Vec_custom_tfidf.npy', X_train_Word2Vec_custom_tfidf),
        ('X_test_Word2Vec_custom_tfidf.npy', X_test_Word2Vec_custom_tfidf)):
    np.save(os.path.join('Word2Vec', _npy_name), _embeddings)

# Doc2Vec


In [None]:
# Shallow copy of the token lists produced earlier; the following cell assigns
# `train_corpus` again with TaggedDocument objects.
train_corpus = X_train_tokenized.copy()

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap every training document in a TaggedDocument whose tag is its position in X_train.
# The token lists from X_train_tokenized are reused: they were produced by the same
# `preprocess` function on the same X_train, so re-tokenizing the whole corpus (and defining
# `preprocess` a second time) is unnecessary.
train_corpus = [
    TaggedDocument(words=tokens, tags=[i])
    for i, tokens in enumerate(X_train_tokenized)
]

len(train_corpus)

22697

In [None]:
# Create the (still untrained) Doc2Vec model.
doc2vec_model = Doc2Vec(
    vector_size=300,
    window=5,
    min_count=5,
    epochs=5,
)

# Build the vocabulary from the tagged training corpus.
doc2vec_model.build_vocab(train_corpus)

In [None]:
# Train on the tagged corpus, using the example count and epoch number stored on the model.
doc2vec_model.train(
    train_corpus,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

In [None]:
# Infer one document vector per tokenized song (training split first, then test split).
X_train_Doc2Vec = np.array(list(map(doc2vec_model.infer_vector, X_train_tokenized)))
X_test_Doc2Vec = np.array(list(map(doc2vec_model.infer_vector, X_test_tokenized)))

X_train_Doc2Vec.shape

(22697, 300)

In [None]:
# Store the Doc2Vec embeddings in the 'Doc2Vec' folder.
for _npy_name, _embeddings in (('X_train_Doc2Vec.npy', X_train_Doc2Vec),
                               ('X_test_Doc2Vec.npy', X_test_Doc2Vec)):
    np.save(os.path.join('Doc2Vec', _npy_name), _embeddings)

# GloVe


In [None]:
# ! pip install glove-python3

from glove import Glove, Corpus

In [None]:
# Tokenize the lyrics for the GloVe section.

#!pip install gensim
import gensim

def preprocess(text):
    """Turn one raw lyrics string into a list of tokens via gensim's `simple_preprocess`."""
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

# One token list per document, in the same order as X_train / X_test.
X_train_tokenized = list(map(preprocess, X_train))
X_test_tokenized = list(map(preprocess, X_test))

## Pretrained

* Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip

In [None]:
import pickle

GLOVE_DIM = 300  # number of vector components per line in glove.840B.300d.txt

glove_embeddings = {}

# compute GloVe dictionary

# Read the pre-trained txt file line by line ("<token> <v1> ... <v300>") and store the vectors.
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Split from the RIGHT and at most GLOVE_DIM times: the last GLOVE_DIM fields are the
        # vector, everything before them is the token. A plain split(' ') breaks on tokens that
        # themselves contain spaces (the 840B file is reported to contain a few of those).
        values = line.rstrip('\n').rsplit(' ', GLOVE_DIM)
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')

        glove_embeddings[word] = vector

# Save the dictionary as a pickle so the large text file does not have to be parsed again.
with open('pretrained_GloVe_dict.pkl', 'wb') as file:
    pickle.dump(glove_embeddings, file)

In [None]:
# Load the dictionary from the local pickle file (saves the time of parsing the large original
# text file). Only load pickle files you created yourself: unpickling untrusted data can
# execute arbitrary code.
import pickle
with open('pretrained_GloVe_dict.pkl', 'rb') as file:
    glove_embeddings = pickle.load(file)

# Number of words in the dictionary (displayed as the cell output).
len(glove_embeddings)

2196016

In [None]:
def get_word_vectors(song):
  """Return the GloVe vectors of the tokens in `song` as one array (one row per known token).

  Tokens that are not keys of `glove_embeddings` are skipped; token order is preserved.
  """
  collected = []
  for token in song:
    if token in glove_embeddings:
      collected.append(glove_embeddings[token])
  return np.array(collected)

# Word-level embeddings: one array of word vectors per song.
X_train_GloVe_pretrained_big = list(map(get_word_vectors, X_train_tokenized))
X_test_GloVe_pretrained_big = list(map(get_word_vectors, X_test_tokenized))

print(len(X_train_GloVe_pretrained_big))
print(X_train_GloVe_pretrained_big[0].shape)

22697
(57, 300)


In [None]:
# doc level embedding: mean pooling over the word vectors of each song.
# A song without any token in the GloVe dictionary has an empty word-vector array; averaging it
# would yield NaN and a ragged result. Such songs get a zero vector instead, the same fallback
# the Word2Vec and custom GloVe sections use.
_glove_dim = len(next(iter(glove_embeddings.values())))

X_train_GloVe_pretrained = np.array([
    np.mean(song, axis=0) if len(song) > 0 else np.zeros(_glove_dim, dtype='float32')
    for song in X_train_GloVe_pretrained_big
])
X_test_GloVe_pretrained = np.array([
    np.mean(song, axis=0) if len(song) > 0 else np.zeros(_glove_dim, dtype='float32')
    for song in X_test_GloVe_pretrained_big
])

print(X_train_GloVe_pretrained.shape)

(22697, 300)


In [None]:
# Store the mean-pooled pre-trained GloVe embeddings in the 'GloVe' folder.
for _npy_name, _embeddings in (('X_train_GloVe_pretrained.npy', X_train_GloVe_pretrained),
                               ('X_test_GloVe_pretrained.npy', X_test_GloVe_pretrained)):
    np.save(os.path.join('GloVe', _npy_name), _embeddings)

## Custom

* train on training corpus



In [None]:
# Train a custom GloVe model on the tokenized training lyrics.

# Build the co-occurrence corpus (window of 10 tokens) and create the GloVe model (300 components).
corpus_model = Corpus()
corpus_model.fit(X_train_tokenized, window=10)
glove_model = Glove(no_components=300, learning_rate=0.05)

# Train on the co-occurrence matrix, then attach the word -> index dictionary to the model.
glove_model.fit(corpus_model.matrix, epochs=20, no_threads=4, verbose=True)
glove_model.add_dictionary(corpus_model.dictionary)

# Save the trained model into the 'GloVe' folder.
glove_model.save(os.path.join('GloVe', 'glove_model.model'))

# To reuse a previous training run, load from the same location the model is saved to:
# glove_model = Glove.load(os.path.join('GloVe', 'glove_model.model'))

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [None]:
def compute_document_embedding(tokens):
    """Mean-pool the custom GloVe vectors of `tokens` into one document embedding.

    Tokens missing from `glove_model.dictionary` are skipped. If no token is known,
    the zero vector of length `glove_model.no_components` is returned.
    """
    # running sum of the word vectors; stays all-zero when no token is known
    song_embedding = np.zeros(glove_model.no_components)

    # number of tokens that contributed to the sum
    count = 0

    # iterate through each token
    for token in tokens:
        if token in glove_model.dictionary:
            song_embedding += glove_model.word_vectors[glove_model.dictionary[token]]
            count += 1

    # mean pooling across all known words
    if count > 0:
        song_embedding /= count

    return song_embedding

# Document embeddings from the custom GloVe model, for both splits.
X_train_GloVe_custom = np.array(list(map(compute_document_embedding, X_train_tokenized)))
X_test_GloVe_custom = np.array(list(map(compute_document_embedding, X_test_tokenized)))
X_train_GloVe_custom.shape

(22697, 300)

In [None]:
# Store the custom GloVe embeddings in the 'GloVe' folder.
for _npy_name, _embeddings in (('X_train_GloVe_custom.npy', X_train_GloVe_custom),
                               ('X_test_GloVe_custom.npy', X_test_GloVe_custom)):
    np.save(os.path.join('GloVe', _npy_name), _embeddings)

# BERT

In [5]:
# !pip install torch
from transformers import AutoTokenizer, AutoModel
import torch

In [6]:
# Model: BERT-base (tokenizer and model are loaded from the same checkpoint)
_bert_checkpoint = "sentence-transformers/bert-base-nli-mean-tokens"
bert_tokenizer = AutoTokenizer.from_pretrained(_bert_checkpoint)
pretrained_BERT = AutoModel.from_pretrained(_bert_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [7]:
# Each document is later tokenized with a maximum length of 128 tokens.

# Cut both splits into consecutive batches of `batch_size` songs to avoid RAM problems.
batch_size = 500
X_train_batches = [
    X_train[start:start + batch_size]
    for start in range(0, len(X_train), batch_size)
]
X_test_batches = [
    X_test[start:start + batch_size]
    for start in range(0, len(X_test), batch_size)
]

# Report the number of batches and the size of the (possibly shorter) last batch.
print(f'Number of Batches (train): {len(X_train_batches)}')
print(X_train_batches[-1].shape) # last batch size

print(f'Number of Batches (test): {len(X_test_batches)}')
print(X_test_batches[-1].shape) # last batch size

Number of Batches (train): 46
(197,)
Number of Batches (test): 12
(175,)


#### Load saved Embedding

* if computed previously, load existing embeddings from local


In [None]:
# train
BERT_train_last_hidden_states = torch.load(os.path.join('BERT', 'BERT_train_last_hidden_states.pt'))
# print the tensor that was just loaded (the test tensor is not defined yet at this point)
print(BERT_train_last_hidden_states.shape)

BERT_train_pooler_outputs = torch.load(os.path.join('BERT', 'BERT_train_pooler_outputs.pt'))
print(BERT_train_pooler_outputs.shape)


# test
BERT_test_last_hidden_states = torch.load(os.path.join('BERT', 'BERT_test_last_hidden_states.pt'))
print(BERT_test_last_hidden_states.shape)

BERT_test_pooler_outputs = torch.load(os.path.join('BERT', 'BERT_test_pooler_outputs.pt'))
print(BERT_test_pooler_outputs.shape)

### Compute Embeddings

* computed in 5 chunks of up to 10 batches each (500 songs per batch; the last chunk holds only the remaining 6 batches) to avoid crashing when RAM runs out


In [11]:
# compute BERT embedding for each batch (caution: takes very long time)
def get_BERT_embeddings(data_batches, BERT_model, tokenizer):
  """Run `BERT_model` over every batch of documents and collect the raw model outputs.

  Parameters
  ----------
  data_batches : sequence
      Batches of documents; each batch must offer `.to_list()` (e.g. a pandas Series of str).
  BERT_model : callable
      Called as `BERT_model(**tokenized_batch)`.
  tokenizer : callable
      Called with the list of texts plus padding / truncation settings.

  Returns
  -------
  list
      One model output per batch, in batch order.
  """
  pretrained_BERT_output = []
  n_batches = len(data_batches)

  for count, batch in enumerate(data_batches, start=1):
    print(f'Embedding Batch number: {count}/{n_batches}')
    # Pad every document to the full max_length instead of "longest in the batch": all batches
    # then share the same sequence length, which the later torch.cat over batches requires.
    tokenized_batch = tokenizer(batch.to_list(), padding='max_length', truncation=True,
                                max_length=128, return_tensors='pt')

    with torch.no_grad(): # disable gradient calculation to improve efficiency.
      pretrained_BERT_output.append(BERT_model(**tokenized_batch))

  return pretrained_BERT_output

In [None]:
# Merge the per-batch BERT outputs into single tensors.
def concat_BERT_embeddings(BERT_output):
  """Concatenate per-batch model outputs along the batch dimension.

  Each element of `BERT_output` must provide the keys 'last_hidden_state' and
  'pooler_output'. Returns the tuple (last_hidden_states, pooler_outputs) and prints
  the shape of both tensors.
  """
  # token-level hidden states of the last layer, one block per batch
  hidden_per_batch = [batch_output['last_hidden_state'] for batch_output in BERT_output]
  # document-level pooled output returned by the model, one block per batch
  pooled_per_batch = [batch_output['pooler_output'] for batch_output in BERT_output]

  last_hidden_states = torch.cat(hidden_per_batch, dim=0)
  pooler_outputs = torch.cat(pooled_per_batch, dim=0)

  print(last_hidden_states.shape)
  print(pooler_outputs.shape)
  return (last_hidden_states, pooler_outputs)

#### Train

In [None]:
# Chunk 1: training batches 0-9.
BERT_output_train1 = get_BERT_embeddings(
    data_batches=X_train_batches[:10],
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
)
(BERT_train_last_hidden_states1,
 BERT_train_pooler_outputs1) = concat_BERT_embeddings(BERT_output_train1)

# Save both tensors of this chunk to the working directory.
torch.save(BERT_train_last_hidden_states1, 'BERT_train_last_hidden_states1.pt')
torch.save(BERT_train_pooler_outputs1, 'BERT_train_pooler_outputs1.pt')

torch.Size([5000, 128, 768])
torch.Size([5000, 768])


In [None]:
# Chunk 2: training batches 10-19.
BERT_output_train2 = get_BERT_embeddings(
    data_batches=X_train_batches[10:20],
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
)
(BERT_train_last_hidden_states2,
 BERT_train_pooler_outputs2) = concat_BERT_embeddings(BERT_output_train2)

# Save both tensors of this chunk to the working directory.
torch.save(BERT_train_last_hidden_states2, 'BERT_train_last_hidden_states2.pt')
torch.save(BERT_train_pooler_outputs2, 'BERT_train_pooler_outputs2.pt')

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])
torch.Size([5000, 768])


In [None]:
# Chunk 3: training batches 20-29.
BERT_output_train3 = get_BERT_embeddings(
    data_batches=X_train_batches[20:30],
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
)
(BERT_train_last_hidden_states3,
 BERT_train_pooler_outputs3) = concat_BERT_embeddings(BERT_output_train3)
# Save both tensors of this chunk to the working directory.
torch.save(BERT_train_last_hidden_states3, 'BERT_train_last_hidden_states3.pt')
torch.save(BERT_train_pooler_outputs3, 'BERT_train_pooler_outputs3.pt')

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])
torch.Size([5000, 768])


In [None]:
# Chunk 4: training batches 30-39.
BERT_output_train4 = get_BERT_embeddings(
    data_batches=X_train_batches[30:40],
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
)
(BERT_train_last_hidden_states4,
 BERT_train_pooler_outputs4) = concat_BERT_embeddings(BERT_output_train4)
# Save both tensors of this chunk to the working directory.
torch.save(BERT_train_last_hidden_states4, 'BERT_train_last_hidden_states4.pt')
torch.save(BERT_train_pooler_outputs4, 'BERT_train_pooler_outputs4.pt')

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])
torch.Size([5000, 768])


In [None]:
# Chunk 5: training batches 40-45 (the remaining batches).
BERT_output_train5 = get_BERT_embeddings(
    data_batches=X_train_batches[40:46],
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
)
(BERT_train_last_hidden_states5,
 BERT_train_pooler_outputs5) = concat_BERT_embeddings(BERT_output_train5)
# Save both tensors of this chunk to the working directory.
torch.save(BERT_train_last_hidden_states5, 'BERT_train_last_hidden_states5.pt')
torch.save(BERT_train_pooler_outputs5, 'BERT_train_pooler_outputs5.pt')

Embedding Batch number: 1/6
Embedding Batch number: 2/6
Embedding Batch number: 3/6
Embedding Batch number: 4/6
Embedding Batch number: 5/6
Embedding Batch number: 6/6
torch.Size([2697, 128, 768])
torch.Size([2697, 768])


#### Test

In [None]:
# Embed the complete test split in one go.
BERT_output_test = get_BERT_embeddings(data_batches = X_test_batches, BERT_model = pretrained_BERT, tokenizer = bert_tokenizer)
BERT_test_last_hidden_states, BERT_test_pooler_outputs = concat_BERT_embeddings(BERT_output_test)

# Save into the 'BERT' folder: the loading cells of this notebook read these two files from
# os.path.join('BERT', ...), so saving them to the working directory left them out of reach.
torch.save(BERT_test_last_hidden_states, os.path.join('BERT', 'BERT_test_last_hidden_states.pt'))
torch.save(BERT_test_pooler_outputs, os.path.join('BERT', 'BERT_test_pooler_outputs.pt'))

Embedding Batch number: 1/12
Embedding Batch number: 2/12
Embedding Batch number: 3/12
Embedding Batch number: 4/12
Embedding Batch number: 5/12
Embedding Batch number: 6/12
Embedding Batch number: 7/12
Embedding Batch number: 8/12
Embedding Batch number: 9/12
Embedding Batch number: 10/12
Embedding Batch number: 11/12
Embedding Batch number: 12/12


### Document Embeddings

In [None]:
# Load the saved last-layer hidden states of both splits from the 'BERT' folder.
BERT_train_last_hidden_states = torch.load(os.path.join('BERT', 'BERT_train_last_hidden_states.pt'))
BERT_test_last_hidden_states = torch.load(os.path.join('BERT', 'BERT_test_last_hidden_states.pt'))

print("BERT:", BERT_train_last_hidden_states.shape)

##### CLS

In [None]:
# CLS: keep only the hidden state at sequence position 0 of every document.
_train_cls = BERT_train_last_hidden_states[:, 0]
_test_cls = BERT_test_last_hidden_states[:, 0]

X_train_BERT_CLS = _train_cls.detach().cpu().numpy()
X_test_BERT_CLS = _test_cls.detach().cpu().numpy()

print(X_train_BERT_CLS.shape)
print(X_test_BERT_CLS.shape)

In [None]:
# Store the CLS embeddings in the 'BERT' folder.
for _npy_name, _embeddings in (('X_train_BERT_CLS.npy', X_train_BERT_CLS),
                               ('X_test_BERT_CLS.npy', X_test_BERT_CLS)):
    np.save(os.path.join('BERT', _npy_name), _embeddings)

#### Mean Pooling

In [None]:
# Mean pooling: average the hidden states over the token dimension (dim=1) of every document.
_train_mean = torch.mean(BERT_train_last_hidden_states, dim=1)
_test_mean = torch.mean(BERT_test_last_hidden_states, dim=1)

X_train_BERT_MeanPooling = _train_mean.detach().cpu().numpy()
X_test_BERT_MeanPooling = _test_mean.detach().cpu().numpy()

print(X_train_BERT_MeanPooling.shape)
print(X_test_BERT_MeanPooling.shape)

In [None]:
# Store the mean-pooled embeddings in the 'BERT' folder.
for _npy_name, _embeddings in (('X_train_BERT_MeanPooling.npy', X_train_BERT_MeanPooling),
                               ('X_test_BERT_MeanPooling.npy', X_test_BERT_MeanPooling)):
    np.save(os.path.join('BERT', _npy_name), _embeddings)

# DistilBERT

In [8]:
# Model: DistilBERT (tokenizer and model are loaded from the same checkpoint)
_distilbert_checkpoint = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(_distilbert_checkpoint)
pretrained_DistilBERT = AutoModel.from_pretrained(_distilbert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### Load saved Embedding

* load previously calculated embeddings from local

In [14]:
# Train: load the merged training tensor.
# `weights_only=True` is passed exactly once; repeating a keyword argument is a SyntaxError.
DistillBERT_train_last_hidden_states = torch.load(os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states.pt'), weights_only=True)
print(DistillBERT_train_last_hidden_states.shape)

torch.Size([22697, 128, 768])


In [17]:
# Train alternative
#   rebuild the full training tensor from the five saved chunks, one chunk at a time,
#   to avoid RAM issues

# start with chunk 1 ...
DistillBERT_train_last_hidden_states = torch.load(
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states1.pt'),
    weights_only=True,
)

# ... then append chunks 2-5, releasing each chunk as soon as it has been concatenated
for _chunk_no in range(2, 6):
    _chunk = torch.load(
        os.path.join('DistillBERT', f'DistillBERT_train_last_hidden_states{_chunk_no}.pt'),
        weights_only=True,
    )
    DistillBERT_train_last_hidden_states = torch.cat(
        (DistillBERT_train_last_hidden_states, _chunk), dim=0
    )
    del _chunk
    print(DistillBERT_train_last_hidden_states.shape)

# store the merged tensor next to the chunks
torch.save(DistillBERT_train_last_hidden_states, os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states.pt'))

torch.Size([10000, 128, 768])
torch.Size([15000, 128, 768])
torch.Size([22697, 128, 768])


In [None]:
# Test: load the saved last-layer hidden states of the test split from the 'DistillBERT' folder.
DistillBERT_test_last_hidden_states = torch.load(os.path.join('DistillBERT', 'DistillBERT_test_last_hidden_states.pt'))
print(DistillBERT_test_last_hidden_states.shape)

torch.Size([5675, 128, 768])


### Compute Embeddings

In [9]:
# concatenate DistillBERT embeddings into single tensor
  # no pooler_output is read here, only 'last_hidden_state'
def concat_DistillBERT_embeddings(BERT_output):
  """Concatenate the 'last_hidden_state' of every batch output along the batch dimension.

  Prints the shape of the concatenated tensor and returns it.
  """
  # hidden states of the last layer, one block per batch
  last_hidden_states_list = [batch_output['last_hidden_state'] for batch_output in BERT_output]

  last_hidden_states = torch.cat(last_hidden_states_list, dim=0)
  print(last_hidden_states.shape)

  return last_hidden_states

#### Train

* in batches to avoid RAM issues

In [12]:
# Chunk 1: training batches 0-9.
DistillBERT_output_train1 = get_BERT_embeddings(
    data_batches=X_train_batches[:10],
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_train_last_hidden_states1 = concat_DistillBERT_embeddings(DistillBERT_output_train1)

# Save the chunk into the 'DistillBERT' folder.
torch.save(
    DistillBERT_train_last_hidden_states1,
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states1.pt'),
)

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])


In [None]:
# Chunk 2: training batches 10-19.
DistillBERT_output_train2 = get_BERT_embeddings(
    data_batches=X_train_batches[10:20],
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_train_last_hidden_states2 = concat_DistillBERT_embeddings(DistillBERT_output_train2)

# Save the chunk into the 'DistillBERT' folder.
torch.save(
    DistillBERT_train_last_hidden_states2,
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states2.pt'),
)

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])


In [None]:
# Chunk 3: training batches 20-29.
DistillBERT_output_train3 = get_BERT_embeddings(
    data_batches=X_train_batches[20:30],
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_train_last_hidden_states3 = concat_DistillBERT_embeddings(DistillBERT_output_train3)

# Save the chunk into the 'DistillBERT' folder.
torch.save(
    DistillBERT_train_last_hidden_states3,
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states3.pt'),
)

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])


In [None]:
# Chunk 4: training batches 30-39.
DistillBERT_output_train4 = get_BERT_embeddings(
    data_batches=X_train_batches[30:40],
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_train_last_hidden_states4 = concat_DistillBERT_embeddings(DistillBERT_output_train4)

# Save the chunk into the 'DistillBERT' folder.
torch.save(
    DistillBERT_train_last_hidden_states4,
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states4.pt'),
)

Embedding Batch number: 1/10
Embedding Batch number: 2/10
Embedding Batch number: 3/10
Embedding Batch number: 4/10
Embedding Batch number: 5/10
Embedding Batch number: 6/10
Embedding Batch number: 7/10
Embedding Batch number: 8/10
Embedding Batch number: 9/10
Embedding Batch number: 10/10
torch.Size([5000, 128, 768])


In [None]:
# Chunk 5: all remaining training batches (from batch 40 on).
DistillBERT_output_train5 = get_BERT_embeddings(
    data_batches=X_train_batches[40:],
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_train_last_hidden_states5 = concat_DistillBERT_embeddings(DistillBERT_output_train5)

# Save the chunk into the 'DistillBERT' folder.
torch.save(
    DistillBERT_train_last_hidden_states5,
    os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states5.pt'),
)

Embedding Batch number: 1/6
Embedding Batch number: 2/6
Embedding Batch number: 3/6
Embedding Batch number: 4/6
Embedding Batch number: 5/6
Embedding Batch number: 6/6
torch.Size([2697, 128, 768])


#### Test

In [None]:
# Embed the complete test split in one go.
DistillBERT_output_test = get_BERT_embeddings(
    data_batches=X_test_batches,
    BERT_model=pretrained_DistilBERT,
    tokenizer=distilbert_tokenizer,
)
DistillBERT_test_last_hidden_states = concat_DistillBERT_embeddings(DistillBERT_output_test)

# Save the test tensor into the 'DistillBERT' folder.
torch.save(
    DistillBERT_test_last_hidden_states,
    os.path.join('DistillBERT', 'DistillBERT_test_last_hidden_states.pt'),
)

Embedding Batch number: 1/12
Embedding Batch number: 2/12
Embedding Batch number: 3/12
Embedding Batch number: 4/12
Embedding Batch number: 5/12
Embedding Batch number: 6/12
Embedding Batch number: 7/12
Embedding Batch number: 8/12
Embedding Batch number: 9/12
Embedding Batch number: 10/12
Embedding Batch number: 11/12
Embedding Batch number: 12/12
torch.Size([5675, 128, 768])


### Document Embeddings

In [None]:
# Load the saved last-layer hidden states of both splits from the 'DistillBERT' folder.
DistillBERT_train_last_hidden_states = torch.load(os.path.join('DistillBERT', 'DistillBERT_train_last_hidden_states.pt'))
DistillBERT_test_last_hidden_states = torch.load(os.path.join('DistillBERT', 'DistillBERT_test_last_hidden_states.pt'))
print(DistillBERT_train_last_hidden_states.shape)
print(DistillBERT_test_last_hidden_states.shape)

#### CLS

In [None]:
# CLS: keep only the hidden state at sequence position 0 of every document.
_train_cls = DistillBERT_train_last_hidden_states[:, 0]
_test_cls = DistillBERT_test_last_hidden_states[:, 0]

X_train_DistillBERT_CLS = _train_cls.detach().cpu().numpy()
X_test_DistillBERT_CLS = _test_cls.detach().cpu().numpy()

print(X_train_DistillBERT_CLS.shape)
print(X_test_DistillBERT_CLS.shape)

In [None]:
# Store the CLS embeddings in the 'DistillBERT' folder.
for _npy_name, _embeddings in (('X_train_DistillBERT_CLS.npy', X_train_DistillBERT_CLS),
                               ('X_test_DistillBERT_CLS.npy', X_test_DistillBERT_CLS)):
    np.save(os.path.join('DistillBERT', _npy_name), _embeddings)

#### Mean Pooling

In [None]:
# Mean pooling: average the hidden states over the token dimension (dim=1) of every document.
_train_mean = torch.mean(DistillBERT_train_last_hidden_states, dim=1)
_test_mean = torch.mean(DistillBERT_test_last_hidden_states, dim=1)

X_train_DistillBERT_MeanPooling = _train_mean.detach().cpu().numpy()
X_test_DistillBERT_MeanPooling = _test_mean.detach().cpu().numpy()

print(X_train_DistillBERT_MeanPooling.shape)
print(X_test_DistillBERT_MeanPooling.shape)

In [None]:
# Store the mean-pooled embeddings in the 'DistillBERT' folder.
for _npy_name, _embeddings in (
        ('X_train_DistillBERT_MeanPooling.npy', X_train_DistillBERT_MeanPooling),
        ('X_test_DistillBERT_MeanPooling.npy', X_test_DistillBERT_MeanPooling)):
    np.save(os.path.join('DistillBERT', _npy_name), _embeddings)