In [1]:
# Mount Google Drive inside the Colab runtime so the data folder becomes reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [2]:
cd '/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data'

/content/drive/MyDrive/Studium/03 UC3M/Thesis/Data


In [3]:
import pandas as pd
import numpy as np

import pickle
import os
from transformers import AutoTokenizer, AutoModel
import torch
import time

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Data Augmentation


## Load from local

* If the data has already been augmented, load it from the local file instead of re-running the augmentation.

In [4]:
# load augmented data from local file
# NOTE: unpickling can execute arbitrary code -- only read pickle files you created yourself.
augmented_data_df = pd.read_pickle('Data Augmentation nlpaug/augmented_data_df.pkl')

# split into train and test
# Texts and both targets go through ONE call so the three training partitions
# always contain the same rows; the 20 % "test" part of each is discarded (`_`).
(X_train_augmented, _,
 y_train_valence_augmented, _,
 y_train_arousal_augmented, _) = train_test_split(
    augmented_data_df['X'],
    augmented_data_df['y_valence'],
    augmented_data_df['y_arousal'],
    test_size=0.2,
    random_state=18,
)
print(X_train_augmented.shape)
print(y_train_valence_augmented.shape)
print(y_train_arousal_augmented.shape)  # previously printed the valence shape a second time


# load testing partition (non-augmented)
# NOTE: allow_pickle=True also unpickles object arrays -- trusted files only.
X_test = np.load('X_test.npy', allow_pickle=True)
y_test_valence = np.load('y_test_valence.npy', allow_pickle=True)
y_test_arousal = np.load('y_test_arousal.npy', allow_pickle=True)
print(X_test.shape)
print(y_test_valence.shape)
print(y_test_arousal.shape)

(108945,)
(108945,)
(108945,)
(5675,)
(5675,)
(5675,)


In [5]:
# Preview the first five rows of the augmented data frame.
augmented_data_df.head(n=5)

Unnamed: 0,X,y_valence,y_arousal
0,think stand strong feel abeknOeath feet happin...,0.084913,0.219195
1,sail away stand miss face feel like drownin ti...,0.838211,0.79679
2,yesterday life fill rrai0n 5s_mile ease pqapin...,0.299258,0.294272
3,dark night small hours uncertain aznQxioEus ne...,0.167354,0.643633
4,name like lady Kmyst6ic smile cause lonely nam...,0.882523,0.626615


In [6]:
# Write the augmented training partition (texts plus both targets) to local .npy files.
subfolder_path = 'Data Augmentation nlpaug'

train_arrays = {
    'X_train_augmented.npy': X_train_augmented,                  # X
    'y_train_valence_augmented.npy': y_train_valence_augmented,  # y
    'y_train_arousal_augmented.npy': y_train_arousal_augmented,  # y
}
for file_name, array in train_arrays.items():
    np.save(os.path.join(subfolder_path, file_name), array)

## Augment Data

In [8]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/410.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [9]:
# Original (non-augmented) partitions.
# NOTE: allow_pickle=True unpickles object arrays -- only load files you created yourself.
X_train = np.load('X_train.npy', allow_pickle=True)
X_test = np.load('X_test.npy', allow_pickle=True)
y_train_valence = np.load('y_train_valence.npy', allow_pickle=True)
y_train_arousal = np.load('y_train_arousal.npy', allow_pickle=True)

X_list = list(X_train)

# character augmenters
import nlpaug.augmenter.char as nac
# word augmenters
import nlpaug.augmenter.word as naw

# One augmented copy of the training texts is produced per entry in this list.
# NOTE(review): no random seed is set in this cell, so the augmented texts
# presumably differ between runs -- confirm and seed if reproducibility matters.
augmenters = [
    nac.RandomCharAug(action="insert"),
    nac.RandomCharAug(action="substitute"),
    naw.RandomWordAug(action="swap"),
    naw.RandomWordAug(action="delete"),
    naw.WordEmbsAug(model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin', action="insert")
]

Xy_augmented = []   # augmented texts (followed by the originals, appended last)
y_valence = []      # valence targets, repeated once per copy of the texts
y_arousal = []      # arousal targets, repeated once per copy of the texts

# apply augmenter and collect augmented data
for aug in augmenters:
    print(f'Working on: {aug}')
    augmented_texts = aug.augment(X_list)
    # Fail fast: the targets are appended position by position, so every copy
    # must have exactly one text per original text. Without this check a
    # mismatch would only surface when the DataFrame is built, after all
    # augmenters have already been run.
    if len(augmented_texts) != len(X_list):
        raise ValueError(
            f'{aug} returned {len(augmented_texts)} texts for {len(X_list)} inputs; '
            'texts and targets would no longer be aligned'
        )
    Xy_augmented.extend(augmented_texts)
    y_valence.extend(y_train_valence)
    y_arousal.extend(y_train_arousal)

# include the original training data
Xy_augmented.extend(X_list)
y_valence.extend(y_train_valence)
y_arousal.extend(y_train_arousal)


Xy_train_augmented = pd.DataFrame({
    'X': Xy_augmented,
    'y_valence': y_valence,
    'y_arousal': y_arousal
})

# save augmented data to local
Xy_train_augmented.to_pickle(os.path.join('Data Augmentation nlpaug', 'Xy_train_augmented.pkl'))
Xy_train_augmented.shape

Working on: Name:RandomChar_Aug, Action:insert, Method:char
Working on: Name:RandomChar_Aug, Action:substitute, Method:char
Working on: Name:RandomWord_Aug, Action:swap, Method:word
Working on: Name:RandomWord_Aug, Action:delete, Method:word
Working on: Name:WordEmbs_Aug, Action:insert, Method:word


(136182, 3)

In [10]:
# Pull the text column and the two target columns out of the augmented frame.
X_train_augmented, y_train_valence_augmented, y_train_arousal_augmented = (
    Xy_train_augmented[column] for column in ('X', 'y_valence', 'y_arousal')
)

# Store each column as its own .npy file in the augmentation subfolder.
subfolder_path = 'Data Augmentation nlpaug'
for file_name, column_values in (
    ('X_train_augmented_NEW.npy', X_train_augmented),
    ('y_train_valence_augmented_NEW.npy', y_train_valence_augmented),
    ('y_train_arousal_augmented_NEW.npy', y_train_arousal_augmented),
):
    np.save(os.path.join(subfolder_path, file_name), column_values)

# Feature Extraction

## TF-IDF

In [None]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# English stop words are removed, terms with a document frequency above 85 % or
# below 1 % are ignored, and the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', max_df=0.85, min_df=0.01)

# The vocabulary is fitted on the training texts only and then re-used for the test texts.
X_train_tfidf_augmented = tfidf_vectorizer.fit_transform(X_train_augmented)
X_test_tfidf = tfidf_vectorizer.transform(X_test) # also apply to testing partition to assure same dimensionality of train and test
print(X_train_tfidf_augmented.shape)
print(X_test_tfidf.shape)

# save embeddings to local file (sparse matrices -> .npz)
sparse.save_npz(os.path.join('Data Augmentation nlpaug', 'X_train_tfidf_augmented_NEW.npz'), X_train_tfidf_augmented)
sparse.save_npz(os.path.join('Data Augmentation nlpaug', 'X_test_tfidf_NEW.npz'), X_test_tfidf)

(68091, 641)
(5675, 641)


## Word2Vec
* pre-trained, Mean Pooling

In [None]:
# Word2Vec (pre-trained, Mean Pooling)
import gensim
from gensim.models import Word2Vec

# Tokenize text
def preprocess(text):
    """Return the token list that gensim's ``simple_preprocess`` produces for ``text``."""
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

# One token list per augmented training text, in the original order.
X_train_tokenized_augmented = list(map(preprocess, X_train_augmented))

from gensim.models import KeyedVectors

# Read the pre-trained Word2Vec vectors from the local binary word2vec-format file.
pretrained_model_path = 'GoogleNews-vectors-negative300.bin'
pretrained_word2vec = KeyedVectors.load_word2vec_format(
    fname=pretrained_model_path,
    binary=True,
)

def get_document_vector(doc, embeddings=None):
    """Mean-pool the word vectors of one tokenized document.

    Args:
        doc: Iterable of tokens.
        embeddings: Object supporting ``token in embeddings``,
            ``embeddings[token]`` and a ``vector_size`` attribute. Defaults to
            the notebook-level ``pretrained_word2vec`` so existing
            one-argument calls keep working.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``embeddings``, or a zero vector of length ``embeddings.vector_size``
        when no token is found (including an empty document).
    """
    if embeddings is None:
        embeddings = pretrained_word2vec
    # get embedding for each in-vocabulary word; unknown words are skipped
    word_vectors = [embeddings[word] for word in doc if word in embeddings]
    if not word_vectors:
        return np.zeros(embeddings.vector_size)
    # get mean over all words
    return np.mean(word_vectors, axis=0)

# Transform documents to vectors (one mean-pooled vector per tokenized text)
X_train_Word2Vec_pretrained_augmented = np.array([get_document_vector(doc) for doc in X_train_tokenized_augmented])
# A bare `.shape` is only displayed when it is the last expression of a cell,
# so print it explicitly here.
print(X_train_Word2Vec_pretrained_augmented.shape)


# save to local
np.save(os.path.join('Data Augmentation nlpaug', 'X_train_Word2Vec_pretrained_augmented_NEW.npy'), X_train_Word2Vec_pretrained_augmented)

## GloVe

* pre-trained

In [None]:
# GloVe (pre-trained)
! pip install glove-python3

from glove import Glove, Corpus

# load dictionary from local pickle file (created in original notebook)
with open('pretrained_GloVe_dict.pkl', 'rb') as file:
    glove_embeddings = pickle.load(file)

len(glove_embeddings)
if 'party' in glove_embeddings:
  print(glove_embeddings['party'].shape)


# compute embeddings

def get_word_vectors(song, embeddings=None):
    """Look up the embedding of every in-vocabulary token of one tokenized song.

    Args:
        song: Iterable of tokens.
        embeddings: Mapping from token to vector. Defaults to the
            notebook-level ``glove_embeddings`` so existing one-argument calls
            keep working.

    Returns:
        ``np.ndarray`` built from the vectors of the tokens found in
        ``embeddings``, in token order; tokens not in the mapping are skipped.
        The array has length 0 when no token is found.
    """
    if embeddings is None:
        embeddings = glove_embeddings
    return np.array([embeddings[token] for token in song if token in embeddings])

# Word-level embeddings: one array of word vectors per tokenized song.
X_train_GloVe_pretrained_big = list(map(get_word_vectors, X_train_tokenized_augmented))

# Song-level embedding: mean pooling over all word vectors of a song.
# A song with no in-vocabulary word (length 0) gets a 300-dimensional zero vector.
song_vectors = []
for word_matrix in X_train_GloVe_pretrained_big:
    if len(word_matrix) > 0:
        song_vectors.append(np.mean(word_matrix, axis=0))
    else:
        song_vectors.append(np.zeros(300))
X_train_GloVe_pretrained_augmented = np.array(song_vectors)

print(X_train_GloVe_pretrained_augmented.shape)


# Persist the song-level embeddings locally.
glove_output_path = os.path.join('Data Augmentation nlpaug', 'X_train_GloVe_pretrained_augmented_NEW.npy')
np.save(glove_output_path, X_train_GloVe_pretrained_augmented)

Collecting glove-python3
  Downloading glove_python3-0.1.0.tar.gz (326 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/327.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/327.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: glove-python3
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python3: filename=glove_python3-0.1.0-cp310-cp310-linux_x86_64.whl size=1059021 sha256=1327fae510426f8a96e95fa10aa6eb9cb5a9843a0a4d3b9e3a1be26c62db057d
  Stored in directory: /root/.cache/pip/wheels/fe/2f/79/34314d44a0907e90e323c8c182ec23f126eb460829e02d98cf
Successfully built glove-python3
Installing collected packages: glove-python3
Succe

## BERT

* only Pooler Outputs

In [11]:
# load from local
# The file holds a single tensor (written with torch.save further down), so
# weights_only=True is sufficient and avoids unpickling arbitrary objects.
# The former f-string prefix was dropped: the path has no placeholders.
pooler_outputs = torch.load('./Data Augmentation nlpaug/pooler_outputs.pt', weights_only=True)
pooler_outputs.shape

  pooler_outputs = torch.load(f'./Data Augmentation nlpaug/pooler_outputs.pt')


torch.Size([136182, 768])

In [5]:
# !pip install torch
from transformers import AutoTokenizer, AutoModel
import torch

# Model: BERT-base
# Tokenizer and model are loaded from the same checkpoint name.
BERT_CHECKPOINT = "sentence-transformers/bert-base-nli-mean-tokens"
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
pretrained_BERT = AutoModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    """Minimal map-style dataset that exposes a sequence of texts by index."""

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]


def get_BERT_embeddings(X_, BERT_model, tokenizer, batch_size=32):
    """Compute the pooler output of ``BERT_model`` for every text in ``X_``.

    Args:
        X_: Indexable sequence of texts (supports ``len`` and ``X_[i]``).
        BERT_model: Model whose forward call accepts the tokenizer output as
            keyword arguments and returns an object with a ``pooler_output``
            tensor. It is moved to the GPU when one is available and switched
            to eval mode.
        tokenizer: Callable that tokenizes a list of texts into tensors and
            whose result offers ``.to(device)``.
        batch_size: Number of texts per forward pass.

    Returns:
        CPU tensor with one row per input text, in input order. For empty
        input, a tensor with 0 rows is returned (its width is
        ``BERT_model.config.hidden_size`` when that attribute exists, else 0).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    BERT_model.to(device)
    BERT_model.eval()  # inference only: make sure dropout etc. are disabled

    # torch.cat raises on an empty list, so handle "no texts" explicitly.
    if len(X_) == 0:
        hidden_size = getattr(getattr(BERT_model, 'config', None), 'hidden_size', 0)
        return torch.empty((0, hidden_size))

    dataset = TextDataset(X_)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)  # keep input order

    pretrained_BERT_output = []

    # process batches separately
    with torch.no_grad():
        for batch in dataloader:
            tokenized_batch = tokenizer(batch, padding=True, truncation=True,
                                        max_length=128, return_tensors='pt').to(device)
            outputs = BERT_model(**tokenized_batch)
            # only the pooler output; move it to the CPU right away so the
            # results of all batches do not pile up in GPU memory
            pretrained_BERT_output.append(outputs.pooler_output.cpu())

    return torch.cat(pretrained_BERT_output)


In [None]:
# Embed every augmented training text with the pre-trained BERT model.
augmented_texts_for_bert = X_train_augmented.tolist()
pooler_outputs = get_BERT_embeddings(
    X_=augmented_texts_for_bert,
    BERT_model=pretrained_BERT,
    tokenizer=bert_tokenizer,
    batch_size=32,
)

In [9]:
# Convert the pooler outputs to a NumPy array for the downstream models.
X_train_BERT_augmented = pooler_outputs.numpy()
print(X_train_BERT_augmented.shape)

# Define the output folder in this cell instead of relying on a variable left
# over from an earlier cell, and use it for both files.
subfolder_path = 'Data Augmentation nlpaug'
np.save(os.path.join(subfolder_path, 'X_train_BERT_augmented.npy'), X_train_BERT_augmented)
# The raw tensor is stored as well so it can be reloaded without recomputing the embeddings.
torch.save(pooler_outputs, os.path.join(subfolder_path, 'pooler_outputs.pt'))

(108945, 768)
