# Setup

In [1]:
import pandas as pd
import numpy as np
import time
import pickle
import os
from transformers import AutoTokenizer, AutoModel
import torch
from scipy import sparse

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
np.int = int
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import KFold, cross_val_score
cross_val = KFold(n_splits=5, shuffle = True, random_state=18)

In [3]:
# local
os.chdir('G:/Meine Ablage/Studium/03 UC3M/Thesis/Data')
subfolder = 'Data Augmentation nlpaug'

# Load Data

In [4]:
##################################################################################
# y_test (not augmented!!)

y_test_valence = np.load('y_test_valence.npy')
y_test_arousal = np.load('y_test_arousal.npy')
print(y_test_valence.shape)
print(y_test_arousal.shape)

##################################################################################
# y_train
y_train_valence_augmented = np.load(os.path.join(subfolder, 'y_train_valence_augmented.npy'))
y_train_arousal_augmented = np.load(os.path.join(subfolder, 'y_train_arousal_augmented.npy'))
print(y_train_valence_augmented.shape)
print(y_train_arousal_augmented.shape)


##################################################################################
# TF-IDF
X_train_tfidf_augmented = sparse.load_npz(os.path.join(subfolder, 'X_train_tfidf_augmented.npz')).toarray()
X_test_tfidf = sparse.load_npz(os.path.join(subfolder, 'X_test_tfidf.npz')).toarray()
print("TF-IDF augmented:", X_train_tfidf_augmented.shape, "& ", X_test_tfidf.shape)


##################################################################################
# Word2Vec pretrained
X_train_Word2Vec_pretrained_augmented = np.load(os.path.join(subfolder, 'X_train_Word2Vec_pretrained_augmented.npy'))
X_test_Word2Vec_pretrained = np.load(os.path.join('Word2Vec', 'X_test_Word2Vec_pretrained.npy'))
print("Word2Vec pretrained augmented:", X_train_Word2Vec_pretrained_augmented.shape, "& ", X_test_Word2Vec_pretrained.shape)


##################################################################################
# GloVE pretrained
X_train_GloVe_pretrained_augmented = np.load(os.path.join(subfolder, 'X_train_GloVe_pretrained_augmented.npy'))
X_test_GloVe_pretrained = np.load(os.path.join('GloVe', 'X_test_GloVe_pretrained.npy'))
print("GloVe pretrained augmented:", X_train_GloVe_pretrained_augmented.shape,  "& ", X_test_GloVe_pretrained.shape)

##################################################################################
# BERT
X_train_BERT_augmented = np.load(os.path.join(subfolder, 'X_train_BERT_augmented.npy'))
X_test_BERT = torch.load(os.path.join('BERT', 'BERT_test_pooler_outputs.pt')).numpy()
print("BERT pretrained augmented:", X_train_BERT_augmented.shape, "& ", X_test_BERT.shape)

(5675,)
(5675,)
(108945,)
(108945,)
TF-IDF augmented: (108945, 635) &  (5675, 635)
Word2Vec pretrained augmented: (108945, 300) &  (5675, 300)
GloVe pretrained augmented: (108945, 300) &  (5675, 300)
BERT pretrained augmented: (108945, 768) &  (5675, 768)


# Linear Regression

In [9]:
from sklearn.linear_model import SGDRegressor
import warnings

def SGDR_Regressor(X_train_, y_train_, X_test_, y_test_, param_grid_):
    t0 = time.time()
    
    param_grid = param_grid_
    
    warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')
    SGDR_opt = BayesSearchCV(
      SGDRegressor(random_state=18),
      search_spaces=param_grid_, n_iter=50, cv=cross_val, scoring='neg_mean_squared_error', n_jobs=4, verbose=False, random_state=18)
    SGDR_opt.fit(X_train_, y_train_)
    
    t1 = time.time()-t0
    print(f'Duration: {round(t1,2)} s')

    print(f'{SGDR_opt.best_params_=}')
    print(f'{SGDR_opt.best_score_=}')

    # predict test
    predictions = SGDR_opt.best_estimator_.predict(X_test_)
    MSE_test = mean_squared_error(y_test_, predictions)
    print(f'MSE Test: {round(MSE_test,4)}')
    
    return SGDR_opt, predictions, MSE_test

In [10]:
param_grid = {
    'max_iter': Integer(1000,10000),
    'tol': [1e-6],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': Real(1e-5, 1e0), 
}

In [8]:
##################################################################################
# TF-IDF
##################################################################################

# Valence
model_SGDR_valence_tfidf_augmented, predictions_SGDR_valence_tfidf_augmented, MSE_SGDR_valence_tfidf_augmented = SGDR_Regressor(
    X_train_tfidf_augmented, y_train_valence_augmented, X_test_tfidf, y_test_valence, param_grid)

print('---------------------------------------------------------')

# arousal
model_SGDR_arousal_tfidf_augmented, predictions_SGDR_arousal_tfidf_augmented, MSE_SGDR_arousal_tfidf_augmented = SGDR_Regressor(
    X_train_tfidf_augmented, y_train_arousal_augmented, X_test_tfidf, y_test_arousal, param_grid)

Duration: 725.23 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 8436), ('penalty', 'l2'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.05562443534995691
MSE Test: 0.0578
---------------------------------------------------------
Duration: 737.09 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 8436), ('penalty', 'l2'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.04730765979995083
MSE Test: 0.0482


In [9]:
##################################################################################
# Word2Vec pretrained
##################################################################################

# valence
model_SGDR_valence_Word2Vec_pretrained_augmented, predictions_SGDR_valence_Word2Vec_pretrained_augmented, MSE_SGDR_valence_Word2Vec_pretrained_augmented = SGDR_Regressor(
    X_train_Word2Vec_pretrained_augmented, y_train_valence_augmented, X_test_Word2Vec_pretrained, y_test_valence, param_grid)

print('---------------------------------------------------------')

# arousal
model_SGDR_arousal_Word2Vec_pretrained_augmented, predictions_SGDR_arousal_Word2Vec_pretrained_augmented, MSE_SGDR_arousal_Word2Vec_pretrained_augmented = SGDR_Regressor(
    X_train_Word2Vec_pretrained_augmented, y_train_arousal_augmented, X_test_Word2Vec_pretrained, y_test_arousal, param_grid)

Duration: 506.87 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 8437), ('penalty', 'l2'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.056751446851602494
MSE Test: 0.0578
---------------------------------------------------------
Duration: 620.8 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 8436), ('penalty', 'l2'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.048492466346989754
MSE Test: 0.0481


In [10]:
##################################################################################
# GloVe pretrained
##################################################################################

# valence
model_SGDR_valence_GloVe_pretrained_augmented, predictions_SGDR_valence_GloVe_pretrained_augmented, MSE_SGDR_valence_GloVe_pretrained_augmented = SGDR_Regressor(
    X_train_GloVe_pretrained_augmented, y_train_valence_augmented, X_test_GloVe_pretrained, y_test_valence, param_grid)

print('---------------------------------------------------------')

# arousal
model_SGDR_arousal_GloVe_pretrained_augmented, predictions_SGDR_arousal_GloVe_pretrained_augmented, MSE_SGDR_arousal_GloVe_pretrained_augmented = SGDR_Regressor(
    X_train_GloVe_pretrained_augmented, y_train_arousal_augmented, X_test_GloVe_pretrained, y_test_arousal, param_grid)

Duration: 466.21 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 8435), ('penalty', 'l2'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.05636248443668407
MSE Test: 0.0577
---------------------------------------------------------
Duration: 447.84 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 10000), ('penalty', 'elasticnet'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.04787488795489142
MSE Test: 0.0478


In [12]:
##################################################################################
# BERT
##################################################################################

# valence
model_SGDR_valence_BERT_augmented, predictions_SGDR_valence_BERT_augmented, MSE_SGDR_valence_BERT_augmented = SGDR_Regressor(
    X_train_BERT_augmented, y_train_valence_augmented, X_test_BERT, y_test_valence, param_grid)

print('---------------------------------------------------------')

# arousal
model_SGDR_arousal_BERT_augmented, predictions_SGDR_arousal_BERT_augmented, MSE_SGDR_arousal_BERT_augmented = SGDR_Regressor(
    X_train_BERT_augmented, y_train_arousal_augmented, X_test_BERT, y_test_arousal, param_grid)

Duration: 1504.45 s
SGDR_opt.best_params_=OrderedDict([('alpha', 0.00012358063208553572), ('max_iter', 8811), ('penalty', 'elasticnet'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.05610375195049999
MSE Test: 0.0579
---------------------------------------------------------
Duration: 1586.14 s
SGDR_opt.best_params_=OrderedDict([('alpha', 1e-05), ('max_iter', 1000), ('penalty', 'elasticnet'), ('tol', 1e-06)])
SGDR_opt.best_score_=-0.04684045299839301
MSE Test: 0.0459


# Feedforward Neural Network

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scipy.sparse import issparse
import random

def NN2(X_train_, y_train_, X_test_, y_test_, epochs_=2):
    set_seeds(18)

    if issparse(X_train_):
        X_train_ = X_train_.toarray()
    if issparse(X_test_):
        X_test_ = X_test_.toarray()
    
    # holdout
    X_train_NN, X_val_NN, y_train_NN, y_val_NN = train_test_split(X_train_, y_train_, test_size=0.2, random_state=18)

    dropout_ = 0.25
    # build NN model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train_NN.shape[1],)))
    model.add(Dropout(dropout_))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(dropout_))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))  # Output layer for regression

    # compile 
    model.compile(optimizer= tf.compat.v1.train.AdamOptimizer(), loss='mean_squared_error')
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # train NN
    model.fit(X_train_NN, y_train_NN, epochs=epochs_, batch_size=32,
              validation_data=(X_val_NN, y_val_NN))
    
    # evaluate model on test set    
    predictions = model.predict(X_test_)
    MSE_test = mean_squared_error(y_test_, predictions)
    print(f'MSE Test: {round(MSE_test,4)}')

    return model, predictions, MSE_test

def set_seeds(seed=18):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)




In [18]:
##################################################################################
# TF-IDF
##################################################################################

# Valence
model_NN2_valence_tfidf_augmented, predictions_NN2_valence_tfidf_augmented, MSE_NN2_valence_tfidf_augmented = NN2(
    X_train_tfidf_augmented, y_train_valence_augmented, X_test_tfidf, y_test_valence,
    epochs_=20)

print('---------------------------------------------------------')

# arousal
model_NN2_arousal_tfidf_augmented, predictions_NN2_arousal_tfidf_augmented, MSE_NN2_arousal_tfidf_augmented = NN2(
    X_train_tfidf_augmented, y_train_arousal_augmented, X_test_tfidf, y_test_arousal,
    epochs_=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0723
---------------------------------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0557


In [19]:
##################################################################################
# Word2Vec, Pretrained
##################################################################################

# Valence
model_NN2_valence_Word2Vec_pretrained_augmented, predictions_NN2_valence_Word2Vec_pretrained_augmented, MSE_NN2_valence_Word2Vec_pretrained_augmented = NN2(
    X_train_Word2Vec_pretrained_augmented, y_train_valence_augmented, X_test_Word2Vec_pretrained, y_test_valence,
    epochs_=20)

print('---------------------------------------------------------')

# arousal
model_NN2_arousal_Word2Vec_pretrained_augmented, predictions_NN2_arousal_Word2Vec_pretrained_augmented, MSE_NN2_arousal_Word2Vec_pretrained_augmented = NN2(
    X_train_Word2Vec_pretrained_augmented, y_train_arousal_augmented, X_test_Word2Vec_pretrained, y_test_arousal,
    epochs_=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0625
---------------------------------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0529


In [17]:
##################################################################################
# GloVe pretrained
##################################################################################

# valence
model_NN2_valence_GloVe_pretrained_augmented, predictions_NN2_valence_GloVe_pretrained_augmented, MSE_NN2_valence_GloVe_pretrained_augmented = NN2(
    X_train_GloVe_pretrained_augmented, y_train_valence_augmented, X_test_GloVe_pretrained, y_test_valence,
    epochs_=20)

print('---------------------------------------------------------')

# arousal
model_NN2_arousal_GloVe_pretrained_augmented, predictions_NN2_arousal_GloVe_pretrained_augmented, MSE_NN2_arousal_GloVe_pretrained_augmented = NN2(
    X_train_GloVe_pretrained_augmented, y_train_arousal_augmented, X_test_GloVe_pretrained, y_test_arousal,
    epochs_=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0647
---------------------------------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0534


In [16]:
##################################################################################
# BERT
##################################################################################

# valence
model_NN2_valence_BERT_augmented, predictions_NN2_valence_BERT_augmented, MSE_NN2_valence_BERT_augmented = NN2(
    X_train_BERT_augmented, y_train_valence_augmented, X_test_BERT, y_test_valence,
    epochs_=20)

print('---------------------------------------------------------')

# arousal
model_NN2_arousal_BERT_augmented, predictions_NN2_arousal_BERT_augmented, MSE_NN2_arousal_BERT_augmented = NN2(
    X_train_BERT_augmented, y_train_arousal_augmented, X_test_BERT, y_test_arousal,
    epochs_=20)



Epoch 1/20

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0584
---------------------------------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE Test: 0.0521
