In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook

In [None]:
import numpy as np 
from bert_serving.client import BertClient
bc = BertClient()

# Generate sentence representations with Bert-as-Service.
# NOTE(review): `text_all` is not defined in this cell; it must already exist in the
# kernel as a sequence of documents whose sentences are separated by "\n".
text_all_embs = []
sentence_len = []
for i, document in enumerate(text_all):
    text = document.split("\n")
    sentence_len.append(len(text))
    # One embedding row per sentence of the document.
    text_embs = bc.encode(text)
    # Optional (disabled): append the past-volatility features to every sentence vector.
    #text_embs = np.concatenate((text_embs,np.array([np.array(past_volatility_all[i])]*len(text_embs))),axis=1)
    text_all_embs.append(text_embs)

if not text_all_embs:
    raise ValueError("text_all is empty: there is nothing to embed or pad")

# Padding
# The embedding width is read from the encoder output instead of being hard-coded,
# so the cell works for any served model (e.g. 768 or 1024 dimensions).
dim = text_all_embs[0].shape[1]
max_sentences = max(len(embs) for embs in text_all_embs)

# b[k] holds document k's sentence embeddings, zero-padded up to the longest document.
b = np.zeros([len(text_all_embs), max_sentences, dim])
for i, embs in enumerate(text_all_embs):
    b[i, :len(embs), :] = embs

In [None]:
import sys
# Add the ptdraft folder path to the sys.path list
# NOTE(review): `package-path` is a template placeholder, not valid Python as written;
# replace it with the path (a string) of the directory holding the local `transformer` package.
sys.path.append(package-path)
import transformer
# The Hugging Face library is a different package from the local `transformer` one and
# must be imported under its own name; the lines below reference `transformers`.
import transformers
import torch

# Pretrained RoBERTa-large encoder and its matching tokenizer.
model = transformers.RobertaModel.from_pretrained('roberta-large')
tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')

def emb_str(text='inputs here'):
    """
    Embed a piece of text with the module-level RoBERTa `model` / `tokenizer`.

    :param text: the text to embed; it is passed through ``str()`` first. The default
        keeps the behaviour of the original zero-argument call.
    :return: numpy array with the model's first output (the last hidden states) for the
        single-item batch built from `text`, moved to CPU.
    """
    # No special tokens are added, so only the text's own tokens are encoded.
    token_ids = tokenizer.encode(str(text), add_special_tokens=False)
    input_ids = torch.tensor([token_ids])  # wrap in a list to form a batch of one

    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

    return last_hidden_states.cpu().numpy()

In [None]:
# Audio features for the two experimental settings.
# NOTE(review): AUDIO_DATA / AUDIO_DATA_THREE_DAYS are placeholders that are not defined
# in this notebook; bind them to the real data first. Later cells read `.shape[0]` and
# index the objects by sample, so array-like inputs are assumed -- TODO confirm.
audio_feature = AUDIO_DATA
audio_feature_3 = AUDIO_DATA_THREE_DAYS

In [None]:
# Sentence-embedding arrays for the two experimental settings, loaded with np.load.
# NOTE(review): the file names are placeholders; replace them with the real paths of the
# saved embedding arrays.
text_feature = np.load('SENTENCE_EMBEDDING_DATA')
text_feature_3 = np.load('SENTENCE_EMBEDDING_DATA_THREE_DAYS')

In [None]:
# Fuse text and audio per sample: the two feature matrices of sample k are joined along
# axis 1, so both must have the same number of rows.
text_audio_embs = []
for sample_idx in range(audio_feature.shape[0]):
    audio_part = np.array(audio_feature[sample_idx])
    text_audio_embs.append(np.concatenate((text_feature[sample_idx], audio_part), axis=1))

In [None]:
# Same fusion as for the main data set, applied to the three-day features: text and audio
# matrices of each sample are joined along axis 1.
text_audio_embs_3 = []
for sample_idx in range(audio_feature_3.shape[0]):
    audio_part = np.array(audio_feature_3[sample_idx])
    text_audio_embs_3.append(np.concatenate((text_feature_3[sample_idx], audio_part), axis=1))

# Experiments Using GPU

## Find the optimal alpha for connecting two tasks

#### 3 days -- validation set

In [8]:
import torch, os

def mask_(matrices, maskval=0.0, mask_diagonal=True):
    """
    Overwrite the upper triangle of every matrix in a batch, in place.

    Entry (i, j) (row i, column j) is set to `maskval` when i <= j; if
    `mask_diagonal` is false the diagonal is kept and only entries with i < j are set.

    :param matrices: 3-dimensional tensor (batch, rows, columns); modified in place.
    :param maskval: value written into the masked positions.
    :param mask_diagonal: whether the diagonal itself is masked as well.
    :return: None
    """
    _, rows, cols = matrices.size()

    # offset 0 starts at the diagonal, offset 1 starts just above it
    first_diagonal = 0 if mask_diagonal else 1
    row_idx, col_idx = torch.triu_indices(rows, cols, offset=first_diagonal)

    matrices[:, row_idx, col_idx] = maskval

def d(tensor=None):
    """
    Return a device string, 'cuda' or 'cpu'.

    :param tensor: optional tensor. When given, the string reflects where that tensor
        lives; when omitted, it reflects whether CUDA is available on this machine.
    :return: 'cuda' or 'cpu'
    """
    on_gpu = torch.cuda.is_available() if tensor is None else tensor.is_cuda
    return 'cuda' if on_gpu else 'cpu'

def here(subpath=None):
    """
    Return an absolute path two directory levels above this code's location.

    The location is the directory of `__file__` when that name exists. Inside a notebook
    kernel `__file__` is not defined, so the current working directory is used instead of
    failing with a NameError.

    :param subpath: optional relative path appended below the resolved root.
    :return: absolute, normalised path of the root, or of `subpath` inside it.
    """
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        # Running interactively (e.g. Jupyter): there is no module file to anchor on.
        base_dir = os.getcwd()

    parts = [base_dir, '../..']
    if subpath is not None:
        parts.append(subpath)

    return os.path.abspath(os.path.join(*parts))

def contains_nan(tensor):
    """Return True if any element of `tensor` is NaN, i.e. compares unequal to itself."""
    differs_from_itself = tensor != tensor
    return bool(differs_from_itself.any())

In [11]:
import os
import time
import matplotlib
from tqdm import tqdm_notebook
from datetime import date
from matplotlib import pyplot as plt
from pylab import rcParams

#math package
import random, math
from numpy.random import seed
#from tensorflow import set_random_seed
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

#Pytorch Package
import torch
from torch import nn
import torch.nn.functional as F

#Customized Transformers Util
# from transformer import util
# from .util import mask_
# from .util import d

class SelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self-attention in which every head works on a
    full-width (`emb`-dimensional) projection of the input.
    """

    def __init__(self, emb, heads=8, mask=False):
        """
        :param emb: size of the last dimension of the input (and of the output).
        :param heads: number of attention heads.
        :param mask: if truthy, positions above the diagonal of the attention matrix are
            masked out; the value 'first' additionally zeroes the first attention row.
        """
        super().__init__()

        self.emb = emb
        self.heads = heads
        self.mask = mask

        # One emb-sized projection per head, stored side by side in a single layer.
        projected_width = emb * heads
        self.tokeys = nn.Linear(emb, projected_width, bias=False)
        self.toqueries = nn.Linear(emb, projected_width, bias=False)
        self.tovalues = nn.Linear(emb, projected_width, bias=False)

        # Maps the concatenated head outputs back to emb dimensions.
        self.unifyheads = nn.Linear(heads * emb, emb)

    def _fold_heads(self, projected, b, t):
        """Reshape a (b, t, h*e) projection to (b*h, t, e), moving heads into the batch."""
        h, e = self.heads, self.emb
        per_head = projected.view(b, t, h, e).transpose(1, 2)
        return per_head.contiguous().view(b * h, t, e)

    def forward(self, x):
        """
        :param x: tensor of size (batch, time, emb).
        :return: tensor of size (batch, time, emb).
        """
        b, t, e = x.size()
        h = self.heads
        assert e == self.emb

        # Queries and keys are each divided by e^(1/4), so their dot product ends up
        # divided by sqrt(e) without scaling the larger (t x t) matrix afterwards.
        scale = e ** (1/4)
        keys = self._fold_heads(self.tokeys(x), b, t) / scale
        queries = self._fold_heads(self.toqueries(x), b, t) / scale
        values = self._fold_heads(self.tovalues(x), b, t)

        # raw attention scores for every (query position, key position) pair
        dot = torch.bmm(queries, keys.transpose(1, 2))

        assert dot.size() == (b*h, t, t)

        if self.mask:
            # -inf above the diagonal: those entries receive zero weight after softmax
            mask_(dot, maskval=float('-inf'), mask_diagonal=False)

        # normalise over the key dimension: each row becomes a probability vector
        dot = F.softmax(dot, dim=2)

        # rows after the first are required to be NaN-free
        assert not contains_nan(dot[:, 1:, :])

        if self.mask == 'first':
            # zero the first attention row by hand (on a copy, not in place)
            dot = dot.clone()
            dot[:, :1, :] = 0.0

        # weighted sum of the values, then unfold the heads out of the batch dimension
        attended = torch.bmm(dot, values).view(b, h, t, e)
        attended = attended.transpose(1, 2).contiguous().view(b, t, h * e)

        return self.unifyheads(attended)

class TransformerBlock(nn.Module):
    """
    One transformer layer: self-attention and a two-layer feed-forward network, each
    followed by a residual connection, layer normalisation and dropout.
    """

    def __init__(self, emb, heads, mask, seq_length, ff_hidden_mult=4, dropout=0.0):
        """
        :param emb: embedding dimension of the input and output.
        :param heads: number of attention heads.
        :param mask: forwarded to SelfAttention.
        :param seq_length: accepted for interface compatibility; not used by this block.
        :param ff_hidden_mult: hidden width of the feed-forward part, as a multiple of emb.
        :param dropout: dropout probability applied after each normalisation.
        """
        super().__init__()

        self.attention = SelfAttention(emb, heads=heads, mask=mask)
        self.mask = mask

        self.norm1 = nn.LayerNorm(emb)
        self.norm2 = nn.LayerNorm(emb)

        hidden_width = ff_hidden_mult * emb
        self.ff = nn.Sequential(
            nn.Linear(emb, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, emb)
        )

        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: tensor whose last dimension is emb.
        :return: tensor of the same size as x.
        """
        # attention sub-layer: residual -> layer norm -> dropout
        x = self.do(self.norm1(self.attention(x) + x))

        # feed-forward sub-layer: residual -> layer norm -> dropout
        x = self.do(self.norm2(self.ff(x) + x))

        return x
##RTransformer

class RTransformer(nn.Module):
    """
    Transformer for sequence regression with two output heads.

    The input is a batch of already-embedded sequences (no token embedding is applied);
    a learned position embedding is added, the sequence is passed through a stack of
    transformer blocks, pooled over time, and fed to two independent linear heads.
    """

    def __init__(self, emb, heads, depth, seq_length, num_tokens, num_classes, max_pool=True, dropout=0.0):
        """
        emb: Embedding dimension
        heads: nr. of attention heads
        depth: Number of transformer blocks
        seq_length: Expected maximum sequence length (size of the position-embedding table)
        num_tokens: Stored on the module but otherwise unused (the token embedding is disabled)
        num_classes: Output width of each of the two linear heads.
        max_pool: If true, use global max pooling in the last layer. If false, use global
                         average pooling.
        dropout: Dropout probability for the input and for every transformer block.
        """
        super().__init__()

        self.num_tokens, self.max_pool = num_tokens, max_pool

        #self.token_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=num_tokens)
        self.pos_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=seq_length)

        tblocks = []
        for i in range(depth):
            tblocks.append(
                TransformerBlock(emb=emb, heads=heads, seq_length=seq_length, mask=False, dropout=dropout))

        self.tblocks = nn.Sequential(*tblocks)

        # Two heads over the same pooled representation (main and auxiliary target).
        self.toprobs = nn.Linear(emb, num_classes)
        self.toprobs_b = nn.Linear(emb, num_classes)
        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: A (batch, sequence length, emb) float tensor of pre-computed embeddings.
            It may live on any device; it is moved to the device of this module's weights.
        :return: tuple (x_a, x_b) with the outputs of the two heads, each passed through
            torch.squeeze (all size-1 dimensions are dropped).
        """
        b, t, e = x.size()

        # Use the device the module actually lives on instead of assuming CUDA, so the
        # model also runs on CPU-only machines or when it was deliberately kept on CPU.
        device = self.pos_embedding.weight.device

        positions = self.pos_embedding(torch.arange(t, device=device))[None, :, :].expand(b, t, e)
        x = x.to(device) + positions
        x = self.do(x)

        x = self.tblocks(x)

        x = x.max(dim=1)[0] if self.max_pool else x.mean(dim=1) # pool over the time dimension

        x_a = self.toprobs(x)
        x_b = self.toprobs_b(x)
        # NOTE(review): squeeze() without a dim also drops the batch dimension when the
        # batch holds a single sample; kept as-is so the returned shapes are unchanged.
        x_a = torch.squeeze(x_a)
        x_b = torch.squeeze(x_b)
        return x_a, x_b

class CTransformer(nn.Module):
    """
    Transformer for classifying sequences of token indices.
    """

    def __init__(self, emb, heads, depth, seq_length, num_tokens, num_classes, max_pool=True, dropout=0.0, wide=False):
        """
        emb: Embedding dimension
        heads: nr. of attention heads
        depth: Number of transformer blocks
        seq_length: Expected maximum sequence length (size of the position-embedding table)
        num_tokens: Number of rows in the token-embedding table
        num_classes: Number of classes.
        max_pool: If true, use global max pooling in the last layer. If false, use global
                         average pooling.
        dropout: Dropout probability for the input and for every transformer block.
        wide: Accepted for backward compatibility only. The TransformerBlock defined in
              this file has no `wide` option, so the value is not forwarded (forwarding it
              raised a TypeError on construction).
        """
        super().__init__()

        self.num_tokens, self.max_pool = num_tokens, max_pool

        self.token_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=num_tokens)
        self.pos_embedding = nn.Embedding(embedding_dim=emb, num_embeddings=seq_length)

        tblocks = []
        for i in range(depth):
            tblocks.append(
                TransformerBlock(emb=emb, heads=heads, seq_length=seq_length, mask=False, dropout=dropout))

        self.tblocks = nn.Sequential(*tblocks)

        self.toprobs = nn.Linear(emb, num_classes)

        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: A batch by sequence length integer tensor of token indices.
        :return: (batch, num_classes) tensor of log-probabilities over the classes.
        """
        tokens = self.token_embedding(x)
        b, t, e = tokens.size()

        # Build the position indices on the same device as the token embeddings rather
        # than on whatever device happens to be available on the machine.
        positions = self.pos_embedding(torch.arange(t, device=tokens.device))[None, :, :].expand(b, t, e)
        x = tokens + positions
        x = self.do(x)

        x = self.tblocks(x)

        x = x.max(dim=1)[0] if self.max_pool else x.mean(dim=1) # pool over the time dimension

        x = self.toprobs(x)

        return F.log_softmax(x, dim=1)

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#Customized Transformers Util
from transformer.util import d, here, mask_
from transformer.transformers_gpu import *

import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from sklearn.model_selection import train_test_split
from transformer import util

from torchtext import data, datasets, vocab
from argparse import ArgumentParser
from torch.utils.tensorboard import SummaryWriter

import random, math
from numpy.random import seed
# from tensorflow import set_random_seed
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import random, tqdm, sys, math, gzip


class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing each feature sequence with a main and an auxiliary label.

    The base class is spelled out as torch.utils.data.Dataset because the bare name
    `data` is rebound by a later `from torchtext import data` import in this cell.
    """

    def __init__(self, texts, labels, labels_b):
        """
        :param texts: 3-dimensional array-like indexed as texts[index, :, :].
        :param labels: main labels, indexable by sample.
        :param labels_b: auxiliary labels, indexable by sample.
        """
        self.labels = labels
        self.text = texts
        self.labels_b = labels_b

    def __len__(self):
        """Return the total number of samples (the number of main labels)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the triple (features, main label, auxiliary label) for one sample."""
        # Tensor indices are converted to plain Python values first.
        if torch.is_tensor(index):
            index = index.tolist()

        # Load data and get label
        X = self.text[index,:,:]
        y = self.labels[index]
        y_b = self.labels_b[index]
        return X, y, y_b
    
def go(arg):
    """
    Creates and trains a basic transformer for the volatility regression task.

    The model has two regression heads; the training loss is
    ``alpha * MSE(main) + (1 - alpha) * MSE(auxiliary)``.

    :param arg: attribute-style options object. Read here: input_dir, label_dir,
        label_dir_b (paths given to np.load), final, batch_size, embedding_size,
        num_heads, depth, max_length, max_pool, gpu, lr, lr_warmup, num_epochs,
        gradient_clipping, alpha. Optional: seed, vocab_size, cuda_id.
    :return: pandas DataFrame with one row per epoch, sorted by ascending
        'Test Accuracy'. Despite the column names, 'Train Accuracy' holds the mean
        training loss, 'Test Accuracy' / 'Test Accuracy B' the summed evaluation MSE of
        the main / auxiliary head, and 'Outputs' the main-head output of the last
        evaluation batch.
    """
    NUM_CLS = 1

    # Seed everything that is random here so that runs (e.g. different alpha values)
    # are compared on the same split and the same initial weights.
    seed_value = getattr(arg, 'seed', None)
    if seed_value is not None:
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)

    print(" Loading Data ...")
    TEXT_emb = np.load(arg.input_dir)
    LABEL_emb = np.load(arg.label_dir)
    LABEL_emb_b = np.load(arg.label_dir_b)
    print(" Finish Loading Data... ")

    # Features and both label arrays MUST be split in one call: separate calls shuffle
    # each array independently and pair samples with the wrong targets.
    dev_x, test_x, dev_y, test_y, dev_yb, test_yb = train_test_split(
        TEXT_emb, LABEL_emb, LABEL_emb_b, test_size=0.2, random_state=seed_value)

    if arg.final:
        # final run: train on the 80% development part, evaluate on the 20% test part
        training_set = Dataset(dev_x, dev_y, dev_yb)
        val_set = Dataset(test_x, test_y, test_yb)
    else:
        # tuning run: the test part is left untouched; 12.5% of the development part
        # is held out for validation
        train_x, val_x, train_y, val_y, train_yb, val_yb = train_test_split(
            dev_x, dev_y, dev_yb, test_size=0.125, random_state=seed_value)
        training_set = Dataset(train_x, train_y, train_yb)
        val_set = Dataset(val_x, val_y, val_yb)

    trainloader = torch.utils.data.DataLoader(training_set, batch_size=arg.batch_size, shuffle=False, num_workers=2)
    # the whole evaluation set is served as a single batch
    testloader = torch.utils.data.DataLoader(val_set, batch_size=len(val_set), shuffle=False, num_workers=2)
    print('training examples', len(training_set))

    if arg.final:
        print('test examples', len(val_set))
    else:
        print('validation examples', len(val_set))

    # create the model
    # vocab_size is only stored by RTransformer, so it may be absent from the options.
    model = RTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.max_length, num_tokens=getattr(arg, 'vocab_size', None),
                         num_classes=NUM_CLS, max_pool=arg.max_pool)

    device = torch.device('cpu')
    if arg.gpu and torch.cuda.is_available():
        cuda_id = getattr(arg, 'cuda_id', None)
        if cuda_id is not None:
            # environment values must be strings
            os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_id)
        model.cuda()
        device = torch.device('cuda')

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    seen = 0  # number of training samples processed so far
    evaluation = {'epoch': [], 'Train Accuracy': [], 'Test Accuracy': [], 'Test Accuracy B': [], 'Outputs': []}
    for e in tqdm.tqdm_notebook(range(arg.num_epochs)):
        train_loss_sum = 0.0
        num_batches = 0
        print('\n epoch ',e)
        model.train(True)

        for inputs, labels, labels_b in trainloader:
            # learning rate warmup
            # - the learning rate grows linearly from 1e-10 to arg.lr over the first
            #   arg.lr_warmup training samples and then stays at arg.lr.
            # - it has to be written into the optimizer's param_groups; assigning an
            #   attribute on the optimizer object has no effect.
            if arg.lr_warmup > 0:
                lr = max(arg.lr * min(seen / arg.lr_warmup, 1.0), 1e-10)
                for group in opt.param_groups:
                    group['lr'] = lr

            opt.zero_grad()

            # truncate over-long sequences before moving the batch to the device
            if inputs.size(1) > arg.max_length:
                inputs = inputs[:, :arg.max_length, :]
            inputs = inputs.float().to(device)
            labels = labels.float().to(device)
            labels_b = labels_b.float().to(device)

            out_a, out_b = model(inputs)

            loss_a = F.mse_loss(out_a, labels)
            loss_b = F.mse_loss(out_b, labels_b)
            loss = arg.alpha*loss_a + (1 - arg.alpha)*loss_b
            # accumulate a plain float so the autograd graph of each batch can be freed
            train_loss_sum += loss.item()
            num_batches += 1

            loss.backward()

            # clip gradients
            # - If the total gradient norm exceeds arg.gradient_clipping, it is scaled
            #   back down to that value.
            if arg.gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

            opt.step()

            seen += inputs.size(0)

        # mean loss per batch; max() guards against an empty training loader
        train_loss_avg = train_loss_sum / max(num_batches, 1)

        loss_test = 0.0
        loss_test_b = 0.0
        out_a = None
        with torch.no_grad():
            model.train(False)

            for inputs, labels, labels_b in testloader:
                if inputs.size(1) > arg.max_length:
                    inputs = inputs[:, :arg.max_length, :]
                inputs = inputs.float().to(device)
                labels = labels.float().to(device)
                labels_b = labels_b.float().to(device)

                out_a, out_b = model(inputs)

                loss_test += F.mse_loss(out_a, labels).item()
                loss_test_b += F.mse_loss(out_b, labels_b).item()

        evaluation['epoch'].append(e)
        evaluation['Train Accuracy'].append(train_loss_avg)
        evaluation['Test Accuracy'].append(loss_test)
        evaluation['Test Accuracy B'].append(loss_test_b)
        evaluation['Outputs'].append(out_a)

    evaluation = pd.DataFrame(evaluation)
    # best epoch (lowest main-task evaluation loss) first
    evaluation.sort_values(["Test Accuracy"], ascending=True, inplace=True)

    return evaluation

In [None]:
# Sweep the task-weighting factor alpha and record, for each value, the lowest
# main-task evaluation loss reached over the epochs.
alpha_records = {'alpha': [], 'best': []}

if __name__ == "__main__":
    import easydict

    # alpha = 0.1, 0.2, ..., 1.0 built from integers, which avoids the floating-point
    # artefacts (e.g. 0.30000000000000004) of np.arange with a fractional step.
    for i in [round(0.1 * step, 1) for step in range(1, 11)]:
        # Tuning Parameters:
        args = easydict.EasyDict({
                "num_epochs": 10,
                "batch_size": 4,
                "lr": 2e-5,
                "tb_dir": "./runs",
                "final": False,
                "max_pool": False,
                "embedding_size" : 1051, # 1024(textual feature)+27(audio features)
                "max_length" : 520,
                "num_heads" : 2,
                "depth" : 2,
                "seed" : 1,
                "lr_warmup" : 1000,
                "gradient_clipping" : 1.0,
                "input_dir": "SENTENCE_EMBEDDING_DATA",
                "label_dir": "MAIN_LABEL",
                "label_dir_b": "AUXILIARY_LAEBL",
                "alpha" : i,
                "gpu": True,
                "save": False,
                # read by go(); the model only stores vocab_size, and cuda_id matches
                # the CUDA_VISIBLE_DEVICES value set at the top of the training cell
                "vocab_size": None,
                "cuda_id": "0"
        })

        # go() is the training function defined in the previous cell
        evaluation = go(args)
        print('Results in alpha = ',i)
        print(evaluation)
        alpha_records['alpha'].append(i)
        # go() returns its rows sorted by ascending 'Test Accuracy', so row 0 is the best epoch
        alpha_records['best'].append(evaluation['Test Accuracy'].iloc[0])

best_alpha = pd.DataFrame(alpha_records).sort_values(["best"], ascending=True)

# make sure the output directory exists before writing the summary
os.makedirs('./results', exist_ok=True)
best_alpha.to_csv('./results/3_days_result_vocal.csv')

best_alpha
